diff --git a/buildroot-external/Config.in b/buildroot-external/Config.in index ea3b8079..c8abd68a 100644 --- a/buildroot-external/Config.in +++ b/buildroot-external/Config.in @@ -296,6 +296,7 @@ menu "Plugins" source "$BR2_EXTERNAL_OPENVOICEOS_PATH/package/python-ovos-stt-plugin-pocketsphinx/Config.in" source "$BR2_EXTERNAL_OPENVOICEOS_PATH/package/python-ovos-stt-plugin-selene/Config.in" source "$BR2_EXTERNAL_OPENVOICEOS_PATH/package/python-ovos-stt-plugin-vosk/Config.in" + source "$BR2_EXTERNAL_OPENVOICEOS_PATH/package/python-ovos-stt-plugin-whispercpp/Config.in" source "$BR2_EXTERNAL_OPENVOICEOS_PATH/package/python-ovos-stt-server-plugin/Config.in" source "$BR2_EXTERNAL_OPENVOICEOS_PATH/package/python-neon-tts-plugin-larynx_server/Config.in" source "$BR2_EXTERNAL_OPENVOICEOS_PATH/package/python-ovos-tts-plugin-marytts/Config.in" diff --git a/buildroot-external/configs/rpi4_64-gui_defconfig b/buildroot-external/configs/rpi4_64-gui_defconfig index 3da2f290..48e99982 100644 --- a/buildroot-external/configs/rpi4_64-gui_defconfig +++ b/buildroot-external/configs/rpi4_64-gui_defconfig @@ -726,6 +726,7 @@ BR2_PACKAGE_PYTHON_OVOS_STT_HTTP_SERVER=y BR2_PACKAGE_PYTHON_OVOS_STT_PLUGIN_CHROMIUM=y BR2_PACKAGE_PYTHON_OVOS_STT_PLUGIN_SELENE=y BR2_PACKAGE_PYTHON_OVOS_STT_PLUGIN_VOSK=y +BR2_PACKAGE_PYTHON_OVOS_STT_PLUGIN_WHISPERCPP=y BR2_PACKAGE_PYTHON_OVOS_STT_SERVER_PLUGIN=y BR2_PACKAGE_PYTHON_NEON_TTS_PLUGIN_LARYNX_SERVER=y BR2_PACKAGE_PYTHON_OVOS_TTS_PLUGIN_MARYTTS=y diff --git a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/0001-python-bindings.patch b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/0001-python-bindings.patch new file mode 100644 index 00000000..c5c3e54e --- /dev/null +++ b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/0001-python-bindings.patch @@ -0,0 +1,340 @@ +From 23942ff3134d3897a40a62ed3a69e7a5c78392df Mon Sep 17 00:00:00 2001 +From: jarbasai +Date: Tue, 25 Oct 2022 00:45:13 +0100 +Subject: [PATCH 1/2] 
feat/python_bindings + +--- + README.md | 20 ++-- + build_lib.sh | 8 ++ + ovos_stt_plugin_whispercpp/__init__.py | 130 ++++++++++++++++++++----- + 3 files changed, 127 insertions(+), 31 deletions(-) + create mode 100644 build_lib.sh + +diff --git a/README.md b/README.md +index 0eea31d..f41d8de 100644 +--- a/README.md ++++ b/README.md +@@ -19,16 +19,19 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp + + ### WhisperCPP + +-First let's install whisper.cpp from source and move the binary to the plugin expected default path ++First let's build whisper.cpp from source and move the shared library to the plugin's expected default path + + ```bash +-git clone https://github.com/ggerganov/whisper.cpp +-cd whisper.cpp +-make +-cp main ~/.local/bin/whispercpp ++# build shared libwhisper.so ++git clone https://github.com/ggerganov/whisper.cpp /tmp/whispercpp ++cd /tmp/whispercpp ++# last commit before a breaking change ++git checkout d6b84b2a23220dd8b8792872a3ab6802cd24b424 ++gcc -O3 -std=c11 -pthread -mavx -mavx2 -mfma -mf16c -fPIC -c ggml.c ++g++ -O3 -std=c++11 -pthread --shared -fPIC -static-libstdc++ whisper.cpp ggml.o -o libwhisper.so ++cp libwhisper.so ~/.local/bin/libwhisper.so + ``` + +-Models will be autodownloaded to `/home/user/.local/share/whispercpp/{model_name}` when plugin is loaded + + ## Configuration + +@@ -42,7 +45,7 @@ available models are `"tiny.en", "tiny", "base.en", "base", "small.en", "small", + "stt": { + "module": "ovos-stt-plugin-whispercpp", + "ovos-stt-plugin-whispercpp": { +- "bin": "/home/user/.local/bin/whispercpp", ++ "lib": "~/.local/bin/libwhisper.so", + "model": "tiny" + } + } +@@ -51,6 +54,9 @@ available models are `"tiny.en", "tiny", "base.en", "base", "small.en", "small", + + ## Models + ++Models will be automatically downloaded to `/home/user/.local/share/whispercpp/{model_name}` when the plugin is loaded ++ ++ + Memory usage + + | Model | Disk | Mem | +diff --git a/build_lib.sh b/build_lib.sh +new file mode 100644 
+index 0000000..2b69c5e +--- /dev/null ++++ b/build_lib.sh +@@ -0,0 +1,8 @@ ++# build shared libwhisper.so ++git clone https://github.com/ggerganov/whisper.cpp /tmp/whispercpp ++cd /tmp/whispercpp ++# last commit before a breaking change ++git checkout d6b84b2a23220dd8b8792872a3ab6802cd24b424 ++gcc -O3 -std=c11 -pthread -mavx -mavx2 -mfma -mf16c -fPIC -c ggml.c ++g++ -O3 -std=c++11 -pthread --shared -fPIC -static-libstdc++ whisper.cpp ggml.o -o libwhisper.so ++cp libwhisper.so $HOME/.local/bin/libwhisper.so +\ No newline at end of file +diff --git a/ovos_stt_plugin_whispercpp/__init__.py b/ovos_stt_plugin_whispercpp/__init__.py +index d347898..91982eb 100644 +--- a/ovos_stt_plugin_whispercpp/__init__.py ++++ b/ovos_stt_plugin_whispercpp/__init__.py +@@ -1,11 +1,101 @@ ++import ctypes + import os +-import subprocess +-from tempfile import NamedTemporaryFile ++import pathlib + ++# this is needed to read the WAV file properly ++import numpy + import requests + from ovos_plugin_manager.templates.stt import STT + from ovos_utils.log import LOG + from ovos_utils.xdg_utils import xdg_data_home ++from speech_recognition import AudioData ++ ++ ++# this needs to match the C struct in whisper.h ++class WhisperFullParams(ctypes.Structure): ++ _fields_ = [ ++ ("strategy", ctypes.c_int), ++ ("n_threads", ctypes.c_int), ++ ("offset_ms", ctypes.c_int), ++ ("translate", ctypes.c_bool), ++ ("no_context", ctypes.c_bool), ++ ("print_special_tokens", ctypes.c_bool), ++ ("print_progress", ctypes.c_bool), ++ ("print_realtime", ctypes.c_bool), ++ ("print_timestamps", ctypes.c_bool), ++ ("language", ctypes.c_char_p), ++ ("greedy", ctypes.c_int * 1), ++ ] ++ ++ ++class WhisperEngine: ++ def __init__(self, libname, model_path): ++ # load library and model ++ self.libname = pathlib.Path().absolute() / libname ++ self.whisper = ctypes.CDLL(libname) ++ ++ # tell Python what are the return types of the functions ++ self.whisper.whisper_init.restype = ctypes.c_void_p ++ 
self.whisper.whisper_full_default_params.restype = WhisperFullParams ++ self.whisper.whisper_full_get_segment_text.restype = ctypes.c_char_p ++ ++ # initialize whisper.cpp context ++ self.ctx = self.whisper.whisper_init(model_path.encode("utf-8")) ++ ++ # get default whisper parameters and adjust as needed ++ self.params = self.whisper.whisper_full_default_params(0) ++ self.params.print_realtime = True ++ self.params.print_progress = False ++ self.params.print_timestamps = False ++ self.params.n_threads = os.cpu_count() - 1 ++ self.params.translate = False ++ ++ def audiodata2array(self, audio_data): ++ assert isinstance(audio_data, AudioData) ++ # Convert buffer to float32 using NumPy ++ audio_as_np_int16 = numpy.frombuffer(audio_data.get_wav_data(), dtype=numpy.int16) ++ audio_as_np_float32 = audio_as_np_int16.astype(numpy.float32) ++ ++ # Normalise float32 array so that values are between -1.0 and +1.0 ++ max_int16 = 2 ** 15 ++ data = audio_as_np_float32 / max_int16 ++ return data ++ ++ def transcribe_wav(self, wav, lang="en"): ++ self.params.language = lang.encode() ++ ++ if isinstance(wav, str): ++ with AudioFile(wav) as source: ++ audio = Recognizer().record(source) ++ elif isinstance(wav, AudioData): ++ audio = wav ++ else: ++ raise ValueError(f"invalid audio: {wav}") ++ ++ return self.transcribe_audio(audio, lang) ++ ++ def transcribe_audio(self, audio, lang="en"): ++ self.params.language = lang.encode() ++ ++ data = self.audiodata2array(audio) ++ ++ # run the inference ++ result = self.whisper.whisper_full(ctypes.c_void_p(self.ctx), self.params, ++ data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), ++ len(data)) ++ if result != 0: ++ raise RuntimeError(f"Error: {result}") ++ ++ # print results from Python ++ n_segments = self.whisper.whisper_full_n_segments(ctypes.c_void_p(self.ctx)) ++ txt = b"" ++ for i in range(n_segments): ++ txt += self.whisper.whisper_full_get_segment_text(ctypes.c_void_p(self.ctx), i) ++ return txt.decode("utf-8") ++ ++ def 
shutdown(self): ++ # free the memory ++ self.whisper.whisper_free(ctypes.c_void_p(self.ctx)) + + + class WhispercppSTT(STT): +@@ -115,11 +205,12 @@ class WhispercppSTT(STT): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) +- self.bin = self.config.get("binary") or os.path.expanduser("~/.local/bin/whispercpp") ++ lib = self.config.get("lib") or "~/.local/bin/libwhisper.so" ++ self.lib = os.path.expanduser(lib) + # self.bin = os.path.expanduser("~/whisper.cpp/main") +- if not self.bin: +- LOG.error( +- "you need to manually install whisper.cpp, please provide full path to ./main binary generated by compiling https://github.com/ggerganov/whisper.cpp") ++ if not os.path.isfile(self.lib): ++ LOG.error("you need to provicde libwhisper.so, please follow the README.md instructions") ++ raise ImportError("libwhisper.so not found") + + self.model_folder = self.config.get("model_folder") or f"{xdg_data_home()}/whispercpp" + model = self.config.get("model") +@@ -129,7 +220,8 @@ def __init__(self, *args, **kwargs): + else: + model = "tiny" + os.makedirs(self.model_folder, exist_ok=True) +- self.model_path = self.get_model(model) ++ model_path = self.get_model(model) ++ self.engine = WhisperEngine(self.lib, model_path) + + def get_model(self, model_name): + if os.path.isfile(model_name): +@@ -145,27 +237,18 @@ def get_model(self, model_name): + f.write(data) + return model_path + +- def execute(self, audio, language=None, model=None): ++ def execute(self, audio, language=None): + lang = language or self.lang +- if model: +- if not os.path.isfile(model): +- model = self.get_model(model) +- +- model = model or self.model_path +- +- with NamedTemporaryFile() as f: +- f.write(audio.get_wav_data()) +- lang = lang.split('-')[0].lower() +- cmd = f"{self.bin} -m {model} -l {lang} -f {f.name}" +- t = subprocess.check_output(cmd, shell=True, stderr=subprocess.PIPE).decode("utf-8") +- transcript = t.split("] ")[-1].strip() +- +- return transcript ++ return 
self.engine.transcribe_audio(audio, lang) + + @property + def available_languages(self) -> set: + return set(self.LANGUAGES.keys()) + ++ def __del__(self): ++ if self.engine: ++ self.engine.shutdown() ++ + + WhispercppSTTConfig = { + lang: [{"model": "tiny", +@@ -195,12 +278,11 @@ def available_languages(self) -> set: + + if __name__ == "__main__": + b = WhispercppSTT() +- + from speech_recognition import Recognizer, AudioFile + + with AudioFile("/home/user/PycharmProjects/selene_api/test/test.wav") as source: + audio = Recognizer().record(source) + +- a = b.execute(audio, model="base") ++ a = b.execute(audio) + + print(a) + +From ed15bfab9643d68a39f91c621e21ab450a03a628 Mon Sep 17 00:00:00 2001 +From: jarbasai +Date: Tue, 25 Oct 2022 01:20:07 +0100 +Subject: [PATCH 2/2] remove print + +--- + ovos_stt_plugin_whispercpp/__init__.py | 29 +++++++++----------------- + 1 file changed, 10 insertions(+), 19 deletions(-) + +diff --git a/ovos_stt_plugin_whispercpp/__init__.py b/ovos_stt_plugin_whispercpp/__init__.py +index 91982eb..7daf309 100644 +--- a/ovos_stt_plugin_whispercpp/__init__.py ++++ b/ovos_stt_plugin_whispercpp/__init__.py +@@ -44,7 +44,7 @@ def __init__(self, libname, model_path): + + # get default whisper parameters and adjust as needed + self.params = self.whisper.whisper_full_default_params(0) +- self.params.print_realtime = True ++ self.params.print_realtime = False + self.params.print_progress = False + self.params.print_timestamps = False + self.params.n_threads = os.cpu_count() - 1 +@@ -62,19 +62,12 @@ def audiodata2array(self, audio_data): + return data + + def transcribe_wav(self, wav, lang="en"): +- self.params.language = lang.encode() +- +- if isinstance(wav, str): +- with AudioFile(wav) as source: +- audio = Recognizer().record(source) +- elif isinstance(wav, AudioData): +- audio = wav +- else: +- raise ValueError(f"invalid audio: {wav}") +- ++ with AudioFile(wav) as source: ++ audio = Recognizer().record(source) + return 
self.transcribe_audio(audio, lang) + + def transcribe_audio(self, audio, lang="en"): ++ lang = lang.lower().split("-")[0] + self.params.language = lang.encode() + + data = self.audiodata2array(audio) +@@ -91,7 +84,7 @@ def transcribe_audio(self, audio, lang="en"): + txt = b"" + for i in range(n_segments): + txt += self.whisper.whisper_full_get_segment_text(ctypes.c_void_p(self.ctx), i) +- return txt.decode("utf-8") ++ return txt.decode("utf-8").strip() + + def shutdown(self): + # free the memory +@@ -215,10 +208,7 @@ def __init__(self, *args, **kwargs): + self.model_folder = self.config.get("model_folder") or f"{xdg_data_home()}/whispercpp" + model = self.config.get("model") + if not model: +- if self.lang.startswith("en"): +- model = "tiny.en" +- else: +- model = "tiny" ++ model = "tiny" + os.makedirs(self.model_folder, exist_ok=True) + model_path = self.get_model(model) + self.engine = WhisperEngine(self.lib, model_path) +@@ -280,9 +270,10 @@ def __del__(self): + b = WhispercppSTT() + from speech_recognition import Recognizer, AudioFile + +- with AudioFile("/home/user/PycharmProjects/selene_api/test/test.wav") as source: ++ jfk = "/home/user/whisper.cpp/samples/jfk.wav" ++ with AudioFile(jfk) as source: + audio = Recognizer().record(source) + +- a = b.execute(audio) +- ++ a = b.execute(audio, language="en") + print(a) ++ diff --git a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/Config.in b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/Config.in new file mode 100644 index 00000000..bafb4e5f --- /dev/null +++ b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/Config.in @@ -0,0 +1,6 @@ +config BR2_PACKAGE_PYTHON_OVOS_STT_PLUGIN_WHISPERCPP + bool "python-ovos-stt-plugin-whispercpp" + help + OVOS STT plugin for whispercpp + + https://github.com/OpenVoiceOS/ovos-stt-plugin-whispercpp diff --git a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.hash 
b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.hash new file mode 100644 index 00000000..08b11772 --- /dev/null +++ b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.hash @@ -0,0 +1,2 @@ +# sha256 locally computed +sha256 efe2ec892f6a0e9f1f9b24ae804d6975f7fbd8aaaf0b264e0d5bf4917b0e7a71 python-ovos-stt-plugin-whispercpp-630db8ff8b2ea8f56c30f39254d880b0f572c921.tar.gz diff --git a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.mk b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.mk new file mode 100644 index 00000000..0f2dbb31 --- /dev/null +++ b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.mk @@ -0,0 +1,12 @@ +################################################################################ +# +# python-ovos-stt-plugin-whispercpp +# +################################################################################ + +PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_VERSION = 630db8ff8b2ea8f56c30f39254d880b0f572c921 +PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_SITE = $(call github,OpenVoiceOS,ovos-stt-plugin-whispercpp,$(PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_VERSION)) +PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_SETUP_TYPE = setuptools +PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_ENV = MYCROFT_LOOSE_REQUIREMENTS=true + +$(eval $(python-package))