diff --git a/app/src/pages/LanguageSettings.tsx b/app/src/pages/LanguageSettings.tsx index d93c290b..0a491d5c 100644 --- a/app/src/pages/LanguageSettings.tsx +++ b/app/src/pages/LanguageSettings.tsx @@ -264,6 +264,16 @@ export function LanguageSettingsPage(): JSX.Element { + + Whisper Models for {language.lang} + + + Transcription Models for {language.lang} diff --git a/app/src/pages/ModelManager.tsx b/app/src/pages/ModelManager.tsx index db6410e2..08f278b8 100644 --- a/app/src/pages/ModelManager.tsx +++ b/app/src/pages/ModelManager.tsx @@ -41,6 +41,7 @@ export function ModelManagerPage(): JSX.Element { Language Transcription Models + Whisper Models @@ -61,6 +62,11 @@ export function ModelManagerPage(): JSX.Element { lang={lang.lang} downloaded={downloaded} /> + diff --git a/app/src/state/models.ts b/app/src/state/models.ts index 672f1e13..db36bf6b 100644 --- a/app/src/state/models.ts +++ b/app/src/state/models.ts @@ -25,6 +25,7 @@ export interface Model { export interface Language { lang: string; transcription_models: Model[]; + whisper_models: Model[]; } export type DownloadingModel = Model & { diff --git a/server/app/models.py b/server/app/models.py index 0f0bcbc6..139805b3 100644 --- a/server/app/models.py +++ b/server/app/models.py @@ -8,6 +8,7 @@ from urllib.parse import urlparse from zipfile import ZipFile +import huggingface_hub import requests import yaml from vosk import Model @@ -40,7 +41,7 @@ class ModelDescription: size: str type: str lang: str - compressed: bool = field(default=False) + download_type: str = field(default=False) model_id: str = field(default=None) def __post_init__(self): @@ -58,9 +59,10 @@ def is_downloaded(self) -> bool: class Language: lang: str transcription_models: List[ModelDescription] = field(default_factory=list) + whisper_models: List[ModelDescription] = field(default_factory=list) def all_models(self): - return self.transcription_models + return self.transcription_models + self.whisper_models class ModelDefaultDict(defaultdict): @@ -81,6 +83,8 @@ def __init__(self): models[model_description.model_id] = model_description if model["type"] == "transcription": languages[lang].transcription_models.append(model_description) + elif model["type"] == "whisper": + languages[lang].whisper_models.append(model_description) self.available = dict(languages) self.model_descriptions = models @@ -122,38 +126,61 @@ def get(self, model_id: str) -> Union[Model]: def download(self, model_id: str, task_uuid: str): task: DownloadModelTask = tasks.get(task_uuid) model = self.get_model_description(model_id) - with tempfile.TemporaryFile(dir=CACHE_DIR) as f: - response = requests.get(model.url, stream=True) - task.total = int(response.headers.get("content-length")) - task.state = DownloadModelState.DOWNLOADING - - for data in response.iter_content( - chunk_size=max(int(task.total / 1000), 1024 * 1024) - ): - task.add_progress(len(data)) - - f.write(data) - if task.canceled: - return - - task.state = DownloadModelState.EXTRACTING - if model.compressed: - with ZipFile(f) as archive: - target_dir = model.path() - for info in archive.infolist(): - if info.is_dir(): - continue - path = target_dir / Path("/".join(info.filename.split("/")[1:])) - path.parent.mkdir(exist_ok=True, parents=True) - - source = archive.open(info.filename) - target = open(path, "wb") - with source, target: - shutil.copyfileobj(source, target) - else: - f.seek(0) - with open(model.path(), "wb") as target: - shutil.copyfileobj(f, target) + + if model.download_type.startswith("http"): + with tempfile.TemporaryFile(dir=CACHE_DIR) as f: + response = requests.get(model.url, stream=True) + task.total = int(response.headers.get("content-length")) + task.state = DownloadModelState.DOWNLOADING + + for data in response.iter_content( + chunk_size=max(int(task.total / 1000), 1024 * 1024) + ): + task.add_progress(len(data)) + + f.write(data) + if task.canceled: + return + + task.state = DownloadModelState.EXTRACTING + if model.download_type.endswith("+zip"): + with ZipFile(f) as archive: + target_dir = model.path() + for info in archive.infolist(): + if info.is_dir(): + continue + path = target_dir / Path( + "/".join(info.filename.split("/")[1:]) + ) + path.parent.mkdir(exist_ok=True, parents=True) + + source = archive.open(info.filename) + target = open(path, "wb") + with source, target: + shutil.copyfileobj(source, target) + else: + f.seek(0) + with open(model.path(), "wb") as target: + shutil.copyfileobj(f, target) + elif model.download_type == "huggingface": + api = huggingface_hub.HfApi() + repo_info = api.repo_info(model.url, files_metadata=True) + task.total = sum(f.size for f in repo_info.siblings) + with tempfile.TemporaryDirectory(dir=CACHE_DIR) as dir: + for f in repo_info.siblings: + url = huggingface_hub.hf_hub_url(model.url, f.rfilename) + with open(Path(dir) / f.rfilename, "wb") as file: + task.state = DownloadModelState.DOWNLOADING + response = requests.get(url, stream=True) + for data in response.iter_content( + chunk_size=max(int(task.total / 1000), 1024 * 1024) + ): + task.add_progress(len(data)) + + file.write(data) + if task.canceled: + return + shutil.copytree(dir, model.path()) task.state = DownloadModelState.DONE diff --git a/server/app/models.yml b/server/app/models.yml index 65659f2a..0560f6cd 100644 --- a/server/app/models.yml +++ b/server/app/models.yml @@ -1,253 +1,377 @@ # this file is autogenerated by the ../scripts/generate_models_list.py script. # do not edit manually! +Universal: +- name: whisper-tiny + url: guillaumekln/faster-whisper-tiny + description: Whisper model doing both transcription and punctuation reconstruction + size: 74M + type: whisper + download_type: huggingface +- name: whisper-base + url: guillaumekln/faster-whisper-base + description: Whisper model doing both transcription and punctuation reconstruction + size: 141M + type: whisper + download_type: huggingface +- name: whisper-small + url: guillaumekln/faster-whisper-small + description: Whisper model doing both transcription and punctuation reconstruction + size: 463M + type: whisper + download_type: huggingface +- name: whisper-medium + url: guillaumekln/faster-whisper-medium + description: Whisper model doing both transcription and punctuation reconstruction + size: 1459M + type: whisper + download_type: huggingface +- name: whisper-large-v1 + url: guillaumekln/faster-whisper-large-v1 + description: Whisper model doing both transcription and punctuation reconstruction + size: 2946M + type: whisper + download_type: huggingface +- name: whisper-large-v2 + url: guillaumekln/faster-whisper-large-v2 + description: Whisper model doing both transcription and punctuation reconstruction + size: 2946M + type: whisper + download_type: huggingface English: +- name: whisper-tiny.en + url: guillaumekln/faster-whisper-tiny.en + description: Whisper model doing both transcription and punctuation reconstruction + size: 74M + type: whisper + download_type: huggingface +- name: whisper-base.en + url: guillaumekln/faster-whisper-base.en + description: Whisper model doing both transcription and punctuation reconstruction + size: 140M + type: whisper + download_type: huggingface +- name: whisper-small.en + url: guillaumekln/faster-whisper-small.en + description: Whisper model doing both transcription and punctuation reconstruction + size: 463M + type: whisper + download_type: huggingface +- name: whisper-medium.en + url: guillaumekln/faster-whisper-medium.en + description: Whisper model doing both transcription and punctuation reconstruction + size: 1459M + type: whisper + download_type: huggingface - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip description: Lightweight wideband model for Android and RPi size: 40M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip description: Accurate generic US English model size: 1.8G type: transcription - compressed: true + download_type: http+zip - name: lgraph url: https://alphacephei.com/vosk/models/vosk-model-en-us-0.22-lgraph.zip description: Big US English model with dynamic graph size: 128M type: transcription - compressed: true + download_type: http+zip +- name: big-2 + url: https://alphacephei.com/vosk/models/vosk-model-en-us-0.42-gigaspeech.zip + description: Accurate generic US English model trained by Kaldi on Gigaspeech. + Mostly for podcasts, not for telephony + size: 2.3G + type: transcription + download_type: http+zip +- name: big-3 + url: https://alphacephei.com/vosk/models/vosk-model-en-us-daanzu-20200905.zip + description: Wideband model for dictation from Kaldi-active-grammar + project + size: 1.0G + type: transcription + download_type: http+zip +- name: lgraph-2 + url: https://alphacephei.com/vosk/models/vosk-model-en-us-daanzu-20200905-lgraph.zip + description: Wideband model for dictation from Kaldi-active-grammar + project with configurable graph + size: 129M + type: transcription + download_type: http+zip +- name: big-4 + url: https://alphacephei.com/vosk/models/vosk-model-en-us-librispeech-0.2.zip + description: Repackaged Librispeech model from Kaldi, + not very accurate + size: 845M + type: transcription + download_type: http+zip +- name: small-2 + url: https://alphacephei.com/vosk/models/vosk-model-small-en-us-zamia-0.5.zip + description: Repackaged Zamia model f_250, mainly for research + size: 49M + type: transcription + download_type: http+zip +- name: big-5 + url: https://alphacephei.com/vosk/models/vosk-model-en-us-aspire-0.2.zip + description: Kaldi original ASPIRE model, not very accurate + size: 1.4G + type: transcription + download_type: http+zip +- name: big-6 + url: https://alphacephei.com/vosk/models/vosk-model-en-us-0.21.zip + description: Wideband model previous generation + size: 1.6G + type: transcription + download_type: http+zip Indian English: - name: big url: https://alphacephei.com/vosk/models/vosk-model-en-in-0.5.zip description: Generic Indian English model for telecom and broadcast size: 1G type: transcription - compressed: true + download_type: http+zip - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-en-in-0.4.zip description: Lightweight Indian English model for mobile applications size: 36M type: transcription - compressed: true + download_type: http+zip Chinese: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-cn-0.22.zip description: Lightweight model for Android and RPi size: 42M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-cn-0.22.zip description: Big generic Chinese model for server processing size: 1.3G type: transcription - compressed: true -Chinese Other: -- name: big + download_type: http+zip +- name: big-2 url: https://alphacephei.com/vosk/models/vosk-model-cn-kaldi-multicn-0.15.zip description: Original Wideband Kaldi multi-cn model from Kaldi with Vosk LM size: 1.5G type: transcription - compressed: true + download_type: http+zip Russian: - name: big url: https://alphacephei.com/vosk/models/vosk-model-ru-0.42.zip description: Big mixed band Russian model for servers size: 1.8G type: transcription - compressed: true + download_type: http+zip - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-ru-0.22.zip description: Lightweight wideband model for Android/iOS and RPi size: 45M type: transcription - compressed: true -Russian Other: -- name: big + download_type: http+zip +- name: big-2 url: https://alphacephei.com/vosk/models/vosk-model-ru-0.22.zip description: Big mixed band Russian model for servers size: 1.5G type: transcription - compressed: true -- name: big-2 + download_type: http+zip +- name: big-3 url: https://alphacephei.com/vosk/models/vosk-model-ru-0.10.zip description: Big narrowband Russian model for servers size: 2.5G type: transcription - compressed: true + download_type: http+zip French: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-fr-0.22.zip description: Lightweight wideband model for Android/iOS and RPi size: 41M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-fr-0.22.zip description: Big accurate model for servers size: 1.4G type: transcription - compressed: true -French Other: -- name: small + download_type: http+zip +- name: small-2 url: https://alphacephei.com/vosk/models/vosk-model-small-fr-pguyot-0.3.zip description: Lightweight wideband model for Android and RPi trained by Paul Guyot size: 39M type: transcription - compressed: true + download_type: http+zip - name: linto-2.2 url: https://alphacephei.com/vosk/models/vosk-model-fr-0.6-linto-2.2.0.zip description: Model from LINTO project size: 1.5G type: transcription - compressed: true + download_type: http+zip German: - name: big url: https://alphacephei.com/vosk/models/vosk-model-de-0.21.zip description: Big German model for telephony and server size: 1.9G type: transcription - compressed: true + download_type: http+zip - name: big-2 url: https://alphacephei.com/vosk/models/vosk-model-de-tuda-0.6-900k.zip description: Latest big wideband model from Tuda-DE project size: 4.4G type: transcription - compressed: true + download_type: http+zip - name: small + url: https://alphacephei.com/vosk/models/vosk-model-small-de-zamia-0.3.zip + description: Zamia f_250 small model repackaged (not recommended) + size: 49M + type: transcription + download_type: http+zip +- name: small-2 url: https://alphacephei.com/vosk/models/vosk-model-small-de-0.15.zip description: Lightweight wideband model for Android and RPi size: 45M type: transcription - compressed: true + download_type: http+zip Spanish: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-es-0.42.zip description: Lightweight wideband model for Android and RPi size: 39M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-es-0.42.zip description: Big model for Spanish size: 1.4G type: transcription - compressed: true + download_type: http+zip Portuguese/Brazilian Portuguese: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-pt-0.3.zip description: Lightweight wideband model for Android and RPi size: 31M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-pt-fb-v0.1.1-20220516_2113.zip description: Big model from FalaBrazil size: 1.6G type: transcription - compressed: true + download_type: http+zip +Greek: +- name: big + url: https://alphacephei.com/vosk/models/vosk-model-el-gr-0.7.zip + description: Big narrowband Greek model for server processing, not extremely accurate + though + size: 1.1G + type: transcription + download_type: http+zip Turkish: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-tr-0.3.zip description: Lightweight wideband model for Android and RPi size: 35M type: transcription - compressed: true + download_type: http+zip Vietnamese: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-vn-0.4.zip description: Lightweight Vietnamese model size: 32M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-vn-0.4.zip description: Bigger Vietnamese model for server size: 78M type: transcription - compressed: true + download_type: http+zip Italian: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-it-0.22.zip description: Lightweight model for Android and RPi size: 48M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-it-0.22.zip description: Big generic Italian model for servers size: 1.2G type: transcription - compressed: true + download_type: http+zip Dutch: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-nl-0.22.zip description: Lightweight model for Dutch size: 39M type: transcription - compressed: true -Dutch Other: + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-nl-spraakherkenning-0.6.zip description: Medium Dutch model from Kaldi_NL size: 860M type: transcription - compressed: true + download_type: http+zip - name: lgraph url: https://alphacephei.com/vosk/models/vosk-model-nl-spraakherkenning-0.6-lgraph.zip description: Smaller model with dynamic graph size: 100M type: transcription - compressed: true + download_type: http+zip Catalan: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-ca-0.4.zip description: Lightweight wideband model for Android and RPi for Catalan size: 42M type: transcription - compressed: true + download_type: http+zip Arabic: - name: big url: https://alphacephei.com/vosk/models/vosk-model-ar-mgb2-0.4.zip description: Repackaged Arabic model trained on MGB2 dataset from Kaldi size: 318M type: transcription - compressed: true + download_type: http+zip - name: big-2 url: https://alphacephei.com/vosk/models/vosk-model-ar-0.22-linto-1.1.0.zip description: Big model from LINTO project size: 1.3G type: transcription - compressed: true + download_type: http+zip Farsi: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-fa-0.4.zip description: Lightweight wideband model for Android and RPi for Farsi (Persian) size: 47M type: transcription - compressed: true + download_type: http+zip +- name: big + url: https://alphacephei.com/vosk/models/vosk-model-fa-0.5.zip + description: Model with large vocabulary, not yet accurate but better than before + (Persian) + size: 1G + type: transcription + download_type: http+zip - name: small-2 url: https://alphacephei.com/vosk/models/vosk-model-small-fa-0.5.zip description: Bigger small model for desktop application (Persian) size: 60M type: transcription - compressed: true + download_type: http+zip Filipino: - name: big url: https://alphacephei.com/vosk/models/vosk-model-tl-ph-generic-0.6.zip description: Medium wideband model for Filipino (Tagalog) by feddybear size: 320M type: transcription - compressed: true + download_type: http+zip Ukrainian: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-uk-v3-nano.zip @@ -255,41 +379,41 @@ Ukrainian: Recognition for Ukrainian size: 73M type: transcription - compressed: true + download_type: http+zip - name: small-2 url: https://alphacephei.com/vosk/models/vosk-model-small-uk-v3-small.zip description: Small model from Speech Recognition for Ukrainian size: 133M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-uk-v3.zip description: Bigger model from Speech Recognition for Ukrainian size: 343M type: transcription - compressed: true + download_type: http+zip - name: lgraph url: https://alphacephei.com/vosk/models/vosk-model-uk-v3-lgraph.zip description: Big dynamic model from Speech Recognition for Ukrainian size: 325M type: transcription - compressed: true + download_type: http+zip Kazakh: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-kz-0.15.zip description: Small mobile model from SAIDA_Kazakh size: 42M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-kz-0.15.zip description: Bigger wideband model SAIDA_Kazakh size: 378M type: transcription - compressed: true + download_type: http+zip Swedish: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-sv-rhasspy-0.15.zip @@ -297,68 +421,68 @@ Swedish: project size: 289M type: transcription - compressed: true + download_type: http+zip Japanese: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-ja-0.22.zip description: Lightweight wideband model for Japanese size: 48M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-ja-0.22.zip description: Big model for Japanese size: 1Gb type: transcription - compressed: true + download_type: http+zip Esperanto: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-eo-0.42.zip description: Lightweight model for Esperanto size: 42M type: transcription - compressed: true + download_type: http+zip Hindi: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-hi-0.22.zip description: Lightweight model for Hindi size: 42M type: transcription - compressed: true + download_type: http+zip - name: big url: https://alphacephei.com/vosk/models/vosk-model-hi-0.22.zip description: Big accurate model for servers size: 1.5Gb type: transcription - compressed: true + download_type: http+zip Czech: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-cs-0.4-rhasspy.zip description: Lightweight model for Czech from Rhasspy project size: 44M type: transcription - compressed: true + download_type: http+zip Polish: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-pl-0.22.zip description: Lightweight model for Polish size: 50M type: transcription - compressed: true + download_type: http+zip Uzbek: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-uz-0.22.zip description: Lightweight model for Uzbek size: 49M type: transcription - compressed: true + download_type: http+zip Korean: - name: small url: https://alphacephei.com/vosk/models/vosk-model-small-ko-0.22.zip description: Lightweight model for Korean size: 82M type: transcription - compressed: true + download_type: http+zip Breton: - name: big url: https://alphacephei.com/vosk/models/vosk-model-br-0.8.zip @@ -366,4 +490,4 @@ Breton: project size: 70M type: transcription - compressed: true + download_type: http+zip diff --git a/server/scripts/generate_models_list.py b/server/scripts/generate_models_list.py index 2d3737ef..2092a5d6 100644 --- a/server/scripts/generate_models_list.py +++ b/server/scripts/generate_models_list.py @@ -1,12 +1,45 @@ from collections import defaultdict from pathlib import Path +import huggingface_hub import requests import yaml from bs4 import BeautifulSoup +WHISPER_MODELS = { + "tiny": "guillaumekln/faster-whisper-tiny", + "base": "guillaumekln/faster-whisper-base", + "small": "guillaumekln/faster-whisper-small", + "medium": "guillaumekln/faster-whisper-medium", + "large-v1": "guillaumekln/faster-whisper-large-v1", + "large-v2": "guillaumekln/faster-whisper-large-v2", + "tiny.en": "guillaumekln/faster-whisper-tiny.en", + "base.en": "guillaumekln/faster-whisper-base.en", + "small.en": "guillaumekln/faster-whisper-small.en", + "medium.en": "guillaumekln/faster-whisper-medium.en", +} + HARDCODED_MODELS = [] +models = [] + +api = huggingface_hub.HfApi() +for name, url in WHISPER_MODELS.items(): + repo_info = api.repo_info(url, files_metadata=True) + models.append( + { + "lang": "English" if name.endswith(".en") else "Universal", + "name": f"whisper-{name}", + "url": url, + "description": "Whisper model doing both transcription and punctuation reconstruction", + "size": f"{int(sum(f.size for f in repo_info.siblings) / 1024 / 1024)}M", + "type": "whisper", + "download_type": "huggingface", + }, + ) + +models.extend(HARDCODED_MODELS) + r = requests.get("https://alphacephei.com/vosk/models") assert r.status_code == 200 soup = BeautifulSoup(r.content, "html.parser") @@ -14,8 +47,6 @@ columns = [x.text for x in table.find_all("th")] rows = table.find("tbody").find_all("tr") - -models = HARDCODED_MODELS current_lang = None for row in rows: if strong := row.find("strong"): @@ -26,8 +57,7 @@ ), "no previous language heading found, probably the format changed :(" raw = {k: v for k, v in zip(columns, row.find_all("td"))} - if current_lang == "English Other" or "not" in raw["Notes"].text.lower(): - continue + current_lang = current_lang.replace("Other", "").strip() if current_lang == "Speaker identification model": continue @@ -46,7 +76,7 @@ description=raw["Notes"].decode_contents(), size=raw["Size"].text, type="transcription", - compressed=True, + download_type="http+zip", ) models += [model]