Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Fedir Zadniprovskyi
committed on
Commit
·
43cc67a
1
Parent(s):
9922993
rename to `speaches`
Browse files
- Dockerfile +4 -4
- README.md +13 -9
- Taskfile.yaml +2 -2
- compose.cpu.yaml +3 -3
- compose.cuda-cdi.yaml +2 -2
- compose.cuda.yaml +3 -3
- compose.observability.yaml +1 -1
- compose.yaml +2 -2
- docs/configuration.md +2 -2
- docs/installation.md +19 -19
- docs/introduction.md +5 -4
- docs/openapi.json +1 -1
- docs/usage/open-webui-integration.md +4 -4
- docs/usage/text-to-speech.md +4 -5
- examples/javascript/index.js +1 -1
- examples/live-audio/script.sh +3 -3
- examples/youtube/script.sh +3 -3
- mkdocs.yml +3 -3
- pyproject.toml +1 -1
- src/{faster_whisper_server → speaches}/__init__.py +0 -0
- src/{faster_whisper_server → speaches}/api_models.py +2 -2
- src/{faster_whisper_server → speaches}/asr.py +3 -3
- src/{faster_whisper_server → speaches}/audio.py +1 -1
- src/{faster_whisper_server → speaches}/config.py +0 -0
- src/{faster_whisper_server → speaches}/dependencies.py +4 -4
- src/{faster_whisper_server → speaches}/gradio_app.py +5 -5
- src/{faster_whisper_server → speaches}/hf_utils.py +1 -1
- src/{faster_whisper_server → speaches}/logger.py +0 -0
- src/{faster_whisper_server → speaches}/main.py +7 -7
- src/{faster_whisper_server → speaches}/model_manager.py +2 -2
- src/{faster_whisper_server → speaches}/routers/__init__.py +0 -0
- src/{faster_whisper_server → speaches}/routers/misc.py +2 -2
- src/{faster_whisper_server → speaches}/routers/models.py +2 -2
- src/{faster_whisper_server → speaches}/routers/speech.py +2 -2
- src/{faster_whisper_server → speaches}/routers/stt.py +8 -8
- src/{faster_whisper_server → speaches}/text_utils.py +2 -2
- src/{faster_whisper_server → speaches}/text_utils_test.py +2 -2
- src/{faster_whisper_server → speaches}/transcriber.py +4 -4
- tests/api_timestamp_granularities_test.py +1 -1
- tests/conftest.py +6 -6
- tests/model_manager_test.py +1 -1
- tests/openai_timestamp_granularities_test.py +1 -1
- tests/speech_test.py +1 -1
- tests/sse_test.py +1 -1
- uv.lock +109 -109
Dockerfile
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
ARG BASE_IMAGE=nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04
|
2 |
# hadolint ignore=DL3006
|
3 |
FROM ${BASE_IMAGE}
|
4 |
-
LABEL org.opencontainers.image.source="https://github.com/
|
5 |
LABEL org.opencontainers.image.licenses="MIT"
|
6 |
# `ffmpeg` is installed because without it `gradio` won't work with mp3(possible others as well) files
|
7 |
# hadolint ignore=DL3008
|
@@ -15,7 +15,7 @@ RUN apt-get update && \
|
|
15 |
USER ubuntu
|
16 |
ENV HOME=/home/ubuntu \
|
17 |
PATH=/home/ubuntu/.local/bin:$PATH
|
18 |
-
WORKDIR $HOME/
|
19 |
# https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
|
20 |
COPY --chown=ubuntu --from=ghcr.io/astral-sh/uv:0.5.14 /uv /bin/uv
|
21 |
# https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
|
@@ -35,7 +35,7 @@ RUN mkdir -p $HOME/.cache/huggingface/hub
|
|
35 |
ENV WHISPER__MODEL=Systran/faster-whisper-large-v3
|
36 |
ENV UVICORN_HOST=0.0.0.0
|
37 |
ENV UVICORN_PORT=8000
|
38 |
-
ENV PATH="$HOME/
|
39 |
# https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhubenablehftransfer
|
40 |
# NOTE: I've disabled this because it doesn't inside of Docker container. I couldn't pinpoint the exact reason. This doesn't happen when running the server locally.
|
41 |
# RuntimeError: An error occurred while downloading using `hf_transfer`. Consider disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling.
|
@@ -44,4 +44,4 @@ ENV HF_HUB_ENABLE_HF_TRANSFER=0
|
|
44 |
# https://www.reddit.com/r/StableDiffusion/comments/1f6asvd/gradio_sends_ip_address_telemetry_by_default/
|
45 |
ENV DO_NOT_TRACK=1
|
46 |
EXPOSE 8000
|
47 |
-
CMD ["uvicorn", "--factory", "
|
|
|
1 |
ARG BASE_IMAGE=nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04
|
2 |
# hadolint ignore=DL3006
|
3 |
FROM ${BASE_IMAGE}
|
4 |
+
LABEL org.opencontainers.image.source="https://github.com/speaches-ai/speaches"
|
5 |
LABEL org.opencontainers.image.licenses="MIT"
|
6 |
# `ffmpeg` is installed because without it `gradio` won't work with mp3(possible others as well) files
|
7 |
# hadolint ignore=DL3008
|
|
|
15 |
USER ubuntu
|
16 |
ENV HOME=/home/ubuntu \
|
17 |
PATH=/home/ubuntu/.local/bin:$PATH
|
18 |
+
WORKDIR $HOME/speaches
|
19 |
# https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
|
20 |
COPY --chown=ubuntu --from=ghcr.io/astral-sh/uv:0.5.14 /uv /bin/uv
|
21 |
# https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
|
|
|
35 |
ENV WHISPER__MODEL=Systran/faster-whisper-large-v3
|
36 |
ENV UVICORN_HOST=0.0.0.0
|
37 |
ENV UVICORN_PORT=8000
|
38 |
+
ENV PATH="$HOME/speaches/.venv/bin:$PATH"
|
39 |
# https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhubenablehftransfer
|
40 |
# NOTE: I've disabled this because it doesn't inside of Docker container. I couldn't pinpoint the exact reason. This doesn't happen when running the server locally.
|
41 |
# RuntimeError: An error occurred while downloading using `hf_transfer`. Consider disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling.
|
|
|
44 |
# https://www.reddit.com/r/StableDiffusion/comments/1f6asvd/gradio_sends_ip_address_telemetry_by_default/
|
45 |
ENV DO_NOT_TRACK=1
|
46 |
EXPOSE 8000
|
47 |
+
CMD ["uvicorn", "--factory", "speaches.main:create_app"]
|
README.md
CHANGED
@@ -1,11 +1,15 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
`faster-whisper-server` is an OpenAI API-compatible transcription server which uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) as its backend.
|
4 |
Features:
|
5 |
|
6 |
- GPU and CPU support.
|
7 |
- Easily deployable using Docker.
|
8 |
-
- **Configurable through environment variables (see [config.py](./src/
|
9 |
- OpenAI API compatible.
|
10 |
- Streaming support (transcription is sent via [SSE](https://en.wikipedia.org/wiki/Server-sent_events) as the audio is transcribed. You don't need to wait for the audio to fully be transcribed before receiving it).
|
11 |
- Live transcription support (audio is sent via websocket as it's generated).
|
@@ -18,7 +22,7 @@ Please create an issue if you find a bug, have a question, or a feature suggesti
|
|
18 |
See [OpenAI API reference](https://platform.openai.com/docs/api-reference/audio) for more information.
|
19 |
|
20 |
- Audio file transcription via `POST /v1/audio/transcriptions` endpoint.
|
21 |
-
- Unlike OpenAI's API, `
|
22 |
- Audio file translation via `POST /v1/audio/translations` endpoint.
|
23 |
- Live audio transcription via `WS /v1/audio/transcriptions` endpoint.
|
24 |
- LocalAgreement2 ([paper](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) | [original implementation](https://github.com/ufal/whisper_streaming)) algorithm is used for live transcription.
|
@@ -35,13 +39,13 @@ See [OpenAI API reference](https://platform.openai.com/docs/api-reference/audio)
|
|
35 |
NOTE: I'm using newer Docker Compsose features. If you are using an older version of Docker Compose, you may need need to update.
|
36 |
|
37 |
```bash
|
38 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
39 |
|
40 |
# for GPU support
|
41 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
42 |
docker compose --file compose.cuda.yaml up --detach
|
43 |
# for CPU only (use this if you don't have a GPU, as the image is much smaller)
|
44 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
45 |
docker compose --file compose.cpu.yaml up --detach
|
46 |
```
|
47 |
|
@@ -49,9 +53,9 @@ docker compose --file compose.cpu.yaml up --detach
|
|
49 |
|
50 |
```bash
|
51 |
# for GPU support
|
52 |
-
docker run --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --detach
|
53 |
# for CPU only (use this if you don't have a GPU, as the image is much smaller)
|
54 |
-
docker run --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=Systran/faster-whisper-small --detach
|
55 |
```
|
56 |
|
57 |
### Using Kubernetes
|
|
|
1 |
+
> [!NOTE]
|
2 |
+
> This project was previously named `faster-whisper-server`. I've decided to change the name from `faster-whisper-server`, as the project has evolved to support more than just transcription.
|
3 |
+
|
4 |
+
# Speaches
|
5 |
+
|
6 |
+
`speaches` is an OpenAI API-compatible server supporting transcription, translation, and speech generation. For transcription/translation it uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and for text-to-speech [piper](https://github.com/rhasspy/piper) is used.
|
7 |
|
|
|
8 |
Features:
|
9 |
|
10 |
- GPU and CPU support.
|
11 |
- Easily deployable using Docker.
|
12 |
+
- **Configurable through environment variables (see [config.py](./src/speaches/config.py))**.
|
13 |
- OpenAI API compatible.
|
14 |
- Streaming support (transcription is sent via [SSE](https://en.wikipedia.org/wiki/Server-sent_events) as the audio is transcribed. You don't need to wait for the audio to fully be transcribed before receiving it).
|
15 |
- Live transcription support (audio is sent via websocket as it's generated).
|
|
|
22 |
See [OpenAI API reference](https://platform.openai.com/docs/api-reference/audio) for more information.
|
23 |
|
24 |
- Audio file transcription via `POST /v1/audio/transcriptions` endpoint.
|
25 |
+
- Unlike OpenAI's API, `speaches` also supports streaming transcriptions (and translations). This is useful for when you want to process large audio files and would rather receive the transcription in chunks as they are processed, rather than waiting for the whole file to be transcribed. It works similarly to chat messages when chatting with LLMs.
|
26 |
- Audio file translation via `POST /v1/audio/translations` endpoint.
|
27 |
- Live audio transcription via `WS /v1/audio/transcriptions` endpoint.
|
28 |
- LocalAgreement2 ([paper](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) | [original implementation](https://github.com/ufal/whisper_streaming)) algorithm is used for live transcription.
|
|
|
39 |
NOTE: I'm using newer Docker Compsose features. If you are using an older version of Docker Compose, you may need need to update.
|
40 |
|
41 |
```bash
|
42 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.yaml
|
43 |
|
44 |
# for GPU support
|
45 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda.yaml
|
46 |
docker compose --file compose.cuda.yaml up --detach
|
47 |
# for CPU only (use this if you don't have a GPU, as the image is much smaller)
|
48 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cpu.yaml
|
49 |
docker compose --file compose.cpu.yaml up --detach
|
50 |
```
|
51 |
|
|
|
53 |
|
54 |
```bash
|
55 |
# for GPU support
|
56 |
+
docker run --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --detach ghcr.io/speaches-ai/speaches:latest-cuda
|
57 |
# for CPU only (use this if you don't have a GPU, as the image is much smaller)
|
58 |
+
docker run --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=Systran/faster-whisper-small --detach ghcr.io/speaches-ai/speaches:latest-cpu
|
59 |
```
|
60 |
|
61 |
### Using Kubernetes
|
Taskfile.yaml
CHANGED
@@ -2,8 +2,8 @@ version: "3"
|
|
2 |
tasks:
|
3 |
server:
|
4 |
cmds:
|
5 |
-
- pkill --signal SIGKILL --echo --full 'uvicorn --factory --host 0.0.0.0
|
6 |
-
- opentelemetry-instrument uvicorn --factory --host 0.0.0.0
|
7 |
sources:
|
8 |
- src/**/*.py
|
9 |
test:
|
|
|
2 |
tasks:
|
3 |
server:
|
4 |
cmds:
|
5 |
+
- pkill --signal SIGKILL --echo --full 'uvicorn --factory --host 0.0.0.0 speaches.main:create_app' || true
|
6 |
+
- opentelemetry-instrument uvicorn --factory --host 0.0.0.0 speaches.main:create_app {{.CLI_ARGS}}
|
7 |
sources:
|
8 |
- src/**/*.py
|
9 |
test:
|
compose.cpu.yaml
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
# include:
|
2 |
# - compose.observability.yaml
|
3 |
services:
|
4 |
-
|
5 |
extends:
|
6 |
file: compose.yaml
|
7 |
-
service:
|
8 |
-
image:
|
9 |
build:
|
10 |
args:
|
11 |
BASE_IMAGE: ubuntu:24.04
|
|
|
1 |
# include:
|
2 |
# - compose.observability.yaml
|
3 |
services:
|
4 |
+
speaches:
|
5 |
extends:
|
6 |
file: compose.yaml
|
7 |
+
service: speaches
|
8 |
+
image: ghcr.io/speaches-ai/speaches:latest-cpu
|
9 |
build:
|
10 |
args:
|
11 |
BASE_IMAGE: ubuntu:24.04
|
compose.cuda-cdi.yaml
CHANGED
@@ -4,10 +4,10 @@
|
|
4 |
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html
|
5 |
# https://docs.docker.com/reference/cli/dockerd/#enable-cdi-devices
|
6 |
services:
|
7 |
-
|
8 |
extends:
|
9 |
file: compose.cuda.yaml
|
10 |
-
service:
|
11 |
volumes:
|
12 |
- hf-hub-cache:/home/ubuntu/.cache/huggingface/hub
|
13 |
deploy:
|
|
|
4 |
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html
|
5 |
# https://docs.docker.com/reference/cli/dockerd/#enable-cdi-devices
|
6 |
services:
|
7 |
+
speaches:
|
8 |
extends:
|
9 |
file: compose.cuda.yaml
|
10 |
+
service: speaches
|
11 |
volumes:
|
12 |
- hf-hub-cache:/home/ubuntu/.cache/huggingface/hub
|
13 |
deploy:
|
compose.cuda.yaml
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
# include:
|
2 |
# - compose.observability.yaml
|
3 |
services:
|
4 |
-
|
5 |
extends:
|
6 |
file: compose.yaml
|
7 |
-
service:
|
8 |
-
image:
|
9 |
build:
|
10 |
args:
|
11 |
BASE_IMAGE: nvidia/cuda:12.6.2-cudnn-runtime-ubuntu24.04
|
|
|
1 |
# include:
|
2 |
# - compose.observability.yaml
|
3 |
services:
|
4 |
+
speaches:
|
5 |
extends:
|
6 |
file: compose.yaml
|
7 |
+
service: speaches
|
8 |
+
image: ghcr.io/speaches-ai/speaches:latest-cuda
|
9 |
build:
|
10 |
args:
|
11 |
BASE_IMAGE: nvidia/cuda:12.6.2-cudnn-runtime-ubuntu24.04
|
compose.observability.yaml
CHANGED
@@ -5,7 +5,7 @@ services:
|
|
5 |
volumes:
|
6 |
- ./configuration/opentelemetry-collector.yaml:/etc/opentelemetry-collector.yaml
|
7 |
ports:
|
8 |
-
# NOTE: when `
|
9 |
- 4317:4317 # OTLP gRPC receiver
|
10 |
# - 4318:4318 # OTLP HTTP receiver
|
11 |
# - 8888:8888 # Prometheus metrics exposed by the Collector
|
|
|
5 |
volumes:
|
6 |
- ./configuration/opentelemetry-collector.yaml:/etc/opentelemetry-collector.yaml
|
7 |
ports:
|
8 |
+
# NOTE: when `speaches` is also running as a Docker Compose service, this doesn't need to be exposed.
|
9 |
- 4317:4317 # OTLP gRPC receiver
|
10 |
# - 4318:4318 # OTLP HTTP receiver
|
11 |
# - 8888:8888 # Prometheus metrics exposed by the Collector
|
compose.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
# TODO: https://docs.astral.sh/uv/guides/integration/docker/#configuring-watch-with-docker-compose
|
2 |
services:
|
3 |
-
|
4 |
-
container_name:
|
5 |
build:
|
6 |
dockerfile: Dockerfile
|
7 |
context: .
|
|
|
1 |
# TODO: https://docs.astral.sh/uv/guides/integration/docker/#configuring-watch-with-docker-compose
|
2 |
services:
|
3 |
+
speaches:
|
4 |
+
container_name: speaches
|
5 |
build:
|
6 |
dockerfile: Dockerfile
|
7 |
context: .
|
docs/configuration.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
<!-- https://mkdocstrings.github.io/python/usage/configuration/general/ -->
|
2 |
-
:::
|
3 |
options:
|
4 |
show_bases: true
|
5 |
show_if_no_docstring: true
|
@@ -16,7 +16,7 @@
|
|
16 |
- "!speech_*"
|
17 |
- "!transcription_*"
|
18 |
|
19 |
-
:::
|
20 |
|
21 |
<!-- TODO: nested model `whisper` -->
|
22 |
<!-- TODO: Insert new lines for multi-line docstrings -->
|
|
|
1 |
<!-- https://mkdocstrings.github.io/python/usage/configuration/general/ -->
|
2 |
+
::: speaches.config.Config
|
3 |
options:
|
4 |
show_bases: true
|
5 |
show_if_no_docstring: true
|
|
|
16 |
- "!speech_*"
|
17 |
- "!transcription_*"
|
18 |
|
19 |
+
::: speaches.config.WhisperConfig
|
20 |
|
21 |
<!-- TODO: nested model `whisper` -->
|
22 |
<!-- TODO: Insert new lines for multi-line docstrings -->
|
docs/installation.md
CHANGED
@@ -9,25 +9,25 @@ Download the necessary Docker Compose files
|
|
9 |
=== "CUDA"
|
10 |
|
11 |
```bash
|
12 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
13 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
14 |
export COMPOSE_FILE=compose.cuda.yaml
|
15 |
```
|
16 |
|
17 |
=== "CUDA (with CDI feature enabled)"
|
18 |
|
19 |
```bash
|
20 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
21 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
22 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
23 |
export COMPOSE_FILE=compose.cuda-cdi.yaml
|
24 |
```
|
25 |
|
26 |
=== "CPU"
|
27 |
|
28 |
```bash
|
29 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
30 |
-
curl --silent --remote-name https://raw.githubusercontent.com/
|
31 |
export COMPOSE_FILE=compose.cpu.yaml
|
32 |
```
|
33 |
|
@@ -58,10 +58,10 @@ docker compose up --detach
|
|
58 |
--rm \
|
59 |
--detach \
|
60 |
--publish 8000:8000 \
|
61 |
-
--name
|
62 |
--volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
|
63 |
--gpus=all \
|
64 |
-
|
65 |
```
|
66 |
|
67 |
=== "CUDA (with CDI feature enabled)"
|
@@ -71,10 +71,10 @@ docker compose up --detach
|
|
71 |
--rm \
|
72 |
--detach \
|
73 |
--publish 8000:8000 \
|
74 |
-
--name
|
75 |
--volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
|
76 |
--device=nvidia.com/gpu=all \
|
77 |
-
|
78 |
```
|
79 |
|
80 |
=== "CPU"
|
@@ -84,31 +84,31 @@ docker compose up --detach
|
|
84 |
--rm \
|
85 |
--detach \
|
86 |
--publish 8000:8000 \
|
87 |
-
--name
|
88 |
--volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
|
89 |
-
|
90 |
```
|
91 |
|
92 |
??? note "Build from source"
|
93 |
|
94 |
```bash
|
95 |
-
docker build --tag
|
96 |
|
97 |
# NOTE: you need to install and enable [buildx](https://github.com/docker/buildx) for multi-platform builds
|
98 |
# Build image for both amd64 and arm64
|
99 |
-
docker buildx build --tag
|
100 |
|
101 |
# Build image without CUDA support
|
102 |
-
docker build --tag
|
103 |
```
|
104 |
|
105 |
## Python (requires Python 3.12+ and `uv` package manager)
|
106 |
|
107 |
```bash
|
108 |
-
git clone https://github.com/
|
109 |
-
cd
|
110 |
uv venv
|
111 |
sourve .venv/bin/activate
|
112 |
uv sync --all-extras
|
113 |
-
uvicorn --factory --host 0.0.0.0
|
114 |
```
|
|
|
9 |
=== "CUDA"
|
10 |
|
11 |
```bash
|
12 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.yaml
|
13 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda.yaml
|
14 |
export COMPOSE_FILE=compose.cuda.yaml
|
15 |
```
|
16 |
|
17 |
=== "CUDA (with CDI feature enabled)"
|
18 |
|
19 |
```bash
|
20 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.yaml
|
21 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda.yaml
|
22 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda-cdi.yaml
|
23 |
export COMPOSE_FILE=compose.cuda-cdi.yaml
|
24 |
```
|
25 |
|
26 |
=== "CPU"
|
27 |
|
28 |
```bash
|
29 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.yaml
|
30 |
+
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cpu.yaml
|
31 |
export COMPOSE_FILE=compose.cpu.yaml
|
32 |
```
|
33 |
|
|
|
58 |
--rm \
|
59 |
--detach \
|
60 |
--publish 8000:8000 \
|
61 |
+
--name speaches \
|
62 |
--volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
|
63 |
--gpus=all \
|
64 |
+
ghcr.io/speaches-ai/speaches:latest-cuda
|
65 |
```
|
66 |
|
67 |
=== "CUDA (with CDI feature enabled)"
|
|
|
71 |
--rm \
|
72 |
--detach \
|
73 |
--publish 8000:8000 \
|
74 |
+
--name speaches \
|
75 |
--volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
|
76 |
--device=nvidia.com/gpu=all \
|
77 |
+
ghcr.io/speaches-ai/speaches:latest-cuda
|
78 |
```
|
79 |
|
80 |
=== "CPU"
|
|
|
84 |
--rm \
|
85 |
--detach \
|
86 |
--publish 8000:8000 \
|
87 |
+
--name speaches \
|
88 |
--volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
|
89 |
+
ghcr.io/speaches-ai/speaches:latest-cpu
|
90 |
```
|
91 |
|
92 |
??? note "Build from source"
|
93 |
|
94 |
```bash
|
95 |
+
docker build --tag speaches .
|
96 |
|
97 |
# NOTE: you need to install and enable [buildx](https://github.com/docker/buildx) for multi-platform builds
|
98 |
# Build image for both amd64 and arm64
|
99 |
+
docker buildx build --tag speaches --platform linux/amd64,linux/arm64 .
|
100 |
|
101 |
# Build image without CUDA support
|
102 |
+
docker build --tag speaches --build-arg BASE_IMAGE=ubuntu:24.04 .
|
103 |
```
|
104 |
|
105 |
## Python (requires Python 3.12+ and `uv` package manager)
|
106 |
|
107 |
```bash
|
108 |
+
git clone https://github.com/speaches-ai/speaches.git
|
109 |
+
cd speaches
|
110 |
uv venv
|
111 |
sourve .venv/bin/activate
|
112 |
uv sync --all-extras
|
113 |
+
uvicorn --factory --host 0.0.0.0 speaches.main:create_app
|
114 |
```
|
docs/introduction.md
CHANGED
@@ -8,19 +8,20 @@
|
|
8 |
|
9 |
TODO: add HuggingFace Space URL
|
10 |
|
11 |
-
#
|
12 |
|
13 |
-
`
|
14 |
|
15 |
## Features:
|
16 |
|
17 |
- GPU and CPU support.
|
18 |
- [Deployable via Docker Compose / Docker](./installation.md)
|
19 |
- [Highly configurable](./configuration.md)
|
20 |
-
- OpenAI API compatible. All tools and SDKs that work with OpenAI's API should work with `
|
21 |
- Streaming support (transcription is sent via [SSE](https://en.wikipedia.org/wiki/Server-sent_events) as the audio is transcribed. You don't need to wait for the audio to fully be transcribed before receiving it).
|
22 |
- Live transcription support (audio is sent via websocket as it's generated).
|
23 |
- Dynamic model loading / offloading. Just specify which model you want to use in the request and it will be loaded automatically. It will then be unloaded after a period of inactivity.
|
|
|
24 |
- (Coming soon) Audio generation (chat completions endpoint) | [OpenAI Documentation](https://platform.openai.com/docs/guides/realtime)
|
25 |
- Generate a spoken audio summary of a body of text (text in, audio out)
|
26 |
- Perform sentiment analysis on a recording (audio in, text out)
|
@@ -34,7 +35,7 @@ Please create an issue if you find a bug, have a question, or a feature suggesti
|
|
34 |
See [OpenAI API reference](https://platform.openai.com/docs/api-reference/audio) for more information.
|
35 |
|
36 |
- Audio file transcription via `POST /v1/audio/transcriptions` endpoint.
|
37 |
-
- Unlike OpenAI's API, `
|
38 |
- Audio file translation via `POST /v1/audio/translations` endpoint.
|
39 |
- Live audio transcription via `WS /v1/audio/transcriptions` endpoint.
|
40 |
- LocalAgreement2 ([paper](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) | [original implementation](https://github.com/ufal/whisper_streaming)) algorithm is used for live transcription.
|
|
|
8 |
|
9 |
TODO: add HuggingFace Space URL
|
10 |
|
11 |
+
# Speaches
|
12 |
|
13 |
+
`speaches` is an OpenAI API-compatible server supporting transcription, translation, and speech generation. For transcription/translation it uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and for text-to-speech [piper](https://github.com/rhasspy/piper) is used.
|
14 |
|
15 |
## Features:
|
16 |
|
17 |
- GPU and CPU support.
|
18 |
- [Deployable via Docker Compose / Docker](./installation.md)
|
19 |
- [Highly configurable](./configuration.md)
|
20 |
+
- OpenAI API compatible. All tools and SDKs that work with OpenAI's API should work with `speaches`.
|
21 |
- Streaming support (transcription is sent via [SSE](https://en.wikipedia.org/wiki/Server-sent_events) as the audio is transcribed. You don't need to wait for the audio to fully be transcribed before receiving it).
|
22 |
- Live transcription support (audio is sent via websocket as it's generated).
|
23 |
- Dynamic model loading / offloading. Just specify which model you want to use in the request and it will be loaded automatically. It will then be unloaded after a period of inactivity.
|
24 |
+
- [Text-to-speech (TTS) via `piper`]
|
25 |
- (Coming soon) Audio generation (chat completions endpoint) | [OpenAI Documentation](https://platform.openai.com/docs/guides/realtime)
|
26 |
- Generate a spoken audio summary of a body of text (text in, audio out)
|
27 |
- Perform sentiment analysis on a recording (audio in, text out)
|
|
|
35 |
See [OpenAI API reference](https://platform.openai.com/docs/api-reference/audio) for more information.
|
36 |
|
37 |
- Audio file transcription via `POST /v1/audio/transcriptions` endpoint.
|
38 |
+
- Unlike OpenAI's API, `speaches` also supports streaming transcriptions (and translations). This is useful for when you want to process large audio files and would rather receive the transcription in chunks as they are processed, rather than waiting for the whole file to be transcribed. It works similarly to chat messages when chatting with LLMs.
|
39 |
- Audio file translation via `POST /v1/audio/translations` endpoint.
|
40 |
- Live audio transcription via `WS /v1/audio/transcriptions` endpoint.
|
41 |
- LocalAgreement2 ([paper](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) | [original implementation](https://github.com/ufal/whisper_streaming)) algorithm is used for live transcription.
|
docs/openapi.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"openapi":"3.1.0","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/v1/audio/translations":{"post":{"tags":["automatic-speech-recognition"],"summary":"Translate File","operationId":"translate_file_v1_audio_translations_post","requestBody":{"content":{"application/x-www-form-urlencoded":{"schema":{"$ref":"#/components/schemas/Body_translate_file_v1_audio_translations_post"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/CreateTranscriptionResponseJson"},{"$ref":"#/components/schemas/CreateTranscriptionResponseVerboseJson"}],"title":"Response Translate File V1 Audio Translations Post"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/transcriptions":{"post":{"tags":["automatic-speech-recognition"],"summary":"Transcribe File","operationId":"transcribe_file_v1_audio_transcriptions_post","requestBody":{"content":{"application/x-www-form-urlencoded":{"schema":{"$ref":"#/components/schemas/Body_transcribe_file_v1_audio_transcriptions_post"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/CreateTranscriptionResponseJson"},{"$ref":"#/components/schemas/CreateTranscriptionResponseVerboseJson"}],"title":"Response Transcribe File V1 Audio Transcriptions Post"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/models":{"get":{"tags":["models"],"summary":"Get Models","operationId":"get_models_v1_models_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ListModelsResponse"}}}}}}},"/v1/models/{model_name}":{"get":{"tags":["models"],"summary":"Get 
Model","operationId":"get_model_v1_models__model_name__get","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"},"example":"Systran/faster-distil-whisper-large-v3"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/Model"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/health":{"get":{"tags":["diagnostic"],"summary":"Health","operationId":"health_health_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/api/pull/{model_name}":{"post":{"tags":["experimental"],"summary":"Download a model from Hugging Face.","operationId":"pull_model_api_pull__model_name__post","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/ps":{"get":{"tags":["experimental"],"summary":"Get a list of loaded models.","operationId":"get_running_models_api_ps_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Running Models Api Ps Get"}}}}}}},"/api/ps/{model_name}":{"post":{"tags":["experimental"],"summary":"Load a model into memory.","operationId":"load_model_route_api_ps__model_name__post","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"tags":["experimental"],"summary":"Unload a model from memory.","operationId":"stop_running_model_api_ps__model_name__delete","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/speech":{"post":{"tags":["speech-to-text"],"summary":"Synthesize","operationId":"synthesize_v1_audio_speech_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CreateSpeechRequestBody"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/speech/voices":{"get":{"tags":["speech-to-text"],"summary":"List Voices","operationId":"list_voices_v1_audio_speech_voices_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"$ref":"#/components/schemas/PiperModel"},"type":"array","title":"Response List Voices V1 Audio Speech Voices Get"}}}}}}}},"components":{"schemas":{"Body_transcribe_file_v1_audio_transcriptions_post":{"properties":{"model":{"anyOf":[{"type":"string","description":"The ID of the model. 
You can get a list of available models by calling `/v1/models`.","examples":["Systran/faster-distil-whisper-large-v3","bofenghuang/whisper-large-v2-cv11-french-ct2"]},{"type":"null"}],"title":"Model"},"language":{"anyOf":[{"$ref":"#/components/schemas/Language"},{"type":"null"}]},"prompt":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Prompt"},"response_format":{"anyOf":[{"$ref":"#/components/schemas/faster_whisper_server__config__ResponseFormat"},{"type":"null"}]},"temperature":{"type":"number","title":"Temperature","default":0.0},"timestamp_granularities":{"items":{"type":"string","enum":["segment","word"]},"type":"array","title":"Timestamp Granularities","default":["segment"]},"stream":{"type":"boolean","title":"Stream","default":false},"hotwords":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Hotwords"},"vad_filter":{"type":"boolean","title":"Vad Filter","default":false},"file":{"type":"string","format":"binary","title":"File"}},"type":"object","required":["file"],"title":"Body_transcribe_file_v1_audio_transcriptions_post"},"Body_translate_file_v1_audio_translations_post":{"properties":{"model":{"anyOf":[{"type":"string","description":"The ID of the model. 
You can get a list of available models by calling `/v1/models`.","examples":["Systran/faster-distil-whisper-large-v3","bofenghuang/whisper-large-v2-cv11-french-ct2"]},{"type":"null"}],"title":"Model"},"prompt":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Prompt"},"response_format":{"anyOf":[{"$ref":"#/components/schemas/faster_whisper_server__config__ResponseFormat"},{"type":"null"}]},"temperature":{"type":"number","title":"Temperature","default":0.0},"stream":{"type":"boolean","title":"Stream","default":false},"vad_filter":{"type":"boolean","title":"Vad Filter","default":false},"file":{"type":"string","format":"binary","title":"File"}},"type":"object","required":["file"],"title":"Body_translate_file_v1_audio_translations_post"},"CreateSpeechRequestBody":{"properties":{"model":{"type":"string","enum":["piper"],"const":"piper","title":"Model","description":"The ID of the model. The only supported model is 'piper'.","default":"piper","examples":["piper"]},"input":{"type":"string","title":"Input","description":"The text to generate audio for. ","examples":["A rainbow is an optical phenomenon caused by refraction, internal reflection and dispersion of light in water droplets resulting in a continuous spectrum of light appearing in the sky. The rainbow takes the form of a multicoloured circular arc. Rainbows caused by sunlight always appear in the section of sky directly opposite the Sun. Rainbows can be caused by many forms of airborne water. These include not only rain, but also mist, spray, and airborne dew."]},"voice":{"type":"string","title":"Voice","default":"en_US-amy-medium"},"response_format":{"$ref":"#/components/schemas/faster_whisper_server__routers__speech__ResponseFormat","description":"The format to audio in. Supported formats are mp3, flac, wav, pcm. 
opus, aac are not supported","default":"mp3","examples":["mp3","flac","wav","pcm"]},"speed":{"type":"number","maximum":4.0,"minimum":0.25,"title":"Speed","default":1.0},"sample_rate":{"anyOf":[{"type":"integer","maximum":48000.0,"minimum":8000.0},{"type":"null"}],"title":"Sample Rate"}},"type":"object","required":["input"],"title":"CreateSpeechRequestBody"},"CreateTranscriptionResponseJson":{"properties":{"text":{"type":"string","title":"Text"}},"type":"object","required":["text"],"title":"CreateTranscriptionResponseJson"},"CreateTranscriptionResponseVerboseJson":{"properties":{"task":{"type":"string","title":"Task","default":"transcribe"},"language":{"type":"string","title":"Language"},"duration":{"type":"number","title":"Duration"},"text":{"type":"string","title":"Text"},"words":{"anyOf":[{"items":{"$ref":"#/components/schemas/TranscriptionWord"},"type":"array"},{"type":"null"}],"title":"Words"},"segments":{"items":{"$ref":"#/components/schemas/TranscriptionSegment"},"type":"array","title":"Segments"}},"type":"object","required":["language","duration","text","words","segments"],"title":"CreateTranscriptionResponseVerboseJson"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"Language":{"type":"string","enum":["af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","yue","zh"],"title":"Language"},"ListModelsResponse":{"properties":{"data":{"items":{"$ref":"#/components/schemas/Model"},"type":"array","titl
e":"Data"},"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"ListModelsResponse"},"Model":{"properties":{"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object"},"owned_by":{"type":"string","title":"Owned By"},"language":{"items":{"type":"string"},"type":"array","title":"Language"}},"type":"object","required":["id","created","object","owned_by"],"title":"Model","examples":[{"created":1700732060,"id":"Systran/faster-whisper-large-v3","object":"model","owned_by":"Systran"},{"created":1711378296,"id":"Systran/faster-distil-whisper-large-v3","object":"model","owned_by":"Systran"},{"created":1687968011,"id":"bofenghuang/whisper-large-v2-cv11-french-ct2","object":"model","owned_by":"bofenghuang"}]},"PiperModel":{"properties":{"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"created":{"type":"integer","title":"Created"},"owned_by":{"type":"string","enum":["rhasspy"],"const":"rhasspy","title":"Owned By","default":"rhasspy"},"model_path":{"type":"string","format":"path","title":"Model Path","examples":["/home/nixos/.cache/huggingface/hub/models--rhasspy--piper-voices/snapshots/3d796cc2f2c884b3517c527507e084f7bb245aea/en/en_US/amy/medium/en_US-amy-medium.onnx"]},"id":{"type":"string","title":"Id","readOnly":true,"examples":["rhasspy/piper-voices/en_US-amy-medium"]},"voice":{"type":"string","title":"Voice","readOnly":true,"examples":["rhasspy/piper-voices/en_US-amy-medium"]},"config_path":{"type":"string","format":"path","title":"Config Path","readOnly":true},"quality":{"type":"string","enum":["x_low","low","medium","high"],"title":"Quality","readOnly":true},"sample_rate":{"type":"integer","title":"Sample 
Rate","readOnly":true}},"type":"object","required":["created","model_path","id","voice","config_path","quality","sample_rate"],"title":"PiperModel","description":"Similar structure to the GET /v1/models response but with extra fields."},"TranscriptionSegment":{"properties":{"id":{"type":"integer","title":"Id"},"seek":{"type":"integer","title":"Seek"},"start":{"type":"number","title":"Start"},"end":{"type":"number","title":"End"},"text":{"type":"string","title":"Text"},"tokens":{"items":{"type":"integer"},"type":"array","title":"Tokens"},"temperature":{"type":"number","title":"Temperature"},"avg_logprob":{"type":"number","title":"Avg Logprob"},"compression_ratio":{"type":"number","title":"Compression Ratio"},"no_speech_prob":{"type":"number","title":"No Speech Prob"},"words":{"anyOf":[{"items":{"$ref":"#/components/schemas/TranscriptionWord"},"type":"array"},{"type":"null"}],"title":"Words"}},"type":"object","required":["id","seek","start","end","text","tokens","temperature","avg_logprob","compression_ratio","no_speech_prob","words"],"title":"TranscriptionSegment"},"TranscriptionWord":{"properties":{"start":{"type":"number","title":"Start"},"end":{"type":"number","title":"End"},"word":{"type":"string","title":"Word"},"probability":{"type":"number","title":"Probability"}},"type":"object","required":["start","end","word","probability"],"title":"TranscriptionWord"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error 
Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"faster_whisper_server__config__ResponseFormat":{"type":"string","enum":["text","json","verbose_json","srt","vtt"],"title":"ResponseFormat"},"faster_whisper_server__routers__speech__ResponseFormat":{"type":"string","enum":["mp3","flac","wav","pcm"]}}},"tags":[{"name":"automatic-speech-recognition"},{"name":"speech-to-text"},{"name":"models"},{"name":"diagnostic"},{"name":"experimental","description":"Not meant for public use yet. May change or be removed at any time."}]}
|
|
|
1 |
+
{"openapi":"3.1.0","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/v1/audio/translations":{"post":{"tags":["automatic-speech-recognition"],"summary":"Translate File","operationId":"translate_file_v1_audio_translations_post","requestBody":{"content":{"application/x-www-form-urlencoded":{"schema":{"$ref":"#/components/schemas/Body_translate_file_v1_audio_translations_post"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/CreateTranscriptionResponseJson"},{"$ref":"#/components/schemas/CreateTranscriptionResponseVerboseJson"}],"title":"Response Translate File V1 Audio Translations Post"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/transcriptions":{"post":{"tags":["automatic-speech-recognition"],"summary":"Transcribe File","operationId":"transcribe_file_v1_audio_transcriptions_post","requestBody":{"content":{"application/x-www-form-urlencoded":{"schema":{"$ref":"#/components/schemas/Body_transcribe_file_v1_audio_transcriptions_post"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/CreateTranscriptionResponseJson"},{"$ref":"#/components/schemas/CreateTranscriptionResponseVerboseJson"}],"title":"Response Transcribe File V1 Audio Transcriptions Post"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/models":{"get":{"tags":["models"],"summary":"Get Models","operationId":"get_models_v1_models_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ListModelsResponse"}}}}}}},"/v1/models/{model_name}":{"get":{"tags":["models"],"summary":"Get 
Model","operationId":"get_model_v1_models__model_name__get","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"},"example":"Systran/faster-distil-whisper-large-v3"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/Model"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/health":{"get":{"tags":["diagnostic"],"summary":"Health","operationId":"health_health_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/api/pull/{model_name}":{"post":{"tags":["experimental"],"summary":"Download a model from Hugging Face.","operationId":"pull_model_api_pull__model_name__post","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/ps":{"get":{"tags":["experimental"],"summary":"Get a list of loaded models.","operationId":"get_running_models_api_ps_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Running Models Api Ps Get"}}}}}}},"/api/ps/{model_name}":{"post":{"tags":["experimental"],"summary":"Load a model into memory.","operationId":"load_model_route_api_ps__model_name__post","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"tags":["experimental"],"summary":"Unload a model from memory.","operationId":"stop_running_model_api_ps__model_name__delete","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/speech":{"post":{"tags":["speech-to-text"],"summary":"Synthesize","operationId":"synthesize_v1_audio_speech_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CreateSpeechRequestBody"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/speech/voices":{"get":{"tags":["speech-to-text"],"summary":"List Voices","operationId":"list_voices_v1_audio_speech_voices_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"$ref":"#/components/schemas/PiperModel"},"type":"array","title":"Response List Voices V1 Audio Speech Voices Get"}}}}}}}},"components":{"schemas":{"Body_transcribe_file_v1_audio_transcriptions_post":{"properties":{"model":{"anyOf":[{"type":"string","description":"The ID of the model. 
You can get a list of available models by calling `/v1/models`.","examples":["Systran/faster-distil-whisper-large-v3","bofenghuang/whisper-large-v2-cv11-french-ct2"]},{"type":"null"}],"title":"Model"},"language":{"anyOf":[{"$ref":"#/components/schemas/Language"},{"type":"null"}]},"prompt":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Prompt"},"response_format":{"anyOf":[{"$ref":"#/components/schemas/speaches__config__ResponseFormat"},{"type":"null"}]},"temperature":{"type":"number","title":"Temperature","default":0.0},"timestamp_granularities":{"items":{"type":"string","enum":["segment","word"]},"type":"array","title":"Timestamp Granularities","default":["segment"]},"stream":{"type":"boolean","title":"Stream","default":false},"hotwords":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Hotwords"},"vad_filter":{"type":"boolean","title":"Vad Filter","default":false},"file":{"type":"string","format":"binary","title":"File"}},"type":"object","required":["file"],"title":"Body_transcribe_file_v1_audio_transcriptions_post"},"Body_translate_file_v1_audio_translations_post":{"properties":{"model":{"anyOf":[{"type":"string","description":"The ID of the model. 
You can get a list of available models by calling `/v1/models`.","examples":["Systran/faster-distil-whisper-large-v3","bofenghuang/whisper-large-v2-cv11-french-ct2"]},{"type":"null"}],"title":"Model"},"prompt":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Prompt"},"response_format":{"anyOf":[{"$ref":"#/components/schemas/speaches__config__ResponseFormat"},{"type":"null"}]},"temperature":{"type":"number","title":"Temperature","default":0.0},"stream":{"type":"boolean","title":"Stream","default":false},"vad_filter":{"type":"boolean","title":"Vad Filter","default":false},"file":{"type":"string","format":"binary","title":"File"}},"type":"object","required":["file"],"title":"Body_translate_file_v1_audio_translations_post"},"CreateSpeechRequestBody":{"properties":{"model":{"type":"string","enum":["piper"],"const":"piper","title":"Model","description":"The ID of the model. The only supported model is 'piper'.","default":"piper","examples":["piper"]},"input":{"type":"string","title":"Input","description":"The text to generate audio for. ","examples":["A rainbow is an optical phenomenon caused by refraction, internal reflection and dispersion of light in water droplets resulting in a continuous spectrum of light appearing in the sky. The rainbow takes the form of a multicoloured circular arc. Rainbows caused by sunlight always appear in the section of sky directly opposite the Sun. Rainbows can be caused by many forms of airborne water. These include not only rain, but also mist, spray, and airborne dew."]},"voice":{"type":"string","title":"Voice","default":"en_US-amy-medium"},"response_format":{"$ref":"#/components/schemas/speaches__routers__speech__ResponseFormat","description":"The format to audio in. Supported formats are mp3, flac, wav, pcm. 
opus, aac are not supported","default":"mp3","examples":["mp3","flac","wav","pcm"]},"speed":{"type":"number","maximum":4.0,"minimum":0.25,"title":"Speed","default":1.0},"sample_rate":{"anyOf":[{"type":"integer","maximum":48000.0,"minimum":8000.0},{"type":"null"}],"title":"Sample Rate"}},"type":"object","required":["input"],"title":"CreateSpeechRequestBody"},"CreateTranscriptionResponseJson":{"properties":{"text":{"type":"string","title":"Text"}},"type":"object","required":["text"],"title":"CreateTranscriptionResponseJson"},"CreateTranscriptionResponseVerboseJson":{"properties":{"task":{"type":"string","title":"Task","default":"transcribe"},"language":{"type":"string","title":"Language"},"duration":{"type":"number","title":"Duration"},"text":{"type":"string","title":"Text"},"words":{"anyOf":[{"items":{"$ref":"#/components/schemas/TranscriptionWord"},"type":"array"},{"type":"null"}],"title":"Words"},"segments":{"items":{"$ref":"#/components/schemas/TranscriptionSegment"},"type":"array","title":"Segments"}},"type":"object","required":["language","duration","text","words","segments"],"title":"CreateTranscriptionResponseVerboseJson"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"Language":{"type":"string","enum":["af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","yue","zh"],"title":"Language"},"ListModelsResponse":{"properties":{"data":{"items":{"$ref":"#/components/schemas/Model"},"type":"array","titl
e":"Data"},"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"ListModelsResponse"},"Model":{"properties":{"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object"},"owned_by":{"type":"string","title":"Owned By"},"language":{"items":{"type":"string"},"type":"array","title":"Language"}},"type":"object","required":["id","created","object","owned_by"],"title":"Model","examples":[{"created":1700732060,"id":"Systran/faster-whisper-large-v3","object":"model","owned_by":"Systran"},{"created":1711378296,"id":"Systran/faster-distil-whisper-large-v3","object":"model","owned_by":"Systran"},{"created":1687968011,"id":"bofenghuang/whisper-large-v2-cv11-french-ct2","object":"model","owned_by":"bofenghuang"}]},"PiperModel":{"properties":{"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"created":{"type":"integer","title":"Created"},"owned_by":{"type":"string","enum":["rhasspy"],"const":"rhasspy","title":"Owned By","default":"rhasspy"},"model_path":{"type":"string","format":"path","title":"Model Path","examples":["/home/nixos/.cache/huggingface/hub/models--rhasspy--piper-voices/snapshots/3d796cc2f2c884b3517c527507e084f7bb245aea/en/en_US/amy/medium/en_US-amy-medium.onnx"]},"id":{"type":"string","title":"Id","readOnly":true,"examples":["rhasspy/piper-voices/en_US-amy-medium"]},"voice":{"type":"string","title":"Voice","readOnly":true,"examples":["rhasspy/piper-voices/en_US-amy-medium"]},"config_path":{"type":"string","format":"path","title":"Config Path","readOnly":true},"quality":{"type":"string","enum":["x_low","low","medium","high"],"title":"Quality","readOnly":true},"sample_rate":{"type":"integer","title":"Sample 
Rate","readOnly":true}},"type":"object","required":["created","model_path","id","voice","config_path","quality","sample_rate"],"title":"PiperModel","description":"Similar structure to the GET /v1/models response but with extra fields."},"TranscriptionSegment":{"properties":{"id":{"type":"integer","title":"Id"},"seek":{"type":"integer","title":"Seek"},"start":{"type":"number","title":"Start"},"end":{"type":"number","title":"End"},"text":{"type":"string","title":"Text"},"tokens":{"items":{"type":"integer"},"type":"array","title":"Tokens"},"temperature":{"type":"number","title":"Temperature"},"avg_logprob":{"type":"number","title":"Avg Logprob"},"compression_ratio":{"type":"number","title":"Compression Ratio"},"no_speech_prob":{"type":"number","title":"No Speech Prob"},"words":{"anyOf":[{"items":{"$ref":"#/components/schemas/TranscriptionWord"},"type":"array"},{"type":"null"}],"title":"Words"}},"type":"object","required":["id","seek","start","end","text","tokens","temperature","avg_logprob","compression_ratio","no_speech_prob","words"],"title":"TranscriptionSegment"},"TranscriptionWord":{"properties":{"start":{"type":"number","title":"Start"},"end":{"type":"number","title":"End"},"word":{"type":"string","title":"Word"},"probability":{"type":"number","title":"Probability"}},"type":"object","required":["start","end","word","probability"],"title":"TranscriptionWord"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error 
Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"speaches__config__ResponseFormat":{"type":"string","enum":["text","json","verbose_json","srt","vtt"],"title":"ResponseFormat"},"speaches__routers__speech__ResponseFormat":{"type":"string","enum":["mp3","flac","wav","pcm"]}}},"tags":[{"name":"automatic-speech-recognition"},{"name":"speech-to-text"},{"name":"models"},{"name":"diagnostic"},{"name":"experimental","description":"Not meant for public use yet. May change or be removed at any time."}]}
|
docs/usage/open-webui-integration.md
CHANGED
@@ -6,7 +6,7 @@
|
|
6 |
2. Click on the "Audio" tab
|
7 |
3. Update settings
|
8 |
- Speech-to-Text Engine: OpenAI
|
9 |
-
- API Base URL: http://
|
10 |
- API Key: does-not-matter-what-you-put-but-should-not-be-empty
|
11 |
- Model: Systran/faster-distil-whisper-large-v3
|
12 |
4. Click "Save"
|
@@ -27,10 +27,10 @@ services:
|
|
27 |
...
|
28 |
# Environment variables are documented here https://docs.openwebui.com/getting-started/env-configuration#speech-to-text
|
29 |
AUDIO_STT_ENGINE: "openai"
|
30 |
-
AUDIO_STT_OPENAI_API_BASE_URL: "http://
|
31 |
AUDIO_STT_OPENAI_API_KEY: "does-not-matter-what-you-put-but-should-not-be-empty"
|
32 |
AUDIO_STT_MODEL: "Systran/faster-distil-whisper-large-v3"
|
33 |
-
|
34 |
-
image:
|
35 |
...
|
36 |
```
|
|
|
6 |
2. Click on the "Audio" tab
|
7 |
3. Update settings
|
8 |
- Speech-to-Text Engine: OpenAI
|
9 |
+
- API Base URL: http://speaches:8000/v1
|
10 |
- API Key: does-not-matter-what-you-put-but-should-not-be-empty
|
11 |
- Model: Systran/faster-distil-whisper-large-v3
|
12 |
4. Click "Save"
|
|
|
27 |
...
|
28 |
# Environment variables are documented here https://docs.openwebui.com/getting-started/env-configuration#speech-to-text
|
29 |
AUDIO_STT_ENGINE: "openai"
|
30 |
+
AUDIO_STT_OPENAI_API_BASE_URL: "http://speaches:8000/v1"
|
31 |
AUDIO_STT_OPENAI_API_KEY: "does-not-matter-what-you-put-but-should-not-be-empty"
|
32 |
AUDIO_STT_MODEL: "Systran/faster-distil-whisper-large-v3"
|
33 |
+
speaches:
|
34 |
+
image: ghcr.io/speaches-ai/speaches:latest-cuda
|
35 |
...
|
36 |
```
|
docs/usage/text-to-speech.md
CHANGED
@@ -2,7 +2,6 @@
|
|
2 |
|
3 |
This feature is not supported on ARM devices, only x86_64. I was unable to build [piper-phonemize](https://github.com/rhasspy/piper-phonemize) (my [fork](https://github.com/fedirz/piper-phonemize))
|
4 |
|
5 |
-
http://localhost:8001/faster-whisper-server/api/
|
6 |
TODO: add a note about automatic downloads
|
7 |
TODO: add a demo
|
8 |
TODO: add a note about tts only running on cpu
|
@@ -19,13 +18,13 @@ Download the piper voices from [HuggingFace model repository](https://huggingfac
|
|
19 |
|
20 |
```bash
|
21 |
# Download all voices (~15 minutes / 7.7 Gbs)
|
22 |
-
docker exec -it
|
23 |
# Download all English voices (~4.5 minutes)
|
24 |
-
docker exec -it
|
25 |
# Download all qualities of a specific voice (~4 seconds)
|
26 |
-
docker exec -it
|
27 |
# Download specific quality of a specific voice (~2 seconds)
|
28 |
-
docker exec -it
|
29 |
```
|
30 |
|
31 |
!!! note
|
|
|
2 |
|
3 |
This feature is not supported on ARM devices, only x86_64. I was unable to build [piper-phonemize](https://github.com/rhasspy/piper-phonemize) (my [fork](https://github.com/fedirz/piper-phonemize))
|
4 |
|
|
|
5 |
TODO: add a note about automatic downloads
|
6 |
TODO: add a demo
|
7 |
TODO: add a note about tts only running on cpu
|
|
|
18 |
|
19 |
```bash
|
20 |
# Download all voices (~15 minutes / 7.7 Gbs)
|
21 |
+
docker exec -it speaches huggingface-cli download rhasspy/piper-voices
|
22 |
# Download all English voices (~4.5 minutes)
|
23 |
+
docker exec -it speaches huggingface-cli download rhasspy/piper-voices --include 'en/**/*' 'voices.json'
|
24 |
# Download all qualities of a specific voice (~4 seconds)
|
25 |
+
docker exec -it speaches huggingface-cli download rhasspy/piper-voices --include 'en/en_US/amy/**/*' 'voices.json'
|
26 |
# Download specific quality of a specific voice (~2 seconds)
|
27 |
+
docker exec -it speaches huggingface-cli download rhasspy/piper-voices --include 'en/en_US/amy/medium/*' 'voices.json'
|
28 |
```
|
29 |
|
30 |
!!! note
|
examples/javascript/index.js
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
/**
|
2 |
-
* Example provided by https://github.com/Gan-Xing in https://github.com/
|
3 |
*/
|
4 |
import 'dotenv/config';
|
5 |
import fs from 'node:fs';
|
|
|
1 |
/**
|
2 |
+
* Example provided by https://github.com/Gan-Xing in https://github.com/speaches-ai/speaches/issues/26
|
3 |
*/
|
4 |
import 'dotenv/config';
|
5 |
import fs from 'node:fs';
|
examples/live-audio/script.sh
CHANGED
@@ -9,10 +9,10 @@ set -e
|
|
9 |
|
10 |
export WHISPER__MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for a faster inference.
|
11 |
|
12 |
-
# Ensure you have `
|
13 |
-
docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL
|
14 |
# or you can run it on a CPU
|
15 |
-
# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL
|
16 |
|
17 |
# `pv` is used to limit the rate at which the audio is streamed to the server. Audio is being streamed at a rate of 32kb/s(16000 sample rate * 16-bit sample / 8 bits per byte = 32000 bytes per second). This emulutes live audio input from a microphone: `ffmpeg -loglevel quiet -f alsa -i default -ac 1 -ar 16000 -f s16le -`
|
18 |
# shellcheck disable=SC2002
|
|
|
9 |
|
10 |
export WHISPER__MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for a faster inference.
|
11 |
|
12 |
+
# Ensure you have `speaches` running. If this is your first time running it expect to wait up-to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
|
13 |
+
docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL ghcr.io/speaches-ai/speaches:latest-cuda
|
14 |
# or you can run it on a CPU
|
15 |
+
# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL ghcr.io/speaches-ai/speaches:latest-cpu
|
16 |
|
17 |
# `pv` is used to limit the rate at which the audio is streamed to the server. Audio is being streamed at a rate of 32kb/s(16000 sample rate * 16-bit sample / 8 bits per byte = 32000 bytes per second). This emulutes live audio input from a microphone: `ffmpeg -loglevel quiet -f alsa -i default -ac 1 -ar 16000 -f s16le -`
|
18 |
# shellcheck disable=SC2002
|
examples/youtube/script.sh
CHANGED
@@ -5,10 +5,10 @@ set -e
|
|
5 |
# NOTE: do not use any distil-* model other than the large ones as they don't work on long audio files for some reason.
|
6 |
export WHISPER__MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for a faster inference.
|
7 |
|
8 |
-
# Ensure you have `
|
9 |
-
docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL
|
10 |
# or you can run it on a CPU
|
11 |
-
# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL
|
12 |
|
13 |
# Download the audio from a YouTube video. In this example I'm downloading "The Evolution of the Operating System" by Asionometry YouTube channel. I highly checking this channel out, the guy produces very high content. If you don't have `youtube-dl`, you'll have to install it. https://github.com/ytdl-org/youtube-dl
|
14 |
youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'
|
|
|
5 |
# NOTE: do not use any distil-* model other than the large ones as they don't work on long audio files for some reason.
|
6 |
export WHISPER__MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for a faster inference.
|
7 |
|
8 |
+
# Ensure you have `speaches` running. If this is your first time running it expect to wait up-to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
|
9 |
+
docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL ghcr.io/speaches-ai/speaches:latest-cuda
|
10 |
# or you can run it on a CPU
|
11 |
+
# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL ghcr.io/speaches-ai/speaches:latest-cpu
|
12 |
|
13 |
# Download the audio from a YouTube video. In this example I'm downloading "The Evolution of the Operating System" by Asionometry YouTube channel. I highly checking this channel out, the guy produces very high content. If you don't have `youtube-dl`, you'll have to install it. https://github.com/ytdl-org/youtube-dl
|
14 |
youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'
|
mkdocs.yml
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
# yaml-language-server: $schema=https://squidfunk.github.io/mkdocs-material/schema.json
|
2 |
# https://www.mkdocs.org/user-guide/configuration/#configuration
|
3 |
-
site_name:
|
4 |
-
site_url: https://
|
5 |
-
repo_url: https://github.com/
|
6 |
edit_uri: edit/master/docs/
|
7 |
docs_dir: docs
|
8 |
theme:
|
|
|
1 |
# yaml-language-server: $schema=https://squidfunk.github.io/mkdocs-material/schema.json
|
2 |
# https://www.mkdocs.org/user-guide/configuration/#configuration
|
3 |
+
site_name: Speaches Documentation
|
4 |
+
site_url: https://speaches-ai.github.io/speaches/
|
5 |
+
repo_url: https://github.com/speaches-ai/speaches/
|
6 |
edit_uri: edit/master/docs/
|
7 |
docs_dir: docs
|
8 |
theme:
|
pyproject.toml
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
[project]
|
2 |
-
name = "
|
3 |
version = "0.1.0"
|
4 |
requires-python = ">=3.12,<3.13"
|
5 |
# https://packaging.python.org/en/latest/specifications/version-specifiers/#id5
|
|
|
1 |
[project]
|
2 |
+
name = "speaches"
|
3 |
version = "0.1.0"
|
4 |
requires-python = ">=3.12,<3.13"
|
5 |
# https://packaging.python.org/en/latest/specifications/version-specifiers/#id5
|
src/{faster_whisper_server β speaches}/__init__.py
RENAMED
File without changes
|
src/{faster_whisper_server β speaches}/api_models.py
RENAMED
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Literal
|
|
4 |
|
5 |
from pydantic import BaseModel, ConfigDict, Field
|
6 |
|
7 |
-
from
|
8 |
|
9 |
if TYPE_CHECKING:
|
10 |
from collections.abc import Iterable
|
@@ -23,7 +23,7 @@ class TranscriptionWord(BaseModel):
|
|
23 |
def from_segments(cls, segments: Iterable[TranscriptionSegment]) -> list[TranscriptionWord]:
|
24 |
words: list[TranscriptionWord] = []
|
25 |
for segment in segments:
|
26 |
-
# NOTE: a temporary "fix" for https://github.com/
|
27 |
# TODO: properly address the issue
|
28 |
assert (
|
29 |
segment.words is not None
|
|
|
4 |
|
5 |
from pydantic import BaseModel, ConfigDict, Field
|
6 |
|
7 |
+
from speaches.text_utils import Transcription, canonicalize_word, segments_to_text
|
8 |
|
9 |
if TYPE_CHECKING:
|
10 |
from collections.abc import Iterable
|
|
|
23 |
def from_segments(cls, segments: Iterable[TranscriptionSegment]) -> list[TranscriptionWord]:
|
24 |
words: list[TranscriptionWord] = []
|
25 |
for segment in segments:
|
26 |
+
# NOTE: a temporary "fix" for https://github.com/speaches-ai/speaches/issues/58.
|
27 |
# TODO: properly address the issue
|
28 |
assert (
|
29 |
segment.words is not None
|
src/{faster_whisper_server β speaches}/asr.py
RENAMED
@@ -5,13 +5,13 @@ import logging
|
|
5 |
import time
|
6 |
from typing import TYPE_CHECKING
|
7 |
|
8 |
-
from
|
9 |
-
from
|
10 |
|
11 |
if TYPE_CHECKING:
|
12 |
from faster_whisper import transcribe
|
13 |
|
14 |
-
from
|
15 |
|
16 |
logger = logging.getLogger(__name__)
|
17 |
|
|
|
5 |
import time
|
6 |
from typing import TYPE_CHECKING
|
7 |
|
8 |
+
from speaches.api_models import TranscriptionSegment, TranscriptionWord
|
9 |
+
from speaches.text_utils import Transcription
|
10 |
|
11 |
if TYPE_CHECKING:
|
12 |
from faster_whisper import transcribe
|
13 |
|
14 |
+
from speaches.audio import Audio
|
15 |
|
16 |
logger = logging.getLogger(__name__)
|
17 |
|
src/{faster_whisper_server β speaches}/audio.py
RENAMED
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, BinaryIO
|
|
7 |
import numpy as np
|
8 |
import soundfile as sf
|
9 |
|
10 |
-
from
|
11 |
|
12 |
if TYPE_CHECKING:
|
13 |
from collections.abc import AsyncGenerator
|
|
|
7 |
import numpy as np
|
8 |
import soundfile as sf
|
9 |
|
10 |
+
from speaches.config import SAMPLES_PER_SECOND
|
11 |
|
12 |
if TYPE_CHECKING:
|
13 |
from collections.abc import AsyncGenerator
|
src/{faster_whisper_server β speaches}/config.py
RENAMED
File without changes
|
src/{faster_whisper_server β speaches}/dependencies.py
RENAMED
@@ -9,8 +9,8 @@ from openai import AsyncOpenAI
|
|
9 |
from openai.resources.audio import AsyncSpeech, AsyncTranscriptions
|
10 |
from openai.resources.chat.completions import AsyncCompletions
|
11 |
|
12 |
-
from
|
13 |
-
from
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
@@ -73,7 +73,7 @@ def get_speech_client() -> AsyncSpeech:
|
|
73 |
config = get_config()
|
74 |
if config.speech_base_url is None:
|
75 |
# this might not work as expected if `speech_router` won't have shared state (access to the same `model_manager`) with the main FastAPI `app`. TODO: verify # noqa: E501
|
76 |
-
from
|
77 |
router as speech_router,
|
78 |
)
|
79 |
|
@@ -94,7 +94,7 @@ def get_transcription_client() -> AsyncTranscriptions:
|
|
94 |
config = get_config()
|
95 |
if config.transcription_base_url is None:
|
96 |
# this might not work as expected if `transcription_router` won't have shared state (access to the same `model_manager`) with the main FastAPI `app`. TODO: verify # noqa: E501
|
97 |
-
from
|
98 |
router as stt_router,
|
99 |
)
|
100 |
|
|
|
9 |
from openai.resources.audio import AsyncSpeech, AsyncTranscriptions
|
10 |
from openai.resources.chat.completions import AsyncCompletions
|
11 |
|
12 |
+
from speaches.config import Config
|
13 |
+
from speaches.model_manager import PiperModelManager, WhisperModelManager
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
|
|
73 |
config = get_config()
|
74 |
if config.speech_base_url is None:
|
75 |
# this might not work as expected if `speech_router` won't have shared state (access to the same `model_manager`) with the main FastAPI `app`. TODO: verify # noqa: E501
|
76 |
+
from speaches.routers.speech import (
|
77 |
router as speech_router,
|
78 |
)
|
79 |
|
|
|
94 |
config = get_config()
|
95 |
if config.transcription_base_url is None:
|
96 |
# this might not work as expected if `transcription_router` won't have shared state (access to the same `model_manager`) with the main FastAPI `app`. TODO: verify # noqa: E501
|
97 |
+
from speaches.routers.stt import (
|
98 |
router as stt_router,
|
99 |
)
|
100 |
|
src/{faster_whisper_server β speaches}/gradio_app.py
RENAMED
@@ -7,8 +7,8 @@ import httpx
|
|
7 |
from httpx_sse import aconnect_sse
|
8 |
from openai import AsyncOpenAI
|
9 |
|
10 |
-
from
|
11 |
-
from
|
12 |
|
13 |
TRANSCRIPTION_ENDPOINT = "/v1/audio/transcriptions"
|
14 |
TRANSLATION_ENDPOINT = "/v1/audio/translations"
|
@@ -128,9 +128,9 @@ def create_gradio_demo(config: Config) -> gr.Blocks: # noqa: C901, PLR0915
|
|
128 |
file.write(audio_bytes)
|
129 |
return file_path
|
130 |
|
131 |
-
with gr.Blocks(title="
|
132 |
gr.Markdown(
|
133 |
-
"### Consider supporting the project by starring the [repository on GitHub](https://github.com/
|
134 |
)
|
135 |
with gr.Tab(label="Transcribe/Translate"):
|
136 |
audio = gr.Audio(type="filepath")
|
@@ -157,7 +157,7 @@ def create_gradio_demo(config: Config) -> gr.Blocks: # noqa: C901, PLR0915
|
|
157 |
|
158 |
with gr.Tab(label="Speech Generation"):
|
159 |
if platform.machine() != "x86_64":
|
160 |
-
from
|
161 |
DEFAULT_VOICE,
|
162 |
MAX_SAMPLE_RATE,
|
163 |
MIN_SAMPLE_RATE,
|
|
|
7 |
from httpx_sse import aconnect_sse
|
8 |
from openai import AsyncOpenAI
|
9 |
|
10 |
+
from speaches.config import Config, Task
|
11 |
+
from speaches.hf_utils import PiperModel
|
12 |
|
13 |
TRANSCRIPTION_ENDPOINT = "/v1/audio/transcriptions"
|
14 |
TRANSLATION_ENDPOINT = "/v1/audio/translations"
|
|
|
128 |
file.write(audio_bytes)
|
129 |
return file_path
|
130 |
|
131 |
+
with gr.Blocks(title="Speaches Playground") as demo:
|
132 |
gr.Markdown(
|
133 |
+
"### Consider supporting the project by starring the [repository on GitHub](https://github.com/speaches-ai/speaches)."
|
134 |
)
|
135 |
with gr.Tab(label="Transcribe/Translate"):
|
136 |
audio = gr.Audio(type="filepath")
|
|
|
157 |
|
158 |
with gr.Tab(label="Speech Generation"):
|
159 |
if platform.machine() != "x86_64":
|
160 |
+
from speaches.routers.speech import (
|
161 |
DEFAULT_VOICE,
|
162 |
MAX_SAMPLE_RATE,
|
163 |
MIN_SAMPLE_RATE,
|
src/{faster_whisper_server β speaches}/hf_utils.py
RENAMED
@@ -10,7 +10,7 @@ import huggingface_hub
|
|
10 |
from huggingface_hub.constants import HF_HUB_CACHE
|
11 |
from pydantic import BaseModel, Field, computed_field
|
12 |
|
13 |
-
from
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
|
|
10 |
from huggingface_hub.constants import HF_HUB_CACHE
|
11 |
from pydantic import BaseModel, Field, computed_field
|
12 |
|
13 |
+
from speaches.api_models import Model
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
src/{faster_whisper_server β speaches}/logger.py
RENAMED
File without changes
|
src/{faster_whisper_server β speaches}/main.py
RENAMED
@@ -10,15 +10,15 @@ from fastapi import (
|
|
10 |
)
|
11 |
from fastapi.middleware.cors import CORSMiddleware
|
12 |
|
13 |
-
from
|
14 |
-
from
|
15 |
-
from
|
16 |
router as misc_router,
|
17 |
)
|
18 |
-
from
|
19 |
router as models_router,
|
20 |
)
|
21 |
-
from
|
22 |
router as stt_router,
|
23 |
)
|
24 |
|
@@ -47,7 +47,7 @@ def create_app() -> FastAPI:
|
|
47 |
logger.debug(f"Config: {config}")
|
48 |
|
49 |
if platform.machine() == "x86_64":
|
50 |
-
from
|
51 |
router as speech_router,
|
52 |
)
|
53 |
else:
|
@@ -86,7 +86,7 @@ def create_app() -> FastAPI:
|
|
86 |
if config.enable_ui:
|
87 |
import gradio as gr
|
88 |
|
89 |
-
from
|
90 |
|
91 |
app = gr.mount_gradio_app(app, create_gradio_demo(config), path="/")
|
92 |
|
|
|
10 |
)
|
11 |
from fastapi.middleware.cors import CORSMiddleware
|
12 |
|
13 |
+
from speaches.dependencies import ApiKeyDependency, get_config, get_model_manager
|
14 |
+
from speaches.logger import setup_logger
|
15 |
+
from speaches.routers.misc import (
|
16 |
router as misc_router,
|
17 |
)
|
18 |
+
from speaches.routers.models import (
|
19 |
router as models_router,
|
20 |
)
|
21 |
+
from speaches.routers.stt import (
|
22 |
router as stt_router,
|
23 |
)
|
24 |
|
|
|
47 |
logger.debug(f"Config: {config}")
|
48 |
|
49 |
if platform.machine() == "x86_64":
|
50 |
+
from speaches.routers.speech import (
|
51 |
router as speech_router,
|
52 |
)
|
53 |
else:
|
|
|
86 |
if config.enable_ui:
|
87 |
import gradio as gr
|
88 |
|
89 |
+
from speaches.gradio_app import create_gradio_demo
|
90 |
|
91 |
app = gr.mount_gradio_app(app, create_gradio_demo(config), path="/")
|
92 |
|
src/{faster_whisper_server β speaches}/model_manager.py
RENAMED
@@ -9,14 +9,14 @@ from typing import TYPE_CHECKING
|
|
9 |
|
10 |
from faster_whisper import WhisperModel
|
11 |
|
12 |
-
from
|
13 |
|
14 |
if TYPE_CHECKING:
|
15 |
from collections.abc import Callable
|
16 |
|
17 |
from piper.voice import PiperVoice
|
18 |
|
19 |
-
from
|
20 |
WhisperConfig,
|
21 |
)
|
22 |
|
|
|
9 |
|
10 |
from faster_whisper import WhisperModel
|
11 |
|
12 |
+
from speaches.hf_utils import get_piper_voice_model_file
|
13 |
|
14 |
if TYPE_CHECKING:
|
15 |
from collections.abc import Callable
|
16 |
|
17 |
from piper.voice import PiperVoice
|
18 |
|
19 |
+
from speaches.config import (
|
20 |
WhisperConfig,
|
21 |
)
|
22 |
|
src/{faster_whisper_server β speaches}/routers/__init__.py
RENAMED
File without changes
|
src/{faster_whisper_server β speaches}/routers/misc.py
RENAMED
@@ -7,8 +7,8 @@ from fastapi import (
|
|
7 |
import huggingface_hub
|
8 |
from huggingface_hub.hf_api import RepositoryNotFoundError
|
9 |
|
10 |
-
from
|
11 |
-
from
|
12 |
|
13 |
router = APIRouter()
|
14 |
|
|
|
7 |
import huggingface_hub
|
8 |
from huggingface_hub.hf_api import RepositoryNotFoundError
|
9 |
|
10 |
+
from speaches import hf_utils
|
11 |
+
from speaches.dependencies import ModelManagerDependency # noqa: TC001
|
12 |
|
13 |
router = APIRouter()
|
14 |
|
src/{faster_whisper_server β speaches}/routers/models.py
RENAMED
@@ -9,11 +9,11 @@ from fastapi import (
|
|
9 |
)
|
10 |
import huggingface_hub
|
11 |
|
12 |
-
from
|
13 |
ListModelsResponse,
|
14 |
Model,
|
15 |
)
|
16 |
-
from
|
17 |
|
18 |
if TYPE_CHECKING:
|
19 |
from huggingface_hub.hf_api import ModelInfo
|
|
|
9 |
)
|
10 |
import huggingface_hub
|
11 |
|
12 |
+
from speaches.api_models import (
|
13 |
ListModelsResponse,
|
14 |
Model,
|
15 |
)
|
16 |
+
from speaches.hf_utils import list_whisper_models
|
17 |
|
18 |
if TYPE_CHECKING:
|
19 |
from huggingface_hub.hf_api import ModelInfo
|
src/{faster_whisper_server β speaches}/routers/speech.py
RENAMED
@@ -11,8 +11,8 @@ from piper.voice import PiperVoice
|
|
11 |
from pydantic import BaseModel, BeforeValidator, Field, ValidationError, model_validator
|
12 |
import soundfile as sf
|
13 |
|
14 |
-
from
|
15 |
-
from
|
16 |
PiperModel,
|
17 |
list_piper_models,
|
18 |
read_piper_voices_config,
|
|
|
11 |
from pydantic import BaseModel, BeforeValidator, Field, ValidationError, model_validator
|
12 |
import soundfile as sf
|
13 |
|
14 |
+
from speaches.dependencies import PiperModelManagerDependency
|
15 |
+
from speaches.hf_utils import (
|
16 |
PiperModel,
|
17 |
list_piper_models,
|
18 |
read_piper_voices_config,
|
src/{faster_whisper_server β speaches}/routers/stt.py
RENAMED
@@ -27,7 +27,7 @@ from numpy import float32
|
|
27 |
from numpy.typing import NDArray
|
28 |
from pydantic import AfterValidator, Field
|
29 |
|
30 |
-
from
|
31 |
DEFAULT_TIMESTAMP_GRANULARITIES,
|
32 |
TIMESTAMP_GRANULARITIES_COMBINATIONS,
|
33 |
CreateTranscriptionResponseJson,
|
@@ -35,17 +35,17 @@ from faster_whisper_server.api_models import (
|
|
35 |
TimestampGranularities,
|
36 |
TranscriptionSegment,
|
37 |
)
|
38 |
-
from
|
39 |
-
from
|
40 |
-
from
|
41 |
SAMPLES_PER_SECOND,
|
42 |
Language,
|
43 |
ResponseFormat,
|
44 |
Task,
|
45 |
)
|
46 |
-
from
|
47 |
-
from
|
48 |
-
from
|
49 |
|
50 |
if TYPE_CHECKING:
|
51 |
from collections.abc import Generator, Iterable
|
@@ -77,7 +77,7 @@ def audio_file_dependency(
|
|
77 |
) from e
|
78 |
except Exception as e:
|
79 |
logger.exception(
|
80 |
-
"Failed to decode audio. This is likely a bug. Please create an issue at https://github.com/
|
81 |
)
|
82 |
raise HTTPException(status_code=500, detail="Failed to decode audio.") from e
|
83 |
else:
|
|
|
27 |
from numpy.typing import NDArray
|
28 |
from pydantic import AfterValidator, Field
|
29 |
|
30 |
+
from speaches.api_models import (
|
31 |
DEFAULT_TIMESTAMP_GRANULARITIES,
|
32 |
TIMESTAMP_GRANULARITIES_COMBINATIONS,
|
33 |
CreateTranscriptionResponseJson,
|
|
|
35 |
TimestampGranularities,
|
36 |
TranscriptionSegment,
|
37 |
)
|
38 |
+
from speaches.asr import FasterWhisperASR
|
39 |
+
from speaches.audio import AudioStream, audio_samples_from_file
|
40 |
+
from speaches.config import (
|
41 |
SAMPLES_PER_SECOND,
|
42 |
Language,
|
43 |
ResponseFormat,
|
44 |
Task,
|
45 |
)
|
46 |
+
from speaches.dependencies import ConfigDependency, ModelManagerDependency, get_config
|
47 |
+
from speaches.text_utils import segments_to_srt, segments_to_text, segments_to_vtt
|
48 |
+
from speaches.transcriber import audio_transcriber
|
49 |
|
50 |
if TYPE_CHECKING:
|
51 |
from collections.abc import Generator, Iterable
|
|
|
77 |
) from e
|
78 |
except Exception as e:
|
79 |
logger.exception(
|
80 |
+
"Failed to decode audio. This is likely a bug. Please create an issue at https://github.com/speaches-ai/speaches/issues/new."
|
81 |
)
|
82 |
raise HTTPException(status_code=500, detail="Failed to decode audio.") from e
|
83 |
else:
|
src/{faster_whisper_server β speaches}/text_utils.py
RENAMED
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING
|
|
6 |
if TYPE_CHECKING:
|
7 |
from collections.abc import Iterable
|
8 |
|
9 |
-
from
|
10 |
|
11 |
|
12 |
class Transcription:
|
@@ -38,7 +38,7 @@ class Transcription:
|
|
38 |
self.words.extend(words)
|
39 |
|
40 |
def _ensure_no_word_overlap(self, words: list[TranscriptionWord]) -> None:
|
41 |
-
from
|
42 |
|
43 |
config = get_config() # HACK
|
44 |
if len(self.words) > 0 and len(words) > 0:
|
|
|
6 |
if TYPE_CHECKING:
|
7 |
from collections.abc import Iterable
|
8 |
|
9 |
+
from speaches.api_models import TranscriptionSegment, TranscriptionWord
|
10 |
|
11 |
|
12 |
class Transcription:
|
|
|
38 |
self.words.extend(words)
|
39 |
|
40 |
def _ensure_no_word_overlap(self, words: list[TranscriptionWord]) -> None:
|
41 |
+
from speaches.dependencies import get_config # HACK: avoid circular import
|
42 |
|
43 |
config = get_config() # HACK
|
44 |
if len(self.words) > 0 and len(words) > 0:
|
src/{faster_whisper_server β speaches}/text_utils_test.py
RENAMED
@@ -1,5 +1,5 @@
|
|
1 |
-
from
|
2 |
-
from
|
3 |
canonicalize_word,
|
4 |
common_prefix,
|
5 |
is_eos,
|
|
|
1 |
+
from speaches.api_models import TranscriptionWord
|
2 |
+
from speaches.text_utils import (
|
3 |
canonicalize_word,
|
4 |
common_prefix,
|
5 |
is_eos,
|
src/{faster_whisper_server β speaches}/transcriber.py
RENAMED
@@ -3,14 +3,14 @@ from __future__ import annotations
|
|
3 |
import logging
|
4 |
from typing import TYPE_CHECKING
|
5 |
|
6 |
-
from
|
7 |
-
from
|
8 |
|
9 |
if TYPE_CHECKING:
|
10 |
from collections.abc import AsyncGenerator
|
11 |
|
12 |
-
from
|
13 |
-
from
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
|
|
3 |
import logging
|
4 |
from typing import TYPE_CHECKING
|
5 |
|
6 |
+
from speaches.audio import Audio, AudioStream
|
7 |
+
from speaches.text_utils import Transcription, common_prefix, to_full_sentences, word_to_text
|
8 |
|
9 |
if TYPE_CHECKING:
|
10 |
from collections.abc import AsyncGenerator
|
11 |
|
12 |
+
from speaches.api_models import TranscriptionWord
|
13 |
+
from speaches.asr import FasterWhisperASR
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
tests/api_timestamp_granularities_test.py
CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
|
|
5 |
from openai import AsyncOpenAI
|
6 |
import pytest
|
7 |
|
8 |
-
from
|
9 |
|
10 |
|
11 |
@pytest.mark.asyncio
|
|
|
5 |
from openai import AsyncOpenAI
|
6 |
import pytest
|
7 |
|
8 |
+
from speaches.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
|
9 |
|
10 |
|
11 |
@pytest.mark.asyncio
|
tests/conftest.py
CHANGED
@@ -12,9 +12,9 @@ import pytest
|
|
12 |
import pytest_asyncio
|
13 |
from pytest_mock import MockerFixture
|
14 |
|
15 |
-
from
|
16 |
-
from
|
17 |
-
from
|
18 |
|
19 |
DISABLE_LOGGERS = ["multipart.multipart", "faster_whisper"]
|
20 |
OPENAI_BASE_URL = "https://api.openai.com/v1"
|
@@ -54,11 +54,11 @@ async def aclient_factory(mocker: MockerFixture) -> AclientFactory:
|
|
54 |
@asynccontextmanager
|
55 |
async def inner(config: Config = DEFAULT_CONFIG) -> AsyncGenerator[AsyncClient, None]:
|
56 |
# NOTE: all calls to `get_config` should be patched. One way to test that this works is to update the original `get_config` to raise an exception and see if the tests fail # noqa: E501
|
57 |
-
mocker.patch("
|
58 |
-
mocker.patch("
|
59 |
# NOTE: I couldn't get the following to work but it shouldn't matter
|
60 |
# mocker.patch(
|
61 |
-
# "
|
62 |
# )
|
63 |
|
64 |
app = create_app()
|
|
|
12 |
import pytest_asyncio
|
13 |
from pytest_mock import MockerFixture
|
14 |
|
15 |
+
from speaches.config import Config, WhisperConfig
|
16 |
+
from speaches.dependencies import get_config
|
17 |
+
from speaches.main import create_app
|
18 |
|
19 |
DISABLE_LOGGERS = ["multipart.multipart", "faster_whisper"]
|
20 |
OPENAI_BASE_URL = "https://api.openai.com/v1"
|
|
|
54 |
@asynccontextmanager
|
55 |
async def inner(config: Config = DEFAULT_CONFIG) -> AsyncGenerator[AsyncClient, None]:
|
56 |
# NOTE: all calls to `get_config` should be patched. One way to test that this works is to update the original `get_config` to raise an exception and see if the tests fail # noqa: E501
|
57 |
+
mocker.patch("speaches.dependencies.get_config", return_value=config)
|
58 |
+
mocker.patch("speaches.main.get_config", return_value=config)
|
59 |
# NOTE: I couldn't get the following to work but it shouldn't matter
|
60 |
# mocker.patch(
|
61 |
+
# "speaches.text_utils.Transcription._ensure_no_word_overlap.get_config", return_value=config
|
62 |
# )
|
63 |
|
64 |
app = create_app()
|
tests/model_manager_test.py
CHANGED
@@ -3,7 +3,7 @@ import asyncio
|
|
3 |
import anyio
|
4 |
import pytest
|
5 |
|
6 |
-
from
|
7 |
from tests.conftest import DEFAULT_WHISPER_MODEL, AclientFactory
|
8 |
|
9 |
MODEL = DEFAULT_WHISPER_MODEL # just to make the test more readable
|
|
|
3 |
import anyio
|
4 |
import pytest
|
5 |
|
6 |
+
from speaches.config import Config, WhisperConfig
|
7 |
from tests.conftest import DEFAULT_WHISPER_MODEL, AclientFactory
|
8 |
|
9 |
MODEL = DEFAULT_WHISPER_MODEL # just to make the test more readable
|
tests/openai_timestamp_granularities_test.py
CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
|
|
5 |
from openai import AsyncOpenAI, BadRequestError
|
6 |
import pytest
|
7 |
|
8 |
-
from
|
9 |
|
10 |
|
11 |
@pytest.mark.asyncio
|
|
|
5 |
from openai import AsyncOpenAI, BadRequestError
|
6 |
import pytest
|
7 |
|
8 |
+
from speaches.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
|
9 |
|
10 |
|
11 |
@pytest.mark.asyncio
|
tests/speech_test.py
CHANGED
@@ -9,7 +9,7 @@ platform_machine = platform.machine()
|
|
9 |
if platform_machine != "x86_64":
|
10 |
pytest.skip("Only supported on x86_64", allow_module_level=True)
|
11 |
|
12 |
-
from
|
13 |
DEFAULT_MODEL,
|
14 |
DEFAULT_RESPONSE_FORMAT,
|
15 |
DEFAULT_VOICE,
|
|
|
9 |
if platform_machine != "x86_64":
|
10 |
pytest.skip("Only supported on x86_64", allow_module_level=True)
|
11 |
|
12 |
+
from speaches.routers.speech import ( # noqa: E402
|
13 |
DEFAULT_MODEL,
|
14 |
DEFAULT_RESPONSE_FORMAT,
|
15 |
DEFAULT_VOICE,
|
tests/sse_test.py
CHANGED
@@ -9,7 +9,7 @@ import srt
|
|
9 |
import webvtt
|
10 |
import webvtt.vtt
|
11 |
|
12 |
-
from
|
13 |
CreateTranscriptionResponseJson,
|
14 |
CreateTranscriptionResponseVerboseJson,
|
15 |
)
|
|
|
9 |
import webvtt
|
10 |
import webvtt.vtt
|
11 |
|
12 |
+
from speaches.api_models import (
|
13 |
CreateTranscriptionResponseJson,
|
14 |
CreateTranscriptionResponseVerboseJson,
|
15 |
)
|
uv.lock
CHANGED
@@ -266,115 +266,6 @@ wheels = [
|
|
266 |
{ url = "https://files.pythonhosted.org/packages/7b/03/ab118cb743dcf671da01ad0cfd7564465dda115db32976fdc95e21ce8feb/faster_whisper-1.1.0-py3-none-any.whl", hash = "sha256:0f2d025676bbff1e46c4108b6f9a82578d6e33826c174af2990e45b33fab6182", size = 1118168 },
|
267 |
]
|
268 |
|
269 |
-
[[package]]
|
270 |
-
name = "faster-whisper-server"
|
271 |
-
version = "0.1.0"
|
272 |
-
source = { editable = "." }
|
273 |
-
dependencies = [
|
274 |
-
{ name = "ctranslate2" },
|
275 |
-
{ name = "fastapi" },
|
276 |
-
{ name = "faster-whisper" },
|
277 |
-
{ name = "huggingface-hub", extra = ["hf-transfer"] },
|
278 |
-
{ name = "numpy" },
|
279 |
-
{ name = "piper-phonemize", marker = "platform_machine == 'x86_64'" },
|
280 |
-
{ name = "piper-tts", marker = "platform_machine == 'x86_64'" },
|
281 |
-
{ name = "pydantic" },
|
282 |
-
{ name = "pydantic-settings" },
|
283 |
-
{ name = "python-multipart" },
|
284 |
-
{ name = "sounddevice" },
|
285 |
-
{ name = "soundfile" },
|
286 |
-
{ name = "uvicorn" },
|
287 |
-
]
|
288 |
-
|
289 |
-
[package.optional-dependencies]
|
290 |
-
client = [
|
291 |
-
{ name = "keyboard" },
|
292 |
-
]
|
293 |
-
dev = [
|
294 |
-
{ name = "anyio" },
|
295 |
-
{ name = "basedpyright" },
|
296 |
-
{ name = "mdx-truly-sane-lists" },
|
297 |
-
{ name = "mkdocs-material" },
|
298 |
-
{ name = "mkdocs-render-swagger-plugin" },
|
299 |
-
{ name = "mkdocstrings", extra = ["python"] },
|
300 |
-
{ name = "pre-commit" },
|
301 |
-
{ name = "pytest" },
|
302 |
-
{ name = "pytest-antilru" },
|
303 |
-
{ name = "pytest-asyncio" },
|
304 |
-
{ name = "pytest-mock" },
|
305 |
-
{ name = "pytest-xdist" },
|
306 |
-
{ name = "ruff" },
|
307 |
-
{ name = "srt" },
|
308 |
-
{ name = "webvtt-py" },
|
309 |
-
]
|
310 |
-
opentelemetry = [
|
311 |
-
{ name = "opentelemetry-distro" },
|
312 |
-
{ name = "opentelemetry-exporter-otlp" },
|
313 |
-
{ name = "opentelemetry-instrumentation-asyncio" },
|
314 |
-
{ name = "opentelemetry-instrumentation-fastapi" },
|
315 |
-
{ name = "opentelemetry-instrumentation-grpc" },
|
316 |
-
{ name = "opentelemetry-instrumentation-httpx" },
|
317 |
-
{ name = "opentelemetry-instrumentation-logging" },
|
318 |
-
{ name = "opentelemetry-instrumentation-requests" },
|
319 |
-
{ name = "opentelemetry-instrumentation-threading" },
|
320 |
-
{ name = "opentelemetry-instrumentation-urllib" },
|
321 |
-
{ name = "opentelemetry-instrumentation-urllib3" },
|
322 |
-
]
|
323 |
-
ui = [
|
324 |
-
{ name = "gradio" },
|
325 |
-
{ name = "httpx" },
|
326 |
-
{ name = "httpx-sse" },
|
327 |
-
{ name = "openai" },
|
328 |
-
]
|
329 |
-
|
330 |
-
[package.metadata]
|
331 |
-
requires-dist = [
|
332 |
-
{ name = "anyio", marker = "extra == 'dev'", specifier = ">=4.4.0" },
|
333 |
-
{ name = "basedpyright", marker = "extra == 'dev'", specifier = ">=1.18.0" },
|
334 |
-
{ name = "ctranslate2", specifier = ">=4.5.0" },
|
335 |
-
{ name = "fastapi", specifier = ">=0.115.0" },
|
336 |
-
{ name = "faster-whisper", specifier = ">=1.1.0" },
|
337 |
-
{ name = "gradio", marker = "extra == 'ui'", specifier = ">=5.0.2" },
|
338 |
-
{ name = "httpx", marker = "extra == 'ui'", specifier = ">=0.27.2" },
|
339 |
-
{ name = "httpx-sse", marker = "extra == 'ui'", specifier = ">=0.4.0" },
|
340 |
-
{ name = "huggingface-hub", extras = ["hf-transfer"], specifier = ">=0.25.1" },
|
341 |
-
{ name = "keyboard", marker = "extra == 'client'", specifier = ">=0.13.5" },
|
342 |
-
{ name = "mdx-truly-sane-lists", marker = "extra == 'dev'", specifier = ">=1.3" },
|
343 |
-
{ name = "mkdocs-material", marker = "extra == 'dev'", specifier = ">=9.5.39" },
|
344 |
-
{ name = "mkdocs-render-swagger-plugin", marker = "extra == 'dev'", specifier = ">=0.1.2" },
|
345 |
-
{ name = "mkdocstrings", extras = ["python"], marker = "extra == 'dev'", specifier = ">=0.26.1" },
|
346 |
-
{ name = "numpy", specifier = ">=2.1.1" },
|
347 |
-
{ name = "openai", marker = "extra == 'ui'", specifier = ">=1.48.0" },
|
348 |
-
{ name = "opentelemetry-distro", marker = "extra == 'opentelemetry'", specifier = ">=0.48b0" },
|
349 |
-
{ name = "opentelemetry-exporter-otlp", marker = "extra == 'opentelemetry'", specifier = ">=1.27.0" },
|
350 |
-
{ name = "opentelemetry-instrumentation-asyncio", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
351 |
-
{ name = "opentelemetry-instrumentation-fastapi", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
352 |
-
{ name = "opentelemetry-instrumentation-grpc", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
353 |
-
{ name = "opentelemetry-instrumentation-httpx", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
354 |
-
{ name = "opentelemetry-instrumentation-logging", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
355 |
-
{ name = "opentelemetry-instrumentation-requests", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
356 |
-
{ name = "opentelemetry-instrumentation-threading", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
357 |
-
{ name = "opentelemetry-instrumentation-urllib", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
358 |
-
{ name = "opentelemetry-instrumentation-urllib3", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
359 |
-
{ name = "piper-phonemize", marker = "platform_machine == 'x86_64'", url = "https://github.com/fedirz/piper-phonemize/raw/refs/heads/master/dist/piper_phonemize-1.2.0-cp312-cp312-manylinux_2_28_x86_64.whl" },
|
360 |
-
{ name = "piper-tts", marker = "platform_machine == 'x86_64'", specifier = ">=1.2.0" },
|
361 |
-
{ name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.1" },
|
362 |
-
{ name = "pydantic", specifier = ">=2.9.0" },
|
363 |
-
{ name = "pydantic-settings", specifier = ">=2.5.2" },
|
364 |
-
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.3" },
|
365 |
-
{ name = "pytest-antilru", marker = "extra == 'dev'", specifier = ">=2.0.0" },
|
366 |
-
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
|
367 |
-
{ name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" },
|
368 |
-
{ name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.6.1" },
|
369 |
-
{ name = "python-multipart", specifier = ">=0.0.10" },
|
370 |
-
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.7.1" },
|
371 |
-
{ name = "sounddevice", specifier = ">=0.5.1" },
|
372 |
-
{ name = "soundfile", specifier = ">=0.12.1" },
|
373 |
-
{ name = "srt", marker = "extra == 'dev'", specifier = ">=3.5.3" },
|
374 |
-
{ name = "uvicorn", specifier = ">=0.30.6" },
|
375 |
-
{ name = "webvtt-py", marker = "extra == 'dev'", specifier = ">=0.5.1" },
|
376 |
-
]
|
377 |
-
|
378 |
[[package]]
|
379 |
name = "ffmpy"
|
380 |
version = "0.4.0"
|
@@ -4241,6 +4132,115 @@ wheels = [
|
|
4241 |
{ url = "https://files.pythonhosted.org/packages/50/ff/26a4ee48d0b66625a4e4028a055b9f25bc9d7c7b2d17d21a45137621a50d/soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77", size = 1009109 },
|
4242 |
]
|
4243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4244 |
[[package]]
|
4245 |
name = "srt"
|
4246 |
version = "3.5.3"
|
|
|
266 |
{ url = "https://files.pythonhosted.org/packages/7b/03/ab118cb743dcf671da01ad0cfd7564465dda115db32976fdc95e21ce8feb/faster_whisper-1.1.0-py3-none-any.whl", hash = "sha256:0f2d025676bbff1e46c4108b6f9a82578d6e33826c174af2990e45b33fab6182", size = 1118168 },
|
267 |
]
|
268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
[[package]]
|
270 |
name = "ffmpy"
|
271 |
version = "0.4.0"
|
|
|
4132 |
{ url = "https://files.pythonhosted.org/packages/50/ff/26a4ee48d0b66625a4e4028a055b9f25bc9d7c7b2d17d21a45137621a50d/soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77", size = 1009109 },
|
4133 |
]
|
4134 |
|
4135 |
+
[[package]]
|
4136 |
+
name = "speaches"
|
4137 |
+
version = "0.1.0"
|
4138 |
+
source = { editable = "." }
|
4139 |
+
dependencies = [
|
4140 |
+
{ name = "ctranslate2" },
|
4141 |
+
{ name = "fastapi" },
|
4142 |
+
{ name = "faster-whisper" },
|
4143 |
+
{ name = "huggingface-hub", extra = ["hf-transfer"] },
|
4144 |
+
{ name = "numpy" },
|
4145 |
+
{ name = "piper-phonemize", marker = "platform_machine == 'x86_64'" },
|
4146 |
+
{ name = "piper-tts", marker = "platform_machine == 'x86_64'" },
|
4147 |
+
{ name = "pydantic" },
|
4148 |
+
{ name = "pydantic-settings" },
|
4149 |
+
{ name = "python-multipart" },
|
4150 |
+
{ name = "sounddevice" },
|
4151 |
+
{ name = "soundfile" },
|
4152 |
+
{ name = "uvicorn" },
|
4153 |
+
]
|
4154 |
+
|
4155 |
+
[package.optional-dependencies]
|
4156 |
+
client = [
|
4157 |
+
{ name = "keyboard" },
|
4158 |
+
]
|
4159 |
+
dev = [
|
4160 |
+
{ name = "anyio" },
|
4161 |
+
{ name = "basedpyright" },
|
4162 |
+
{ name = "mdx-truly-sane-lists" },
|
4163 |
+
{ name = "mkdocs-material" },
|
4164 |
+
{ name = "mkdocs-render-swagger-plugin" },
|
4165 |
+
{ name = "mkdocstrings", extra = ["python"] },
|
4166 |
+
{ name = "pre-commit" },
|
4167 |
+
{ name = "pytest" },
|
4168 |
+
{ name = "pytest-antilru" },
|
4169 |
+
{ name = "pytest-asyncio" },
|
4170 |
+
{ name = "pytest-mock" },
|
4171 |
+
{ name = "pytest-xdist" },
|
4172 |
+
{ name = "ruff" },
|
4173 |
+
{ name = "srt" },
|
4174 |
+
{ name = "webvtt-py" },
|
4175 |
+
]
|
4176 |
+
opentelemetry = [
|
4177 |
+
{ name = "opentelemetry-distro" },
|
4178 |
+
{ name = "opentelemetry-exporter-otlp" },
|
4179 |
+
{ name = "opentelemetry-instrumentation-asyncio" },
|
4180 |
+
{ name = "opentelemetry-instrumentation-fastapi" },
|
4181 |
+
{ name = "opentelemetry-instrumentation-grpc" },
|
4182 |
+
{ name = "opentelemetry-instrumentation-httpx" },
|
4183 |
+
{ name = "opentelemetry-instrumentation-logging" },
|
4184 |
+
{ name = "opentelemetry-instrumentation-requests" },
|
4185 |
+
{ name = "opentelemetry-instrumentation-threading" },
|
4186 |
+
{ name = "opentelemetry-instrumentation-urllib" },
|
4187 |
+
{ name = "opentelemetry-instrumentation-urllib3" },
|
4188 |
+
]
|
4189 |
+
ui = [
|
4190 |
+
{ name = "gradio" },
|
4191 |
+
{ name = "httpx" },
|
4192 |
+
{ name = "httpx-sse" },
|
4193 |
+
{ name = "openai" },
|
4194 |
+
]
|
4195 |
+
|
4196 |
+
[package.metadata]
|
4197 |
+
requires-dist = [
|
4198 |
+
{ name = "anyio", marker = "extra == 'dev'", specifier = ">=4.4.0" },
|
4199 |
+
{ name = "basedpyright", marker = "extra == 'dev'", specifier = ">=1.18.0" },
|
4200 |
+
{ name = "ctranslate2", specifier = ">=4.5.0" },
|
4201 |
+
{ name = "fastapi", specifier = ">=0.115.0" },
|
4202 |
+
{ name = "faster-whisper", specifier = ">=1.1.0" },
|
4203 |
+
{ name = "gradio", marker = "extra == 'ui'", specifier = ">=5.0.2" },
|
4204 |
+
{ name = "httpx", marker = "extra == 'ui'", specifier = ">=0.27.2" },
|
4205 |
+
{ name = "httpx-sse", marker = "extra == 'ui'", specifier = ">=0.4.0" },
|
4206 |
+
{ name = "huggingface-hub", extras = ["hf-transfer"], specifier = ">=0.25.1" },
|
4207 |
+
{ name = "keyboard", marker = "extra == 'client'", specifier = ">=0.13.5" },
|
4208 |
+
{ name = "mdx-truly-sane-lists", marker = "extra == 'dev'", specifier = ">=1.3" },
|
4209 |
+
{ name = "mkdocs-material", marker = "extra == 'dev'", specifier = ">=9.5.39" },
|
4210 |
+
{ name = "mkdocs-render-swagger-plugin", marker = "extra == 'dev'", specifier = ">=0.1.2" },
|
4211 |
+
{ name = "mkdocstrings", extras = ["python"], marker = "extra == 'dev'", specifier = ">=0.26.1" },
|
4212 |
+
{ name = "numpy", specifier = ">=2.1.1" },
|
4213 |
+
{ name = "openai", marker = "extra == 'ui'", specifier = ">=1.48.0" },
|
4214 |
+
{ name = "opentelemetry-distro", marker = "extra == 'opentelemetry'", specifier = ">=0.48b0" },
|
4215 |
+
{ name = "opentelemetry-exporter-otlp", marker = "extra == 'opentelemetry'", specifier = ">=1.27.0" },
|
4216 |
+
{ name = "opentelemetry-instrumentation-asyncio", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
4217 |
+
{ name = "opentelemetry-instrumentation-fastapi", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
4218 |
+
{ name = "opentelemetry-instrumentation-grpc", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
4219 |
+
{ name = "opentelemetry-instrumentation-httpx", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
4220 |
+
{ name = "opentelemetry-instrumentation-logging", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
4221 |
+
{ name = "opentelemetry-instrumentation-requests", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
4222 |
+
{ name = "opentelemetry-instrumentation-threading", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
4223 |
+
{ name = "opentelemetry-instrumentation-urllib", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
4224 |
+
{ name = "opentelemetry-instrumentation-urllib3", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
|
4225 |
+
{ name = "piper-phonemize", marker = "platform_machine == 'x86_64'", url = "https://github.com/fedirz/piper-phonemize/raw/refs/heads/master/dist/piper_phonemize-1.2.0-cp312-cp312-manylinux_2_28_x86_64.whl" },
|
4226 |
+
{ name = "piper-tts", marker = "platform_machine == 'x86_64'", specifier = ">=1.2.0" },
|
4227 |
+
{ name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.1" },
|
4228 |
+
{ name = "pydantic", specifier = ">=2.9.0" },
|
4229 |
+
{ name = "pydantic-settings", specifier = ">=2.5.2" },
|
4230 |
+
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.3" },
|
4231 |
+
{ name = "pytest-antilru", marker = "extra == 'dev'", specifier = ">=2.0.0" },
|
4232 |
+
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
|
4233 |
+
{ name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" },
|
4234 |
+
{ name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.6.1" },
|
4235 |
+
{ name = "python-multipart", specifier = ">=0.0.10" },
|
4236 |
+
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.7.1" },
|
4237 |
+
{ name = "sounddevice", specifier = ">=0.5.1" },
|
4238 |
+
{ name = "soundfile", specifier = ">=0.12.1" },
|
4239 |
+
{ name = "srt", marker = "extra == 'dev'", specifier = ">=3.5.3" },
|
4240 |
+
{ name = "uvicorn", specifier = ">=0.30.6" },
|
4241 |
+
{ name = "webvtt-py", marker = "extra == 'dev'", specifier = ">=0.5.1" },
|
4242 |
+
]
|
4243 |
+
|
4244 |
[[package]]
|
4245 |
name = "srt"
|
4246 |
version = "3.5.3"
|