Fedir Zadniprovskyi committed on
Commit 43cc67a · 1 Parent(s): 9922993

rename to `speaches`

Files changed (45)
  1. Dockerfile +4 -4
  2. README.md +13 -9
  3. Taskfile.yaml +2 -2
  4. compose.cpu.yaml +3 -3
  5. compose.cuda-cdi.yaml +2 -2
  6. compose.cuda.yaml +3 -3
  7. compose.observability.yaml +1 -1
  8. compose.yaml +2 -2
  9. docs/configuration.md +2 -2
  10. docs/installation.md +19 -19
  11. docs/introduction.md +5 -4
  12. docs/openapi.json +1 -1
  13. docs/usage/open-webui-integration.md +4 -4
  14. docs/usage/text-to-speech.md +4 -5
  15. examples/javascript/index.js +1 -1
  16. examples/live-audio/script.sh +3 -3
  17. examples/youtube/script.sh +3 -3
  18. mkdocs.yml +3 -3
  19. pyproject.toml +1 -1
  20. src/{faster_whisper_server → speaches}/__init__.py +0 -0
  21. src/{faster_whisper_server → speaches}/api_models.py +2 -2
  22. src/{faster_whisper_server → speaches}/asr.py +3 -3
  23. src/{faster_whisper_server → speaches}/audio.py +1 -1
  24. src/{faster_whisper_server → speaches}/config.py +0 -0
  25. src/{faster_whisper_server → speaches}/dependencies.py +4 -4
  26. src/{faster_whisper_server → speaches}/gradio_app.py +5 -5
  27. src/{faster_whisper_server → speaches}/hf_utils.py +1 -1
  28. src/{faster_whisper_server → speaches}/logger.py +0 -0
  29. src/{faster_whisper_server → speaches}/main.py +7 -7
  30. src/{faster_whisper_server → speaches}/model_manager.py +2 -2
  31. src/{faster_whisper_server → speaches}/routers/__init__.py +0 -0
  32. src/{faster_whisper_server → speaches}/routers/misc.py +2 -2
  33. src/{faster_whisper_server → speaches}/routers/models.py +2 -2
  34. src/{faster_whisper_server → speaches}/routers/speech.py +2 -2
  35. src/{faster_whisper_server → speaches}/routers/stt.py +8 -8
  36. src/{faster_whisper_server → speaches}/text_utils.py +2 -2
  37. src/{faster_whisper_server → speaches}/text_utils_test.py +2 -2
  38. src/{faster_whisper_server → speaches}/transcriber.py +4 -4
  39. tests/api_timestamp_granularities_test.py +1 -1
  40. tests/conftest.py +6 -6
  41. tests/model_manager_test.py +1 -1
  42. tests/openai_timestamp_granularities_test.py +1 -1
  43. tests/speech_test.py +1 -1
  44. tests/sse_test.py +1 -1
  45. uv.lock +109 -109
Dockerfile CHANGED
@@ -1,7 +1,7 @@
 ARG BASE_IMAGE=nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04
 # hadolint ignore=DL3006
 FROM ${BASE_IMAGE}
-LABEL org.opencontainers.image.source="https://github.com/fedirz/faster-whisper-server"
+LABEL org.opencontainers.image.source="https://github.com/speaches-ai/speaches"
 LABEL org.opencontainers.image.licenses="MIT"
 # `ffmpeg` is installed because without it `gradio` won't work with mp3(possible others as well) files
 # hadolint ignore=DL3008
@@ -15,7 +15,7 @@ RUN apt-get update && \
 USER ubuntu
 ENV HOME=/home/ubuntu \
     PATH=/home/ubuntu/.local/bin:$PATH
-WORKDIR $HOME/faster-whisper-server
+WORKDIR $HOME/speaches
 # https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
 COPY --chown=ubuntu --from=ghcr.io/astral-sh/uv:0.5.14 /uv /bin/uv
 # https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
@@ -35,7 +35,7 @@ RUN mkdir -p $HOME/.cache/huggingface/hub
 ENV WHISPER__MODEL=Systran/faster-whisper-large-v3
 ENV UVICORN_HOST=0.0.0.0
 ENV UVICORN_PORT=8000
-ENV PATH="$HOME/faster-whisper-server/.venv/bin:$PATH"
+ENV PATH="$HOME/speaches/.venv/bin:$PATH"
 # https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhubenablehftransfer
 # NOTE: I've disabled this because it doesn't inside of Docker container. I couldn't pinpoint the exact reason. This doesn't happen when running the server locally.
 # RuntimeError: An error occurred while downloading using `hf_transfer`. Consider disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling.
@@ -44,4 +44,4 @@ ENV HF_HUB_ENABLE_HF_TRANSFER=0
 # https://www.reddit.com/r/StableDiffusion/comments/1f6asvd/gradio_sends_ip_address_telemetry_by_default/
 ENV DO_NOT_TRACK=1
 EXPOSE 8000
-CMD ["uvicorn", "--factory", "faster_whisper_server.main:create_app"]
+CMD ["uvicorn", "--factory", "speaches.main:create_app"]
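The renamed `CMD` module path is what the published images boot. A minimal way to sanity-check the renamed image and entry point (a sketch; assumes the CPU image is published to GHCR and port 8000 is free locally):

```bash
# Start the renamed CPU image and probe the health endpoint documented in the OpenAPI spec
docker run --rm --detach --publish 8000:8000 --name speaches ghcr.io/speaches-ai/speaches:latest-cpu
curl http://localhost:8000/health
```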
README.md CHANGED
@@ -1,11 +1,15 @@
-# Faster Whisper Server
+> [!NOTE]
+> This project was previously named `faster-whisper-server`. I've decided to change the name from `faster-whisper-server`, as the project has evolved to support more than just transcription.
+
+# Speaches
 
-`faster-whisper-server` is an OpenAI API-compatible transcription server which uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) as its backend.
+`speaches` is an OpenAI API-compatible server supporting transcription, translation, and speech generation. For transcription/translation it uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and for text-to-speech [piper](https://github.com/rhasspy/piper) is used.
+
 Features:
 
 - GPU and CPU support.
 - Easily deployable using Docker.
-- **Configurable through environment variables (see [config.py](./src/faster_whisper_server/config.py))**.
+- **Configurable through environment variables (see [config.py](./src/speaches/config.py))**.
 - OpenAI API compatible.
 - Streaming support (transcription is sent via [SSE](https://en.wikipedia.org/wiki/Server-sent_events) as the audio is transcribed. You don't need to wait for the audio to fully be transcribed before receiving it).
 - Live transcription support (audio is sent via websocket as it's generated).
@@ -18,7 +22,7 @@ Please create an issue if you find a bug, have a question, or a feature suggesti
 See [OpenAI API reference](https://platform.openai.com/docs/api-reference/audio) for more information.
 
 - Audio file transcription via `POST /v1/audio/transcriptions` endpoint.
-  - Unlike OpenAI's API, `faster-whisper-server` also supports streaming transcriptions (and translations). This is useful for when you want to process large audio files and would rather receive the transcription in chunks as they are processed, rather than waiting for the whole file to be transcribed. It works similarly to chat messages when chatting with LLMs.
+  - Unlike OpenAI's API, `speaches` also supports streaming transcriptions (and translations). This is useful for when you want to process large audio files and would rather receive the transcription in chunks as they are processed, rather than waiting for the whole file to be transcribed. It works similarly to chat messages when chatting with LLMs.
 - Audio file translation via `POST /v1/audio/translations` endpoint.
 - Live audio transcription via `WS /v1/audio/transcriptions` endpoint.
   - LocalAgreement2 ([paper](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) | [original implementation](https://github.com/ufal/whisper_streaming)) algorithm is used for live transcription.
@@ -35,13 +39,13 @@ See [OpenAI API reference](https://platform.openai.com/docs/api-reference/audio)
 NOTE: I'm using newer Docker Compsose features. If you are using an older version of Docker Compose, you may need need to update.
 
 ```bash
-curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.yaml
+curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.yaml
 
 # for GPU support
-curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.cuda.yaml
+curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda.yaml
 docker compose --file compose.cuda.yaml up --detach
 # for CPU only (use this if you don't have a GPU, as the image is much smaller)
-curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.cpu.yaml
+curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cpu.yaml
 docker compose --file compose.cpu.yaml up --detach
 ```
 
@@ -49,9 +53,9 @@ docker compose --file compose.cpu.yaml up --detach
 
 ```bash
 # for GPU support
-docker run --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --detach fedirz/faster-whisper-server:latest-cuda
+docker run --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --detach ghcr.io/speaches-ai/speaches:latest-cuda
 # for CPU only (use this if you don't have a GPU, as the image is much smaller)
-docker run --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=Systran/faster-whisper-small --detach fedirz/faster-whisper-server:latest-cpu
+docker run --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=Systran/faster-whisper-small --detach ghcr.io/speaches-ai/speaches:latest-cpu
 ```
 
 ### Using Kubernetes
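The server remains OpenAI API-compatible across the rename, so existing clients only need the new image and container names. A minimal transcription request against a local instance (a sketch; `audio.mp3` is a placeholder file, and the multipart form layout mirrors OpenAI's API):

```bash
curl http://localhost:8000/v1/audio/transcriptions \
  -F "file=@audio.mp3" \
  -F "model=Systran/faster-whisper-small"
```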
Taskfile.yaml CHANGED
@@ -2,8 +2,8 @@ version: "3"
 tasks:
   server:
     cmds:
-      - pkill --signal SIGKILL --echo --full 'uvicorn --factory --host 0.0.0.0 faster_whisper_server.main:create_app' || true
-      - opentelemetry-instrument uvicorn --factory --host 0.0.0.0 faster_whisper_server.main:create_app {{.CLI_ARGS}}
+      - pkill --signal SIGKILL --echo --full 'uvicorn --factory --host 0.0.0.0 speaches.main:create_app' || true
+      - opentelemetry-instrument uvicorn --factory --host 0.0.0.0 speaches.main:create_app {{.CLI_ARGS}}
     sources:
       - src/**/*.py
   test:
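The `server` task keeps its name; only the module path inside it changes. Assuming the [Task](https://taskfile.dev) runner is installed, the dev loop is unchanged:

```bash
# Kills any previously running instance, then starts the instrumented server
task server
```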
compose.cpu.yaml CHANGED
@@ -1,11 +1,11 @@
 # include:
 #   - compose.observability.yaml
 services:
-  faster-whisper-server:
+  speaches:
     extends:
       file: compose.yaml
-      service: faster-whisper-server
-    image: fedirz/faster-whisper-server:latest-cpu
+      service: speaches
+    image: ghcr.io/speaches-ai/speaches:latest-cpu
     build:
       args:
         BASE_IMAGE: ubuntu:24.04
compose.cuda-cdi.yaml CHANGED
@@ -4,10 +4,10 @@
 # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html
 # https://docs.docker.com/reference/cli/dockerd/#enable-cdi-devices
 services:
-  faster-whisper-server:
+  speaches:
     extends:
       file: compose.cuda.yaml
-      service: faster-whisper-server
+      service: speaches
     volumes:
       - hf-hub-cache:/home/ubuntu/.cache/huggingface/hub
     deploy:
compose.cuda.yaml CHANGED
@@ -1,11 +1,11 @@
 # include:
 #   - compose.observability.yaml
 services:
-  faster-whisper-server:
+  speaches:
     extends:
       file: compose.yaml
-      service: faster-whisper-server
-    image: fedirz/faster-whisper-server:latest-cuda
+      service: speaches
+    image: ghcr.io/speaches-ai/speaches:latest-cuda
    build:
       args:
         BASE_IMAGE: nvidia/cuda:12.6.2-cudnn-runtime-ubuntu24.04
compose.observability.yaml CHANGED
@@ -5,7 +5,7 @@ services:
     volumes:
       - ./configuration/opentelemetry-collector.yaml:/etc/opentelemetry-collector.yaml
     ports:
-      # NOTE: when `faster-whisper-server` is also running as a Docker Compose service, this doesn't need to be exposed.
+      # NOTE: when `speaches` is also running as a Docker Compose service, this doesn't need to be exposed.
       - 4317:4317 # OTLP gRPC receiver
       # - 4318:4318 # OTLP HTTP receiver
       # - 8888:8888 # Prometheus metrics exposed by the Collector
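When the server is run outside Compose (as in the Taskfile above), it can still export traces through the collector's published OTLP gRPC port. A sketch using the standard OpenTelemetry environment variable; the localhost endpoint is an assumption for a single-machine setup:

```bash
# Point the auto-instrumentation at the collector's OTLP gRPC receiver (assumed local)
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
opentelemetry-instrument uvicorn --factory --host 0.0.0.0 speaches.main:create_app
```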
compose.yaml CHANGED
@@ -1,7 +1,7 @@
 # TODO: https://docs.astral.sh/uv/guides/integration/docker/#configuring-watch-with-docker-compose
 services:
-  faster-whisper-server:
-    container_name: faster-whisper-server
+  speaches:
+    container_name: speaches
     build:
       dockerfile: Dockerfile
       context: .
docs/configuration.md CHANGED
@@ -1,5 +1,5 @@
 <!-- https://mkdocstrings.github.io/python/usage/configuration/general/ -->
-::: faster_whisper_server.config.Config
+::: speaches.config.Config
     options:
       show_bases: true
       show_if_no_docstring: true
@@ -16,7 +16,7 @@
       - "!speech_*"
       - "!transcription_*"
 
-::: faster_whisper_server.config.WhisperConfig
+::: speaches.config.WhisperConfig
 
 <!-- TODO: nested model `whisper` -->
 <!-- TODO: Insert new lines for multi-line docstrings -->
docs/installation.md CHANGED
@@ -9,25 +9,25 @@ Download the necessary Docker Compose files
 === "CUDA"
 
     ```bash
-    curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.yaml
-    curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.cuda.yaml
+    curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.yaml
+    curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda.yaml
     export COMPOSE_FILE=compose.cuda.yaml
     ```
 
 === "CUDA (with CDI feature enabled)"
 
     ```bash
-    curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.yaml
-    curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.cuda.yaml
-    curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.cuda-cdi.yaml
+    curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.yaml
+    curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda.yaml
+    curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda-cdi.yaml
     export COMPOSE_FILE=compose.cuda-cdi.yaml
     ```
 
 === "CPU"
 
     ```bash
-    curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.yaml
-    curl --silent --remote-name https://raw.githubusercontent.com/fedirz/faster-whisper-server/master/compose.cpu.yaml
+    curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.yaml
+    curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cpu.yaml
    export COMPOSE_FILE=compose.cpu.yaml
     ```
 
@@ -58,10 +58,10 @@ docker compose up --detach
       --rm \
       --detach \
       --publish 8000:8000 \
-      --name faster-whisper-server \
+      --name speaches \
       --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
       --gpus=all \
-      fedirz/faster-whisper-server:latest-cuda
+      ghcr.io/speaches-ai/speaches:latest-cuda
     ```
 
 === "CUDA (with CDI feature enabled)"
@@ -71,10 +71,10 @@ docker compose up --detach
       --rm \
       --detach \
       --publish 8000:8000 \
-      --name faster-whisper-server \
+      --name speaches \
       --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
       --device=nvidia.com/gpu=all \
-      fedirz/faster-whisper-server:latest-cuda
+      ghcr.io/speaches-ai/speaches:latest-cuda
     ```
 
 === "CPU"
@@ -84,31 +84,31 @@ docker compose up --detach
       --rm \
       --detach \
       --publish 8000:8000 \
-      --name faster-whisper-server \
+      --name speaches \
       --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
-      fedirz/faster-whisper-server:latest-cpu
+      ghcr.io/speaches-ai/speaches:latest-cpu
     ```
 
 ??? note "Build from source"
 
     ```bash
-    docker build --tag faster-whisper-server .
+    docker build --tag speaches .
 
     # NOTE: you need to install and enable [buildx](https://github.com/docker/buildx) for multi-platform builds
    # Build image for both amd64 and arm64
-    docker buildx build --tag faster-whisper-server --platform linux/amd64,linux/arm64 .
+    docker buildx build --tag speaches --platform linux/amd64,linux/arm64 .
 
     # Build image without CUDA support
-    docker build --tag faster-whisper-server --build-arg BASE_IMAGE=ubuntu:24.04 .
+    docker build --tag speaches --build-arg BASE_IMAGE=ubuntu:24.04 .
     ```
 
 ## Python (requires Python 3.12+ and `uv` package manager)
 
 ```bash
-git clone https://github.com/fedirz/faster-whisper-server.git
-cd faster-whisper-server
+git clone https://github.com/speaches-ai/speaches.git
+cd speaches
 uv venv
 sourve .venv/bin/activate
 uv sync --all-extras
-uvicorn --factory --host 0.0.0.0 faster_whisper_server.main:create_app
+uvicorn --factory --host 0.0.0.0 speaches.main:create_app
 ```
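Whichever install path is used, the result can be checked against the diagnostic and model-listing endpoints (a sketch; assumes the default `0.0.0.0:8000` bind):

```bash
curl http://localhost:8000/health
curl http://localhost:8000/v1/models
```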
docs/introduction.md CHANGED
@@ -8,19 +8,20 @@
 
 TODO: add HuggingFace Space URL
 
-# Faster Whisper Server
+# Speaches
 
-`faster-whisper-server` is an OpenAI API-compatible server supporting transcription, translation, and speech generation. For transcription/translation it uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and for text-to-speech [piper](https://github.com/rhasspy/piper) is used.
+`speaches` is an OpenAI API-compatible server supporting transcription, translation, and speech generation. For transcription/translation it uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and for text-to-speech [piper](https://github.com/rhasspy/piper) is used.
 
 ## Features:
 
 - GPU and CPU support.
 - [Deployable via Docker Compose / Docker](./installation.md)
 - [Highly configurable](./configuration.md)
-- OpenAI API compatible. All tools and SDKs that work with OpenAI's API should work with `faster-whisper-server`.
+- OpenAI API compatible. All tools and SDKs that work with OpenAI's API should work with `speaches`.
 - Streaming support (transcription is sent via [SSE](https://en.wikipedia.org/wiki/Server-sent_events) as the audio is transcribed. You don't need to wait for the audio to fully be transcribed before receiving it).
 - Live transcription support (audio is sent via websocket as it's generated).
 - Dynamic model loading / offloading. Just specify which model you want to use in the request and it will be loaded automatically. It will then be unloaded after a period of inactivity.
+- [Text-to-speech (TTS) via `piper`]
 - (Coming soon) Audio generation (chat completions endpoint) | [OpenAI Documentation](https://platform.openai.com/docs/guides/realtime)
   - Generate a spoken audio summary of a body of text (text in, audio out)
   - Perform sentiment analysis on a recording (audio in, text out)
@@ -34,7 +35,7 @@ Please create an issue if you find a bug, have a question, or a feature suggesti
 See [OpenAI API reference](https://platform.openai.com/docs/api-reference/audio) for more information.
 
 - Audio file transcription via `POST /v1/audio/transcriptions` endpoint.
-  - Unlike OpenAI's API, `faster-whisper-server` also supports streaming transcriptions (and translations). This is useful for when you want to process large audio files and would rather receive the transcription in chunks as they are processed, rather than waiting for the whole file to be transcribed. It works similarly to chat messages when chatting with LLMs.
+  - Unlike OpenAI's API, `speaches` also supports streaming transcriptions (and translations). This is useful for when you want to process large audio files and would rather receive the transcription in chunks as they are processed, rather than waiting for the whole file to be transcribed. It works similarly to chat messages when chatting with LLMs.
 - Audio file translation via `POST /v1/audio/translations` endpoint.
 - Live audio transcription via `WS /v1/audio/transcriptions` endpoint.
   - LocalAgreement2 ([paper](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) | [original implementation](https://github.com/ufal/whisper_streaming)) algorithm is used for live transcription.
docs/openapi.json CHANGED
@@ -1 +1 @@
-{"openapi":"3.1.0","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/v1/audio/translations":{"post":{"tags":["automatic-speech-recognition"],"summary":"Translate File","operationId":"translate_file_v1_audio_translations_post","requestBody":{"content":{"application/x-www-form-urlencoded":{"schema":{"$ref":"#/components/schemas/Body_translate_file_v1_audio_translations_post"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/CreateTranscriptionResponseJson"},{"$ref":"#/components/schemas/CreateTranscriptionResponseVerboseJson"}],"title":"Response Translate File V1 Audio Translations Post"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/transcriptions":{"post":{"tags":["automatic-speech-recognition"],"summary":"Transcribe File","operationId":"transcribe_file_v1_audio_transcriptions_post","requestBody":{"content":{"application/x-www-form-urlencoded":{"schema":{"$ref":"#/components/schemas/Body_transcribe_file_v1_audio_transcriptions_post"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/CreateTranscriptionResponseJson"},{"$ref":"#/components/schemas/CreateTranscriptionResponseVerboseJson"}],"title":"Response Transcribe File V1 Audio Transcriptions Post"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/models":{"get":{"tags":["models"],"summary":"Get Models","operationId":"get_models_v1_models_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ListModelsResponse"}}}}}}},"/v1/models/{model_name}":{"get":{"tags":["models"],"summary":"Get Model","operationId":"get_model_v1_models__model_name__get","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"},"example":"Systran/faster-distil-whisper-large-v3"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/Model"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/health":{"get":{"tags":["diagnostic"],"summary":"Health","operationId":"health_health_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/api/pull/{model_name}":{"post":{"tags":["experimental"],"summary":"Download a model from Hugging Face.","operationId":"pull_model_api_pull__model_name__post","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/ps":{"get":{"tags":["experimental"],"summary":"Get a list of loaded models.","operationId":"get_running_models_api_ps_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Running Models Api Ps Get"}}}}}}},"/api/ps/{model_name}":{"post":{"tags":["experimental"],"summary":"Load a model into memory.","operationId":"load_model_route_api_ps__model_name__post","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"tags":["experimental"],"summary":"Unload a model from memory.","operationId":"stop_running_model_api_ps__model_name__delete","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/speech":{"post":{"tags":["speech-to-text"],"summary":"Synthesize","operationId":"synthesize_v1_audio_speech_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CreateSpeechRequestBody"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/speech/voices":{"get":{"tags":["speech-to-text"],"summary":"List Voices","operationId":"list_voices_v1_audio_speech_voices_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"$ref":"#/components/schemas/PiperModel"},"type":"array","title":"Response List Voices V1 Audio Speech Voices Get"}}}}}}}},"components":{"schemas":{"Body_transcribe_file_v1_audio_transcriptions_post":{"properties":{"model":{"anyOf":[{"type":"string","description":"The ID of the model. You can get a list of available models by calling `/v1/models`.","examples":["Systran/faster-distil-whisper-large-v3","bofenghuang/whisper-large-v2-cv11-french-ct2"]},{"type":"null"}],"title":"Model"},"language":{"anyOf":[{"$ref":"#/components/schemas/Language"},{"type":"null"}]},"prompt":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Prompt"},"response_format":{"anyOf":[{"$ref":"#/components/schemas/faster_whisper_server__config__ResponseFormat"},{"type":"null"}]},"temperature":{"type":"number","title":"Temperature","default":0.0},"timestamp_granularities":{"items":{"type":"string","enum":["segment","word"]},"type":"array","title":"Timestamp Granularities","default":["segment"]},"stream":{"type":"boolean","title":"Stream","default":false},"hotwords":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Hotwords"},"vad_filter":{"type":"boolean","title":"Vad Filter","default":false},"file":{"type":"string","format":"binary","title":"File"}},"type":"object","required":["file"],"title":"Body_transcribe_file_v1_audio_transcriptions_post"},"Body_translate_file_v1_audio_translations_post":{"properties":{"model":{"anyOf":[{"type":"string","description":"The ID of the model. You can get a list of available models by calling `/v1/models`.","examples":["Systran/faster-distil-whisper-large-v3","bofenghuang/whisper-large-v2-cv11-french-ct2"]},{"type":"null"}],"title":"Model"},"prompt":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Prompt"},"response_format":{"anyOf":[{"$ref":"#/components/schemas/faster_whisper_server__config__ResponseFormat"},{"type":"null"}]},"temperature":{"type":"number","title":"Temperature","default":0.0},"stream":{"type":"boolean","title":"Stream","default":false},"vad_filter":{"type":"boolean","title":"Vad Filter","default":false},"file":{"type":"string","format":"binary","title":"File"}},"type":"object","required":["file"],"title":"Body_translate_file_v1_audio_translations_post"},"CreateSpeechRequestBody":{"properties":{"model":{"type":"string","enum":["piper"],"const":"piper","title":"Model","description":"The ID of the model. The only supported model is 'piper'.","default":"piper","examples":["piper"]},"input":{"type":"string","title":"Input","description":"The text to generate audio for. ","examples":["A rainbow is an optical phenomenon caused by refraction, internal reflection and dispersion of light in water droplets resulting in a continuous spectrum of light appearing in the sky. The rainbow takes the form of a multicoloured circular arc. Rainbows caused by sunlight always appear in the section of sky directly opposite the Sun. Rainbows can be caused by many forms of airborne water. These include not only rain, but also mist, spray, and airborne dew."]},"voice":{"type":"string","title":"Voice","default":"en_US-amy-medium"},"response_format":{"$ref":"#/components/schemas/faster_whisper_server__routers__speech__ResponseFormat","description":"The format to audio in. Supported formats are mp3, flac, wav, pcm. opus, aac are not supported","default":"mp3","examples":["mp3","flac","wav","pcm"]},"speed":{"type":"number","maximum":4.0,"minimum":0.25,"title":"Speed","default":1.0},"sample_rate":{"anyOf":[{"type":"integer","maximum":48000.0,"minimum":8000.0},{"type":"null"}],"title":"Sample Rate"}},"type":"object","required":["input"],"title":"CreateSpeechRequestBody"},"CreateTranscriptionResponseJson":{"properties":{"text":{"type":"string","title":"Text"}},"type":"object","required":["text"],"title":"CreateTranscriptionResponseJson"},"CreateTranscriptionResponseVerboseJson":{"properties":{"task":{"type":"string","title":"Task","default":"transcribe"},"language":{"type":"string","title":"Language"},"duration":{"type":"number","title":"Duration"},"text":{"type":"string","title":"Text"},"words":{"anyOf":[{"items":{"$ref":"#/components/schemas/TranscriptionWord"},"type":"array"},{"type":"null"}],"title":"Words"},"segments":{"items":{"$ref":"#/components/schemas/TranscriptionSegment"},"type":"array","title":"Segments"}},"type":"object","required":["language","duration","text","words","segments"],"title":"CreateTranscriptionResponseVerboseJson"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"Language":{"type":"string","enum":["af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","yue","zh"],"title":"Language"},"ListModelsResponse":{"properties":{"data":{"items":{"$ref":"#/components/schemas/Model"},"type":"array","title":"Data"},"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"ListModelsResponse"},"Model":{"properties":{"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object"},"owned_by":{"type":"string","title":"Owned By"},"language":{"items":{"type":"string"},"type":"array","title":"Language"}},"type":"object","required":["id","created","object","owned_by"],"title":"Model","examples":[{"created":1700732060,"id":"Systran/faster-whisper-large-v3","object":"model","owned_by":"Systran"},{"created":1711378296,"id":"Systran/faster-distil-whisper-large-v3","object":"model","owned_by":"Systran"},{"created":1687968011,"id":"bofenghuang/whisper-large-v2-cv11-french-ct2","object":"model","owned_by":"bofenghuang"}]},"PiperModel":{"properties":{"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"created":{"type":"integer","title":"Created"},"owned_by":{"type":"string","enum":["rhasspy"],"const":"rhasspy","title":"Owned By","default":"rhasspy"},"model_path":{"type":"string","format":"path","title":"Model Path","examples":["/home/nixos/.cache/huggingface/hub/models--rhasspy--piper-voices/snapshots/3d796cc2f2c884b3517c527507e084f7bb245aea/en/en_US/amy/medium/en_US-amy-medium.onnx"]},"id":{"type":"string","title":"Id","readOnly":true,"examples":["rhasspy/piper-voices/en_US-amy-medium"]},"voice":{"type":"string","title":"Voice","readOnly":true,"examples":["rhasspy/piper-voices/en_US-amy-medium"]},"config_path":{"type":"string","format":"path","title":"Config Path","readOnly":true},"quality":{"type":"string","enum":["x_low","low","medium","high"],"title":"Quality","readOnly":true},"sample_rate":{"type":"integer","title":"Sample Rate","readOnly":true}},"type":"object","required":["created","model_path","id","voice","config_path","quality","sample_rate"],"title":"PiperModel","description":"Similar structure to the GET /v1/models response but with extra fields."},"TranscriptionSegment":{"properties":{"id":{"type":"integer","title":"Id"},"seek":{"type":"integer","title":"Seek"},"start":{"type":"number","title":"Start"},"end":{"type":"number","title":"End"},"text":{"type":"string","title":"Text"},"tokens":{"items":{"type":"integer"},"type":"array","title":"Tokens"},"temperature":{"type":"number","title":"Temperature"},"avg_logprob":{"type":"number","title":"Avg Logprob"},"compression_ratio":{"type":"number","title":"Compression Ratio"},"no_speech_prob":{"type":"number","title":"No Speech Prob"},"words":{"anyOf":[{"items":{"$ref":"#/components/schemas/TranscriptionWord"},"type":"array"},{"type":"null"}],"title":"Words"}},"type":"object","required":["id","seek","start","end","text","tokens","temperature","avg_logprob","compression_ratio","no_speech_prob","words"],"title":"TranscriptionSegment"},"TranscriptionWord":{"properties":{"start":{"type":"number","title":"Start"},"end":{"type":"number","title":"End"},"word":{"type":"string","title":"Word"},"probability":{"type":"number","title":"Probability"}},"type":"object","required":["start","end","word","probability"],"title":"TranscriptionWord"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"faster_whisper_server__config__ResponseFormat":{"type":"string","enum":["text","json","verbose_json","srt","vtt"],"title":"ResponseFormat"},"faster_whisper_server__routers__speech__ResponseFormat":{"type":"string","enum":["mp3","flac","wav","pcm"]}}},"tags":[{"name":"automatic-speech-recognition"},{"name":"speech-to-text"},{"name":"models"},{"name":"diagnostic"},{"name":"experimental","description":"Not meant for public use yet. May change or be removed at any time."}]}
+{"openapi":"3.1.0","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/v1/audio/translations":{"post":{"tags":["automatic-speech-recognition"],"summary":"Translate File","operationId":"translate_file_v1_audio_translations_post","requestBody":{"content":{"application/x-www-form-urlencoded":{"schema":{"$ref":"#/components/schemas/Body_translate_file_v1_audio_translations_post"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/CreateTranscriptionResponseJson"},{"$ref":"#/components/schemas/CreateTranscriptionResponseVerboseJson"}],"title":"Response Translate File V1 Audio Translations Post"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/transcriptions":{"post":{"tags":["automatic-speech-recognition"],"summary":"Transcribe File","operationId":"transcribe_file_v1_audio_transcriptions_post","requestBody":{"content":{"application/x-www-form-urlencoded":{"schema":{"$ref":"#/components/schemas/Body_transcribe_file_v1_audio_transcriptions_post"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/CreateTranscriptionResponseJson"},{"$ref":"#/components/schemas/CreateTranscriptionResponseVerboseJson"}],"title":"Response Transcribe File V1 Audio Transcriptions Post"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/models":{"get":{"tags":["models"],"summary":"Get Models","operationId":"get_models_v1_models_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ListModelsResponse"}}}}}}},"/v1/models/{model_name}":{"get":{"tags":["models"],"summary":"Get Model","operationId":"get_model_v1_models__model_name__get","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"},"example":"Systran/faster-distil-whisper-large-v3"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/Model"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/health":{"get":{"tags":["diagnostic"],"summary":"Health","operationId":"health_health_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/api/pull/{model_name}":{"post":{"tags":["experimental"],"summary":"Download a model from Hugging Face.","operationId":"pull_model_api_pull__model_name__post","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/ps":{"get":{"tags":["experimental"],"summary":"Get a list of loaded models.","operationId":"get_running_models_api_ps_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Running Models Api Ps Get"}}}}}}},"/api/ps/{model_name}":{"post":{"tags":["experimental"],"summary":"Load a model into memory.","operationId":"load_model_route_api_ps__model_name__post","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"tags":["experimental"],"summary":"Unload a model from memory.","operationId":"stop_running_model_api_ps__model_name__delete","parameters":[{"name":"model_name","in":"path","required":true,"schema":{"type":"string","title":"Model Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/speech":{"post":{"tags":["speech-to-text"],"summary":"Synthesize","operationId":"synthesize_v1_audio_speech_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CreateSpeechRequestBody"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/audio/speech/voices":{"get":{"tags":["speech-to-text"],"summary":"List Voices","operationId":"list_voices_v1_audio_speech_voices_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"$ref":"#/components/schemas/PiperModel"},"type":"array","title":"Response List Voices V1 Audio Speech Voices Get"}}}}}}}},"components":{"schemas":{"Body_transcribe_file_v1_audio_transcriptions_post":{"properties":{"model":{"anyOf":[{"type":"string","description":"The ID of the model. You can get a list of available models by calling `/v1/models`.","examples":["Systran/faster-distil-whisper-large-v3","bofenghuang/whisper-large-v2-cv11-french-ct2"]},{"type":"null"}],"title":"Model"},"language":{"anyOf":[{"$ref":"#/components/schemas/Language"},{"type":"null"}]},"prompt":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Prompt"},"response_format":{"anyOf":[{"$ref":"#/components/schemas/speaches__config__ResponseFormat"},{"type":"null"}]},"temperature":{"type":"number","title":"Temperature","default":0.0},"timestamp_granularities":{"items":{"type":"string","enum":["segment","word"]},"type":"array","title":"Timestamp Granularities","default":["segment"]},"stream":{"type":"boolean","title":"Stream","default":false},"hotwords":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Hotwords"},"vad_filter":{"type":"boolean","title":"Vad Filter","default":false},"file":{"type":"string","format":"binary","title":"File"}},"type":"object","required":["file"],"title":"Body_transcribe_file_v1_audio_transcriptions_post"},"Body_translate_file_v1_audio_translations_post":{"properties":{"model":{"anyOf":[{"type":"string","description":"The ID of the model. You can get a list of available models by calling `/v1/models`.","examples":["Systran/faster-distil-whisper-large-v3","bofenghuang/whisper-large-v2-cv11-french-ct2"]},{"type":"null"}],"title":"Model"},"prompt":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Prompt"},"response_format":{"anyOf":[{"$ref":"#/components/schemas/speaches__config__ResponseFormat"},{"type":"null"}]},"temperature":{"type":"number","title":"Temperature","default":0.0},"stream":{"type":"boolean","title":"Stream","default":false},"vad_filter":{"type":"boolean","title":"Vad Filter","default":false},"file":{"type":"string","format":"binary","title":"File"}},"type":"object","required":["file"],"title":"Body_translate_file_v1_audio_translations_post"},"CreateSpeechRequestBody":{"properties":{"model":{"type":"string","enum":["piper"],"const":"piper","title":"Model","description":"The ID of the model. The only supported model is 'piper'.","default":"piper","examples":["piper"]},"input":{"type":"string","title":"Input","description":"The text to generate audio for. ","examples":["A rainbow is an optical phenomenon caused by refraction, internal reflection and dispersion of light in water droplets resulting in a continuous spectrum of light appearing in the sky. The rainbow takes the form of a multicoloured circular arc. Rainbows caused by sunlight always appear in the section of sky directly opposite the Sun. Rainbows can be caused by many forms of airborne water. These include not only rain, but also mist, spray, and airborne dew."]},"voice":{"type":"string","title":"Voice","default":"en_US-amy-medium"},"response_format":{"$ref":"#/components/schemas/speaches__routers__speech__ResponseFormat","description":"The format to audio in. Supported formats are mp3, flac, wav, pcm. opus, aac are not supported","default":"mp3","examples":["mp3","flac","wav","pcm"]},"speed":{"type":"number","maximum":4.0,"minimum":0.25,"title":"Speed","default":1.0},"sample_rate":{"anyOf":[{"type":"integer","maximum":48000.0,"minimum":8000.0},{"type":"null"}],"title":"Sample Rate"}},"type":"object","required":["input"],"title":"CreateSpeechRequestBody"},"CreateTranscriptionResponseJson":{"properties":{"text":{"type":"string","title":"Text"}},"type":"object","required":["text"],"title":"CreateTranscriptionResponseJson"},"CreateTranscriptionResponseVerboseJson":{"properties":{"task":{"type":"string","title":"Task","default":"transcribe"},"language":{"type":"string","title":"Language"},"duration":{"type":"number","title":"Duration"},"text":{"type":"string","title":"Text"},"words":{"anyOf":[{"items":{"$ref":"#/components/schemas/TranscriptionWord"},"type":"array"},{"type":"null"}],"title":"Words"},"segments":{"items":{"$ref":"#/components/schemas/TranscriptionSegment"},"type":"array","title":"Segments"}},"type":"object","required":["language","duration","text","words","segments"],"title":"CreateTranscriptionResponseVerboseJson"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"Language":{"type":"string","enum":["af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","yue","zh"],"title":"Language"},"ListModelsResponse":{"properties":{"data":{"items":{"$ref":"#/components/schemas/Model"},"type":"array","title":"Data"},"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"ListModelsResponse"},"Model":{"properties":{"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object"},"owned_by":{"type":"string","title":"Owned By"},"language":{"items":{"type":"string"},"type":"array","title":"Language"}},"type":"object","required":["id","created","object","owned_by"],"title":"Model","examples":[{"created":1700732060,"id":"Systran/faster-whisper-large-v3","object":"model","owned_by":"Systran"},{"created":1711378296,"id":"Systran/faster-distil-whisper-large-v3","object":"model","owned_by":"Systran"},{"created":1687968011,"id":"bofenghuang/whisper-large-v2-cv11-french-ct2","object":"model","owned_by":"bofenghuang"}]},"PiperModel":{"properties":{"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"created":{"type":"integer","title":"Created"},"owned_by":{"type":"string","enum":["rhasspy"],"const":"rhasspy","title":"Owned By","default":"rhasspy"},"model_path":{"type":"string","format":"path","title":"Model Path","examples":["/home/nixos/.cache/huggingface/hub/models--rhasspy--piper-voices/snapshots/3d796cc2f2c884b3517c527507e084f7bb245aea/en/en_US/amy/medium/en_US-amy-medium.onnx"]},"id":{"type":"string","title":"Id","readOnly":true,"examples":["rhasspy/piper-voices/en_US-amy-medium"]},"voice":{"type":"string","title":"Voice","readOnly":true,"examples":["rhasspy/piper-voices/en_US-amy-medium"]},"config_path":{"type":"string","format":"path","title":"Config Path","readOnly":true},"quality":{"type":"string","enum":["x_low","low","medium","high"],"title":"Quality","readOnly":true},"sample_rate":{"type":"integer","title":"Sample Rate","readOnly":true}},"type":"object","required":["created","model_path","id","voice","config_path","quality","sample_rate"],"title":"PiperModel","description":"Similar structure to the GET /v1/models response but with extra fields."},"TranscriptionSegment":{"properties":{"id":{"type":"integer","title":"Id"},"seek":{"type":"integer","title":"Seek"},"start":{"type":"number","title":"Start"},"end":{"type":"number","title":"End"},"text":{"type":"string","title":"Text"},"tokens":{"items":{"type":"integer"},"type":"array","title":"Tokens"},"temperature":{"type":"number","title":"Temperature"},"avg_logprob":{"type":"number","title":"Avg Logprob"},"compression_ratio":{"type":"number","title":"Compression Ratio"},"no_speech_prob":{"type":"number","title":"No Speech Prob"},"words":{"anyOf":[{"items":{"$ref":"#/components/schemas/TranscriptionWord"},"type":"array"},{"type":"null"}],"title":"Words"}},"type":"object","required":["id","seek","start","end","text","tokens","temperature","avg_logprob","compression_ratio","no_speech_prob","words"],"title":"TranscriptionSegment"},"TranscriptionWord":{"properties":{"start":{"type":"number","title":"Start"},"end":{"type":"number","title":"End"},"word":{"type":"string","title":"Word"},"probability":{"type":"number","title":"Probability"}},"type":"object","required":["start","end","word","probability"],"title":"TranscriptionWord"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"speaches__config__ResponseFormat":{"type":"string","enum":["text","json","verbose_json","srt","vtt"],"title":"ResponseFormat"},"speaches__routers__speech__ResponseFormat":{"type":"string","enum":["mp3","flac","wav","pcm"]}}},"tags":[{"name":"automatic-speech-recognition"},{"name":"speech-to-text"},{"name":"models"},{"name":"diagnostic"},{"name":"experimental","description":"Not meant for public use yet. May change or be removed at any time."}]}
docs/usage/open-webui-integration.md CHANGED
@@ -6,7 +6,7 @@
 2. Click on the "Audio" tab
 3. Update settings
    - Speech-to-Text Engine: OpenAI
-   - API Base URL: http://faster-whisper-server:8000/v1
+   - API Base URL: http://speaches:8000/v1
    - API Key: does-not-matter-what-you-put-but-should-not-be-empty
    - Model: Systran/faster-distil-whisper-large-v3
 4. Click "Save"
@@ -27,10 +27,10 @@ services:
     ...
     # Environment variables are documented here https://docs.openwebui.com/getting-started/env-configuration#speech-to-text
     AUDIO_STT_ENGINE: "openai"
-    AUDIO_STT_OPENAI_API_BASE_URL: "http://faster-whisper-server:8000/v1"
+    AUDIO_STT_OPENAI_API_BASE_URL: "http://speaches:8000/v1"
     AUDIO_STT_OPENAI_API_KEY: "does-not-matter-what-you-put-but-should-not-be-empty"
     AUDIO_STT_MODEL: "Systran/faster-distil-whisper-large-v3"
-  faster-whisper-server:
-    image: fedirz/faster-whisper-server:latest-cuda
+  speaches:
+    image: ghcr.io/speaches-ai/speaches:latest-cuda
     ...
 ```
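Before wiring up Open WebUI, the same OpenAI-compatible endpoint can be smoke-tested directly. A sketch (the hostname `speaches` only resolves inside the compose network, so `localhost:8000` is used from the host; the sample file is a placeholder):

```python
# Verify the STT endpoint Open WebUI will call. The API key just needs to be
# non-empty, matching the docs above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="does-not-matter")
with open("sample.wav", "rb") as f:
    transcript = client.audio.transcriptions.create(
        model="Systran/faster-distil-whisper-large-v3", file=f
    )
print(transcript.text)
```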
docs/usage/text-to-speech.md CHANGED
@@ -2,7 +2,6 @@
 
 This feature is not supported on ARM devices, only x86_64. I was unable to build [piper-phonemize](https://github.com/rhasspy/piper-phonemize) (my [fork](https://github.com/fedirz/piper-phonemize))
 
-http://localhost:8001/faster-whisper-server/api/
 TODO: add a note about automatic downloads
 TODO: add a demo
 TODO: add a note about tts only running on cpu
@@ -19,13 +18,13 @@ Download the piper voices from [HuggingFace model repository](https://huggingfac
 
 ```bash
 # Download all voices (~15 minutes / 7.7 GB)
-docker exec -it faster-whisper-server huggingface-cli download rhasspy/piper-voices
+docker exec -it speaches huggingface-cli download rhasspy/piper-voices
 # Download all English voices (~4.5 minutes)
-docker exec -it faster-whisper-server huggingface-cli download rhasspy/piper-voices --include 'en/**/*' 'voices.json'
+docker exec -it speaches huggingface-cli download rhasspy/piper-voices --include 'en/**/*' 'voices.json'
 # Download all qualities of a specific voice (~4 seconds)
-docker exec -it faster-whisper-server huggingface-cli download rhasspy/piper-voices --include 'en/en_US/amy/**/*' 'voices.json'
+docker exec -it speaches huggingface-cli download rhasspy/piper-voices --include 'en/en_US/amy/**/*' 'voices.json'
 # Download specific quality of a specific voice (~2 seconds)
-docker exec -it faster-whisper-server huggingface-cli download rhasspy/piper-voices --include 'en/en_US/amy/medium/*' 'voices.json'
+docker exec -it speaches huggingface-cli download rhasspy/piper-voices --include 'en/en_US/amy/medium/*' 'voices.json'
 ```
 
 !!! note
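Once a voice is downloaded, the speech endpoint can be exercised with a few lines. A sketch, assuming the OpenAI-style `/v1/audio/speech` path and model/voice IDs in the form shown in the `PiperModel` schema examples earlier (e.g. `rhasspy/piper-voices/en_US-amy-medium`); `response_format` values come from the `mp3`/`flac`/`wav`/`pcm` enum in the schema:

```python
# Sketch: synthesize speech with a downloaded Piper voice. Endpoint path and
# model/voice strings are assumptions based on the OpenAPI dump above.
import httpx

resp = httpx.post(
    "http://localhost:8000/v1/audio/speech",
    json={
        "model": "rhasspy/piper-voices/en_US-amy-medium",
        "voice": "rhasspy/piper-voices/en_US-amy-medium",
        "input": "Hello from speaches!",
        "response_format": "wav",
    },
    timeout=None,
)
resp.raise_for_status()
with open("hello.wav", "wb") as f:
    f.write(resp.content)
```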
examples/javascript/index.js CHANGED
@@ -1,5 +1,5 @@
 /**
- * Example provided by https://github.com/Gan-Xing in https://github.com/fedirz/faster-whisper-server/issues/26
+ * Example provided by https://github.com/Gan-Xing in https://github.com/speaches-ai/speaches/issues/26
 */
 import 'dotenv/config';
 import fs from 'node:fs';
examples/live-audio/script.sh CHANGED
@@ -9,10 +9,10 @@ set -e
 
 export WHISPER__MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.
 
-# Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
-docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cuda
+# Ensure you have `speaches` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
+docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL ghcr.io/speaches-ai/speaches:latest-cuda
 # or you can run it on a CPU
-# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cpu
+# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL ghcr.io/speaches-ai/speaches:latest-cpu
 
 # `pv` is used to limit the rate at which the audio is streamed to the server. Audio is being streamed at a rate of 32 kB/s (16000 samples per second * 16 bits per sample / 8 bits per byte = 32000 bytes per second). This emulates live audio input from a microphone: `ffmpeg -loglevel quiet -f alsa -i default -ac 1 -ar 16000 -f s16le -`
 # shellcheck disable=SC2002
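The 32 kB/s figure falls straight out of the PCM parameters: 16000 samples/s × 16 bits/sample ÷ 8 bits/byte = 32000 bytes/s. A sketch of the same pacing logic in Python, as a stand-in for `pv`'s rate limiting (the 100 ms chunk size is an arbitrary choice, not taken from the script):

```python
# Relay raw s16le mono 16 kHz audio from stdin to stdout at real-time speed.
import sys
import time

SAMPLE_RATE = 16_000
BYTES_PER_SECOND = SAMPLE_RATE * 16 // 8  # 32000 bytes per second

CHUNK = BYTES_PER_SECOND // 10  # forward 100 ms of audio per iteration
while chunk := sys.stdin.buffer.read(CHUNK):
    sys.stdout.buffer.write(chunk)
    sys.stdout.buffer.flush()
    time.sleep(len(chunk) / BYTES_PER_SECOND)  # pace output to the audio rate
```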
examples/youtube/script.sh CHANGED
@@ -5,10 +5,10 @@ set -e
 # NOTE: do not use any distil-* model other than the large ones as they don't work on long audio files for some reason.
 export WHISPER__MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.
 
-# Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
-docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cuda
+# Ensure you have `speaches` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
+docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL ghcr.io/speaches-ai/speaches:latest-cuda
 # or you can run it on a CPU
-# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cpu
+# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL ghcr.io/speaches-ai/speaches:latest-cpu
 
 # Download the audio from a YouTube video. In this example I'm downloading "The Evolution of the Operating System" by the Asianometry YouTube channel. I highly recommend checking this channel out; the guy produces very high-quality content. If you don't have `youtube-dl`, you'll have to install it. https://github.com/ytdl-org/youtube-dl
 youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'
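The health-check-then-transcribe flow the comments describe is easy to script end to end. A sketch (localhost address, one-second poll interval, and the unbounded timeout are assumptions, not taken from the repo):

```python
# Poll /health until the model is loaded, then transcribe the downloaded file.
import time

import httpx
from openai import OpenAI

while True:
    try:
        if httpx.get("http://localhost:8000/health").status_code == 200:
            break  # server is up and the model is loaded
    except httpx.TransportError:
        pass  # server not accepting connections yet
    time.sleep(1)

client = OpenAI(base_url="http://localhost:8000/v1", api_key="does-not-matter")
with open("the-evolution-of-the-operating-system.mp3", "rb") as f:
    transcript = client.audio.transcriptions.create(
        model="Systran/faster-distil-whisper-large-v3", file=f, timeout=None
    )
print(transcript.text)
```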
mkdocs.yml CHANGED
@@ -1,8 +1,8 @@
 # yaml-language-server: $schema=https://squidfunk.github.io/mkdocs-material/schema.json
 # https://www.mkdocs.org/user-guide/configuration/#configuration
-site_name: Faster Whisper Server Documentation
-site_url: https://fedirz.github.io/faster-whisper-server/
-repo_url: https://github.com/fedirz/faster-whisper-server/
+site_name: Speaches Documentation
+site_url: https://speaches-ai.github.io/speaches/
+repo_url: https://github.com/speaches-ai/speaches/
 edit_uri: edit/master/docs/
 docs_dir: docs
 theme:
pyproject.toml CHANGED
@@ -1,5 +1,5 @@
 [project]
-name = "faster-whisper-server"
+name = "speaches"
 version = "0.1.0"
 requires-python = ">=3.12,<3.13"
 # https://packaging.python.org/en/latest/specifications/version-specifiers/#id5
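The package rename means every `faster_whisper_server` import in the sections below becomes `speaches`. A throwaway migration script in that spirit (hypothetical; the commit itself renames the files and imports directly) could look like this:

```python
# Hypothetical helper: rewrite the old package name across the source tree.
# Illustrative only; run from the repo root and review the diff afterwards.
from pathlib import Path

for directory in ("src", "tests"):
    for path in Path(directory).rglob("*.py"):
        text = path.read_text()
        if "faster_whisper_server" in text:
            path.write_text(text.replace("faster_whisper_server", "speaches"))
```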
src/{faster_whisper_server → speaches}/__init__.py RENAMED
File without changes
src/{faster_whisper_server → speaches}/api_models.py RENAMED
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Literal
 
 from pydantic import BaseModel, ConfigDict, Field
 
-from faster_whisper_server.text_utils import Transcription, canonicalize_word, segments_to_text
+from speaches.text_utils import Transcription, canonicalize_word, segments_to_text
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -23,7 +23,7 @@ class TranscriptionWord(BaseModel):
     def from_segments(cls, segments: Iterable[TranscriptionSegment]) -> list[TranscriptionWord]:
         words: list[TranscriptionWord] = []
         for segment in segments:
-            # NOTE: a temporary "fix" for https://github.com/fedirz/faster-whisper-server/issues/58.
+            # NOTE: a temporary "fix" for https://github.com/speaches-ai/speaches/issues/58.
             # TODO: properly address the issue
             assert (
                 segment.words is not None
src/{faster_whisper_server → speaches}/asr.py RENAMED
@@ -5,13 +5,13 @@ import logging
 import time
 from typing import TYPE_CHECKING
 
-from faster_whisper_server.api_models import TranscriptionSegment, TranscriptionWord
-from faster_whisper_server.text_utils import Transcription
+from speaches.api_models import TranscriptionSegment, TranscriptionWord
+from speaches.text_utils import Transcription
 
 if TYPE_CHECKING:
     from faster_whisper import transcribe
 
-    from faster_whisper_server.audio import Audio
+    from speaches.audio import Audio
 
 logger = logging.getLogger(__name__)
 
src/{faster_whisper_server → speaches}/audio.py RENAMED
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, BinaryIO
 import numpy as np
 import soundfile as sf
 
-from faster_whisper_server.config import SAMPLES_PER_SECOND
+from speaches.config import SAMPLES_PER_SECOND
 
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
src/{faster_whisper_server → speaches}/config.py RENAMED
File without changes
src/{faster_whisper_server → speaches}/dependencies.py RENAMED
@@ -9,8 +9,8 @@ from openai import AsyncOpenAI
 from openai.resources.audio import AsyncSpeech, AsyncTranscriptions
 from openai.resources.chat.completions import AsyncCompletions
 
-from faster_whisper_server.config import Config
-from faster_whisper_server.model_manager import PiperModelManager, WhisperModelManager
+from speaches.config import Config
+from speaches.model_manager import PiperModelManager, WhisperModelManager
 
 logger = logging.getLogger(__name__)
 
@@ -73,7 +73,7 @@ def get_speech_client() -> AsyncSpeech:
     config = get_config()
     if config.speech_base_url is None:
         # this might not work as expected if `speech_router` won't have shared state (access to the same `model_manager`) with the main FastAPI `app`. TODO: verify  # noqa: E501
-        from faster_whisper_server.routers.speech import (
+        from speaches.routers.speech import (
            router as speech_router,
        )
 
@@ -94,7 +94,7 @@ def get_transcription_client() -> AsyncTranscriptions:
     config = get_config()
     if config.transcription_base_url is None:
         # this might not work as expected if `transcription_router` won't have shared state (access to the same `model_manager`) with the main FastAPI `app`. TODO: verify  # noqa: E501
-        from faster_whisper_server.routers.stt import (
+        from speaches.routers.stt import (
            router as stt_router,
        )
 
src/{faster_whisper_server → speaches}/gradio_app.py RENAMED
@@ -7,8 +7,8 @@ import httpx
 from httpx_sse import aconnect_sse
 from openai import AsyncOpenAI
 
-from faster_whisper_server.config import Config, Task
-from faster_whisper_server.hf_utils import PiperModel
+from speaches.config import Config, Task
+from speaches.hf_utils import PiperModel
 
 TRANSCRIPTION_ENDPOINT = "/v1/audio/transcriptions"
 TRANSLATION_ENDPOINT = "/v1/audio/translations"
@@ -128,9 +128,9 @@ def create_gradio_demo(config: Config) -> gr.Blocks:  # noqa: C901, PLR0915
            file.write(audio_bytes)
        return file_path
 
-    with gr.Blocks(title="faster-whisper-server Playground") as demo:
+    with gr.Blocks(title="Speaches Playground") as demo:
        gr.Markdown(
-            "### Consider supporting the project by starring the [repository on GitHub](https://github.com/fedirz/faster-whisper-server)."
+            "### Consider supporting the project by starring the [repository on GitHub](https://github.com/speaches-ai/speaches)."
        )
        with gr.Tab(label="Transcribe/Translate"):
            audio = gr.Audio(type="filepath")
@@ -157,7 +157,7 @@ def create_gradio_demo(config: Config) -> gr.Blocks:  # noqa: C901, PLR0915
 
        with gr.Tab(label="Speech Generation"):
            if platform.machine() != "x86_64":
-                from faster_whisper_server.routers.speech import (
+                from speaches.routers.speech import (
                    DEFAULT_VOICE,
                    MAX_SAMPLE_RATE,
                    MIN_SAMPLE_RATE,
src/{faster_whisper_server → speaches}/hf_utils.py RENAMED
@@ -10,7 +10,7 @@ import huggingface_hub
 from huggingface_hub.constants import HF_HUB_CACHE
 from pydantic import BaseModel, Field, computed_field
 
-from faster_whisper_server.api_models import Model
+from speaches.api_models import Model
 
 logger = logging.getLogger(__name__)
 
src/{faster_whisper_server → speaches}/logger.py RENAMED
File without changes
src/{faster_whisper_server → speaches}/main.py RENAMED
@@ -10,15 +10,15 @@ from fastapi import (
 )
 from fastapi.middleware.cors import CORSMiddleware
 
-from faster_whisper_server.dependencies import ApiKeyDependency, get_config, get_model_manager
-from faster_whisper_server.logger import setup_logger
-from faster_whisper_server.routers.misc import (
+from speaches.dependencies import ApiKeyDependency, get_config, get_model_manager
+from speaches.logger import setup_logger
+from speaches.routers.misc import (
    router as misc_router,
 )
-from faster_whisper_server.routers.models import (
+from speaches.routers.models import (
    router as models_router,
 )
-from faster_whisper_server.routers.stt import (
+from speaches.routers.stt import (
    router as stt_router,
 )
 
@@ -47,7 +47,7 @@ def create_app() -> FastAPI:
     logger.debug(f"Config: {config}")
 
     if platform.machine() == "x86_64":
-        from faster_whisper_server.routers.speech import (
+        from speaches.routers.speech import (
            router as speech_router,
        )
     else:
@@ -86,7 +86,7 @@ def create_app() -> FastAPI:
     if config.enable_ui:
         import gradio as gr
 
-        from faster_whisper_server.gradio_app import create_gradio_demo
+        from speaches.gradio_app import create_gradio_demo
 
         app = gr.mount_gradio_app(app, create_gradio_demo(config), path="/")
 
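`create_app` is an application factory, so the server can be started with uvicorn's `--factory` flag or programmatically. A sketch of the latter (host and port mirror the `UVICORN_HOST`/`UVICORN_PORT` defaults from the Dockerfile; the `__main__` entry point is an assumption, not part of the repo):

```python
# Equivalent to: uvicorn speaches.main:create_app --factory
import uvicorn

from speaches.main import create_app

if __name__ == "__main__":
    uvicorn.run(create_app(), host="0.0.0.0", port=8000)
```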
src/{faster_whisper_server → speaches}/model_manager.py RENAMED
@@ -9,14 +9,14 @@ from typing import TYPE_CHECKING
 
 from faster_whisper import WhisperModel
 
-from faster_whisper_server.hf_utils import get_piper_voice_model_file
+from speaches.hf_utils import get_piper_voice_model_file
 
 if TYPE_CHECKING:
     from collections.abc import Callable
 
     from piper.voice import PiperVoice
 
-    from faster_whisper_server.config import (
+    from speaches.config import (
        WhisperConfig,
    )
 
src/{faster_whisper_server → speaches}/routers/__init__.py RENAMED
File without changes
src/{faster_whisper_server → speaches}/routers/misc.py RENAMED
@@ -7,8 +7,8 @@ from fastapi import (
 import huggingface_hub
 from huggingface_hub.hf_api import RepositoryNotFoundError
 
-from faster_whisper_server import hf_utils
-from faster_whisper_server.dependencies import ModelManagerDependency  # noqa: TCH001
+from speaches import hf_utils
+from speaches.dependencies import ModelManagerDependency  # noqa: TC001
 
 router = APIRouter()
 
src/{faster_whisper_server → speaches}/routers/models.py RENAMED
@@ -9,11 +9,11 @@ from fastapi import (
 )
 import huggingface_hub
 
-from faster_whisper_server.api_models import (
+from speaches.api_models import (
    ListModelsResponse,
    Model,
 )
-from faster_whisper_server.hf_utils import list_whisper_models
+from speaches.hf_utils import list_whisper_models
 
 if TYPE_CHECKING:
     from huggingface_hub.hf_api import ModelInfo
src/{faster_whisper_server → speaches}/routers/speech.py RENAMED
@@ -11,8 +11,8 @@ from piper.voice import PiperVoice
 from pydantic import BaseModel, BeforeValidator, Field, ValidationError, model_validator
 import soundfile as sf
 
-from faster_whisper_server.dependencies import PiperModelManagerDependency
-from faster_whisper_server.hf_utils import (
+from speaches.dependencies import PiperModelManagerDependency
+from speaches.hf_utils import (
    PiperModel,
    list_piper_models,
    read_piper_voices_config,
src/{faster_whisper_server → speaches}/routers/stt.py RENAMED
@@ -27,7 +27,7 @@ from numpy import float32
 from numpy.typing import NDArray
 from pydantic import AfterValidator, Field
 
-from faster_whisper_server.api_models import (
+from speaches.api_models import (
    DEFAULT_TIMESTAMP_GRANULARITIES,
    TIMESTAMP_GRANULARITIES_COMBINATIONS,
    CreateTranscriptionResponseJson,
@@ -35,17 +35,17 @@ from faster_whisper_server.api_models import (
    TimestampGranularities,
    TranscriptionSegment,
 )
-from faster_whisper_server.asr import FasterWhisperASR
-from faster_whisper_server.audio import AudioStream, audio_samples_from_file
-from faster_whisper_server.config import (
+from speaches.asr import FasterWhisperASR
+from speaches.audio import AudioStream, audio_samples_from_file
+from speaches.config import (
    SAMPLES_PER_SECOND,
    Language,
    ResponseFormat,
    Task,
 )
-from faster_whisper_server.dependencies import ConfigDependency, ModelManagerDependency, get_config
-from faster_whisper_server.text_utils import segments_to_srt, segments_to_text, segments_to_vtt
-from faster_whisper_server.transcriber import audio_transcriber
+from speaches.dependencies import ConfigDependency, ModelManagerDependency, get_config
+from speaches.text_utils import segments_to_srt, segments_to_text, segments_to_vtt
+from speaches.transcriber import audio_transcriber
 
 if TYPE_CHECKING:
     from collections.abc import Generator, Iterable
@@ -77,7 +77,7 @@ def audio_file_dependency(
        ) from e
     except Exception as e:
        logger.exception(
-            "Failed to decode audio. This is likely a bug. Please create an issue at https://github.com/fedirz/faster-whisper-server/issues/new."
+            "Failed to decode audio. This is likely a bug. Please create an issue at https://github.com/speaches-ai/speaches/issues/new."
        )
        raise HTTPException(status_code=500, detail="Failed to decode audio.") from e
     else:
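The STT router exposes the response formats listed in the `speaches__config__ResponseFormat` enum from the OpenAPI dump above (`text`, `json`, `verbose_json`, `srt`, `vtt`). A sketch requesting SRT subtitles (file name and model are placeholders):

```python
# Request SRT subtitles from the transcription endpoint. response_format
# values come from the ResponseFormat enum in the OpenAPI schema.
import httpx

with open("audio.mp3", "rb") as f:
    resp = httpx.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": f},
        data={"model": "Systran/faster-distil-whisper-large-v3", "response_format": "srt"},
        timeout=None,  # long files can take a while to transcribe
    )
print(resp.text)  # SRT-formatted subtitles
```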
src/{faster_whisper_server → speaches}/text_utils.py RENAMED
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from faster_whisper_server.api_models import TranscriptionSegment, TranscriptionWord
+    from speaches.api_models import TranscriptionSegment, TranscriptionWord
 
 
 class Transcription:
@@ -38,7 +38,7 @@ class Transcription:
        self.words.extend(words)
 
     def _ensure_no_word_overlap(self, words: list[TranscriptionWord]) -> None:
-        from faster_whisper_server.dependencies import get_config  # HACK: avoid circular import
+        from speaches.dependencies import get_config  # HACK: avoid circular import
 
        config = get_config()  # HACK
        if len(self.words) > 0 and len(words) > 0:
src/{faster_whisper_server → speaches}/text_utils_test.py RENAMED
@@ -1,5 +1,5 @@
-from faster_whisper_server.api_models import TranscriptionWord
-from faster_whisper_server.text_utils import (
+from speaches.api_models import TranscriptionWord
+from speaches.text_utils import (
    canonicalize_word,
    common_prefix,
    is_eos,
src/{faster_whisper_server → speaches}/transcriber.py RENAMED
@@ -3,14 +3,14 @@ from __future__ import annotations
 import logging
 from typing import TYPE_CHECKING
 
-from faster_whisper_server.audio import Audio, AudioStream
-from faster_whisper_server.text_utils import Transcription, common_prefix, to_full_sentences, word_to_text
+from speaches.audio import Audio, AudioStream
+from speaches.text_utils import Transcription, common_prefix, to_full_sentences, word_to_text
 
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
 
-    from faster_whisper_server.api_models import TranscriptionWord
-    from faster_whisper_server.asr import FasterWhisperASR
+    from speaches.api_models import TranscriptionWord
+    from speaches.asr import FasterWhisperASR
 
 logger = logging.getLogger(__name__)
 
tests/api_timestamp_granularities_test.py CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
 from openai import AsyncOpenAI
 import pytest
 
-from faster_whisper_server.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
+from speaches.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
 
 
 @pytest.mark.asyncio
tests/conftest.py CHANGED
@@ -12,9 +12,9 @@ import pytest
 import pytest_asyncio
 from pytest_mock import MockerFixture
 
-from faster_whisper_server.config import Config, WhisperConfig
-from faster_whisper_server.dependencies import get_config
-from faster_whisper_server.main import create_app
+from speaches.config import Config, WhisperConfig
+from speaches.dependencies import get_config
+from speaches.main import create_app
 
 DISABLE_LOGGERS = ["multipart.multipart", "faster_whisper"]
 OPENAI_BASE_URL = "https://api.openai.com/v1"
@@ -54,11 +54,11 @@ async def aclient_factory(mocker: MockerFixture) -> AclientFactory:
     @asynccontextmanager
     async def inner(config: Config = DEFAULT_CONFIG) -> AsyncGenerator[AsyncClient, None]:
        # NOTE: all calls to `get_config` should be patched. One way to test that this works is to update the original `get_config` to raise an exception and see if the tests fail  # noqa: E501
-        mocker.patch("faster_whisper_server.dependencies.get_config", return_value=config)
-        mocker.patch("faster_whisper_server.main.get_config", return_value=config)
+        mocker.patch("speaches.dependencies.get_config", return_value=config)
+        mocker.patch("speaches.main.get_config", return_value=config)
        # NOTE: I couldn't get the following to work but it shouldn't matter
        # mocker.patch(
-        #     "faster_whisper_server.text_utils.Transcription._ensure_no_word_overlap.get_config", return_value=config
+        #     "speaches.text_utils.Transcription._ensure_no_word_overlap.get_config", return_value=config
        # )
 
        app = create_app()
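Patching `get_config` in two places is needed because `speaches.main` imports it with `from speaches.dependencies import get_config`, binding the function into its own namespace at import time; patching only the defining module leaves that already-bound name untouched. A self-contained sketch of the mechanism, using fabricated module names (`defs`/`consumer`) rather than anything from the repo:

```python
# Demonstrate why both patch targets are required when a name is re-exported.
import sys
import types
from unittest import mock

defs = types.ModuleType("defs")
defs.get_config = lambda: "real"
sys.modules["defs"] = defs

consumer = types.ModuleType("consumer")
consumer.get_config = defs.get_config  # what `from defs import get_config` does
sys.modules["consumer"] = consumer

with mock.patch("defs.get_config", return_value="patched"):
    print(sys.modules["defs"].get_config())      # "patched" -- looked up via defs
    print(sys.modules["consumer"].get_config())  # "real" -- consumer kept the old reference
```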
tests/model_manager_test.py CHANGED
@@ -3,7 +3,7 @@ import asyncio
 import anyio
 import pytest
 
-from faster_whisper_server.config import Config, WhisperConfig
+from speaches.config import Config, WhisperConfig
 from tests.conftest import DEFAULT_WHISPER_MODEL, AclientFactory
 
 MODEL = DEFAULT_WHISPER_MODEL  # just to make the test more readable
tests/openai_timestamp_granularities_test.py CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
 from openai import AsyncOpenAI, BadRequestError
 import pytest
 
-from faster_whisper_server.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
+from speaches.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
 
 
 @pytest.mark.asyncio
tests/speech_test.py CHANGED
@@ -9,7 +9,7 @@ platform_machine = platform.machine()
 if platform_machine != "x86_64":
     pytest.skip("Only supported on x86_64", allow_module_level=True)
 
-from faster_whisper_server.routers.speech import (  # noqa: E402
+from speaches.routers.speech import (  # noqa: E402
    DEFAULT_MODEL,
    DEFAULT_RESPONSE_FORMAT,
    DEFAULT_VOICE,
tests/sse_test.py CHANGED
@@ -9,7 +9,7 @@ import srt
 import webvtt
 import webvtt.vtt
 
-from faster_whisper_server.api_models import (
+from speaches.api_models import (
    CreateTranscriptionResponseJson,
    CreateTranscriptionResponseVerboseJson,
 )
uv.lock CHANGED
@@ -266,115 +266,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7b/03/ab118cb743dcf671da01ad0cfd7564465dda115db32976fdc95e21ce8feb/faster_whisper-1.1.0-py3-none-any.whl", hash = "sha256:0f2d025676bbff1e46c4108b6f9a82578d6e33826c174af2990e45b33fab6182", size = 1118168 },
 ]
 
-[[package]]
-name = "faster-whisper-server"
-version = "0.1.0"
-source = { editable = "." }
-dependencies = [
-    { name = "ctranslate2" },
-    { name = "fastapi" },
-    { name = "faster-whisper" },
-    { name = "huggingface-hub", extra = ["hf-transfer"] },
-    { name = "numpy" },
-    { name = "piper-phonemize", marker = "platform_machine == 'x86_64'" },
-    { name = "piper-tts", marker = "platform_machine == 'x86_64'" },
-    { name = "pydantic" },
-    { name = "pydantic-settings" },
-    { name = "python-multipart" },
-    { name = "sounddevice" },
-    { name = "soundfile" },
-    { name = "uvicorn" },
-]
-
-[package.optional-dependencies]
-client = [
-    { name = "keyboard" },
-]
-dev = [
-    { name = "anyio" },
-    { name = "basedpyright" },
-    { name = "mdx-truly-sane-lists" },
-    { name = "mkdocs-material" },
-    { name = "mkdocs-render-swagger-plugin" },
-    { name = "mkdocstrings", extra = ["python"] },
-    { name = "pre-commit" },
-    { name = "pytest" },
-    { name = "pytest-antilru" },
-    { name = "pytest-asyncio" },
-    { name = "pytest-mock" },
-    { name = "pytest-xdist" },
-    { name = "ruff" },
-    { name = "srt" },
-    { name = "webvtt-py" },
-]
-opentelemetry = [
-    { name = "opentelemetry-distro" },
-    { name = "opentelemetry-exporter-otlp" },
-    { name = "opentelemetry-instrumentation-asyncio" },
-    { name = "opentelemetry-instrumentation-fastapi" },
-    { name = "opentelemetry-instrumentation-grpc" },
-    { name = "opentelemetry-instrumentation-httpx" },
-    { name = "opentelemetry-instrumentation-logging" },
-    { name = "opentelemetry-instrumentation-requests" },
-    { name = "opentelemetry-instrumentation-threading" },
-    { name = "opentelemetry-instrumentation-urllib" },
-    { name = "opentelemetry-instrumentation-urllib3" },
-]
-ui = [
-    { name = "gradio" },
-    { name = "httpx" },
-    { name = "httpx-sse" },
-    { name = "openai" },
-]
-
-[package.metadata]
-requires-dist = [
-    { name = "anyio", marker = "extra == 'dev'", specifier = ">=4.4.0" },
-    { name = "basedpyright", marker = "extra == 'dev'", specifier = ">=1.18.0" },
-    { name = "ctranslate2", specifier = ">=4.5.0" },
-    { name = "fastapi", specifier = ">=0.115.0" },
-    { name = "faster-whisper", specifier = ">=1.1.0" },
-    { name = "gradio", marker = "extra == 'ui'", specifier = ">=5.0.2" },
-    { name = "httpx", marker = "extra == 'ui'", specifier = ">=0.27.2" },
-    { name = "httpx-sse", marker = "extra == 'ui'", specifier = ">=0.4.0" },
-    { name = "huggingface-hub", extras = ["hf-transfer"], specifier = ">=0.25.1" },
-    { name = "keyboard", marker = "extra == 'client'", specifier = ">=0.13.5" },
-    { name = "mdx-truly-sane-lists", marker = "extra == 'dev'", specifier = ">=1.3" },
-    { name = "mkdocs-material", marker = "extra == 'dev'", specifier = ">=9.5.39" },
-    { name = "mkdocs-render-swagger-plugin", marker = "extra == 'dev'", specifier = ">=0.1.2" },
-    { name = "mkdocstrings", extras = ["python"], marker = "extra == 'dev'", specifier = ">=0.26.1" },
-    { name = "numpy", specifier = ">=2.1.1" },
-    { name = "openai", marker = "extra == 'ui'", specifier = ">=1.48.0" },
-    { name = "opentelemetry-distro", marker = "extra == 'opentelemetry'", specifier = ">=0.48b0" },
-    { name = "opentelemetry-exporter-otlp", marker = "extra == 'opentelemetry'", specifier = ">=1.27.0" },
-    { name = "opentelemetry-instrumentation-asyncio", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
-    { name = "opentelemetry-instrumentation-fastapi", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
-    { name = "opentelemetry-instrumentation-grpc", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
-    { name = "opentelemetry-instrumentation-httpx", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
-    { name = "opentelemetry-instrumentation-logging", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
-    { name = "opentelemetry-instrumentation-requests", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
-    { name = "opentelemetry-instrumentation-threading", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
-    { name = "opentelemetry-instrumentation-urllib", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
-    { name = "opentelemetry-instrumentation-urllib3", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
-    { name = "piper-phonemize", marker = "platform_machine == 'x86_64'", url = "https://github.com/fedirz/piper-phonemize/raw/refs/heads/master/dist/piper_phonemize-1.2.0-cp312-cp312-manylinux_2_28_x86_64.whl" },
-    { name = "piper-tts", marker = "platform_machine == 'x86_64'", specifier = ">=1.2.0" },
-    { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.1" },
-    { name = "pydantic", specifier = ">=2.9.0" },
-    { name = "pydantic-settings", specifier = ">=2.5.2" },
-    { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.3" },
-    { name = "pytest-antilru", marker = "extra == 'dev'", specifier = ">=2.0.0" },
-    { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
-    { name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" },
-    { name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.6.1" },
-    { name = "python-multipart", specifier = ">=0.0.10" },
-    { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.7.1" },
-    { name = "sounddevice", specifier = ">=0.5.1" },
-    { name = "soundfile", specifier = ">=0.12.1" },
-    { name = "srt", marker = "extra == 'dev'", specifier = ">=3.5.3" },
-    { name = "uvicorn", specifier = ">=0.30.6" },
-    { name = "webvtt-py", marker = "extra == 'dev'", specifier = ">=0.5.1" },
-]
-
 [[package]]
 name = "ffmpy"
 version = "0.4.0"
@@ -4241,6 +4132,115 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/50/ff/26a4ee48d0b66625a4e4028a055b9f25bc9d7c7b2d17d21a45137621a50d/soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77", size = 1009109 },
 ]
 
+[[package]]
+name = "speaches"
+version = "0.1.0"
+source = { editable = "." }
+dependencies = [
+    { name = "ctranslate2" },
+    { name = "fastapi" },
+    { name = "faster-whisper" },
+    { name = "huggingface-hub", extra = ["hf-transfer"] },
+    { name = "numpy" },
+    { name = "piper-phonemize", marker = "platform_machine == 'x86_64'" },
+    { name = "piper-tts", marker = "platform_machine == 'x86_64'" },
+    { name = "pydantic" },
+    { name = "pydantic-settings" },
+    { name = "python-multipart" },
+    { name = "sounddevice" },
+    { name = "soundfile" },
+    { name = "uvicorn" },
+]
+
+[package.optional-dependencies]
+client = [
+    { name = "keyboard" },
+]
+dev = [
+    { name = "anyio" },
+    { name = "basedpyright" },
+    { name = "mdx-truly-sane-lists" },
+    { name = "mkdocs-material" },
+    { name = "mkdocs-render-swagger-plugin" },
+    { name = "mkdocstrings", extra = ["python"] },
+    { name = "pre-commit" },
+    { name = "pytest" },
+    { name = "pytest-antilru" },
+    { name = "pytest-asyncio" },
+    { name = "pytest-mock" },
+    { name = "pytest-xdist" },
+    { name = "ruff" },
+    { name = "srt" },
+    { name = "webvtt-py" },
+]
+opentelemetry = [
+    { name = "opentelemetry-distro" },
+    { name = "opentelemetry-exporter-otlp" },
+    { name = "opentelemetry-instrumentation-asyncio" },
+    { name = "opentelemetry-instrumentation-fastapi" },
+    { name = "opentelemetry-instrumentation-grpc" },
+    { name = "opentelemetry-instrumentation-httpx" },
+    { name = "opentelemetry-instrumentation-logging" },
+    { name = "opentelemetry-instrumentation-requests" },
+    { name = "opentelemetry-instrumentation-threading" },
+    { name = "opentelemetry-instrumentation-urllib" },
+    { name = "opentelemetry-instrumentation-urllib3" },
+]
+ui = [
+    { name = "gradio" },
+    { name = "httpx" },
+    { name = "httpx-sse" },
+    { name = "openai" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "anyio", marker = "extra == 'dev'", specifier = ">=4.4.0" },
+    { name = "basedpyright", marker = "extra == 'dev'", specifier = ">=1.18.0" },
+    { name = "ctranslate2", specifier = ">=4.5.0" },
+    { name = "fastapi", specifier = ">=0.115.0" },
+    { name = "faster-whisper", specifier = ">=1.1.0" },
+    { name = "gradio", marker = "extra == 'ui'", specifier = ">=5.0.2" },
+    { name = "httpx", marker = "extra == 'ui'", specifier = ">=0.27.2" },
+    { name = "httpx-sse", marker = "extra == 'ui'", specifier = ">=0.4.0" },
+    { name = "huggingface-hub", extras = ["hf-transfer"], specifier = ">=0.25.1" },
+    { name = "keyboard", marker = "extra == 'client'", specifier = ">=0.13.5" },
+    { name = "mdx-truly-sane-lists", marker = "extra == 'dev'", specifier = ">=1.3" },
+    { name = "mkdocs-material", marker = "extra == 'dev'", specifier = ">=9.5.39" },
+    { name = "mkdocs-render-swagger-plugin", marker = "extra == 'dev'", specifier = ">=0.1.2" },
+    { name = "mkdocstrings", extras = ["python"], marker = "extra == 'dev'", specifier = ">=0.26.1" },
+    { name = "numpy", specifier = ">=2.1.1" },
+    { name = "openai", marker = "extra == 'ui'", specifier = ">=1.48.0" },
+    { name = "opentelemetry-distro", marker = "extra == 'opentelemetry'", specifier = ">=0.48b0" },
+    { name = "opentelemetry-exporter-otlp", marker = "extra == 'opentelemetry'", specifier = ">=1.27.0" },
+    { name = "opentelemetry-instrumentation-asyncio", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
+    { name = "opentelemetry-instrumentation-fastapi", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
+    { name = "opentelemetry-instrumentation-grpc", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
+    { name = "opentelemetry-instrumentation-httpx", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
+    { name = "opentelemetry-instrumentation-logging", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
+    { name = "opentelemetry-instrumentation-requests", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
+    { name = "opentelemetry-instrumentation-threading", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
+    { name = "opentelemetry-instrumentation-urllib", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
+    { name = "opentelemetry-instrumentation-urllib3", marker = "extra == 'opentelemetry'", specifier = "==0.48b0" },
+    { name = "piper-phonemize", marker = "platform_machine == 'x86_64'", url = "https://github.com/fedirz/piper-phonemize/raw/refs/heads/master/dist/piper_phonemize-1.2.0-cp312-cp312-manylinux_2_28_x86_64.whl" },
+    { name = "piper-tts", marker = "platform_machine == 'x86_64'", specifier = ">=1.2.0" },
+    { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.1" },
+    { name = "pydantic", specifier = ">=2.9.0" },
+    { name = "pydantic-settings", specifier = ">=2.5.2" },
+    { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.3" },
+    { name = "pytest-antilru", marker = "extra == 'dev'", specifier = ">=2.0.0" },
+    { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
+    { name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" },
+    { name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.6.1" },
+    { name = "python-multipart", specifier = ">=0.0.10" },
+    { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.7.1" },
+    { name = "sounddevice", specifier = ">=0.5.1" },
+    { name = "soundfile", specifier = ">=0.12.1" },
+    { name = "srt", marker = "extra == 'dev'", specifier = ">=3.5.3" },
+    { name = "uvicorn", specifier = ">=0.30.6" },
+    { name = "webvtt-py", marker = "extra == 'dev'", specifier = ">=0.5.1" },
+]
+
 [[package]]
 name = "srt"
 version = "3.5.3"