adding the code files
- Dockerfile +50 -0
- config.py +13 -0
- controllers/transcription_controller.py +15 -0
- main.py +13 -0
- models/schema.py +11 -0
- requirements.txt +51 -0
- routers/transcription.py +15 -0
- services/whisper_service.py +46 -0
Dockerfile
ADDED
@@ -0,0 +1,50 @@
# Use Python base image
FROM python:3.12.2

# Prevent timezone prompt during package installation
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=UTC

# Install system dependencies
RUN apt-get update && apt-get install -y \
    ffmpeg \
    git \
    curl \
    build-essential \
    cargo \
    pkg-config \
    libssl-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Rust for setuptools-rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Set working directory
WORKDIR /app

# Copy application code
COPY . /app

# Install Python dependencies
RUN pip install --no-cache-dir -U pip setuptools wheel
RUN pip install --no-cache-dir setuptools-rust
RUN pip install --no-cache-dir torch==2.0.1+cu118 torchaudio==2.0.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html
RUN pip install --no-cache-dir -r requirements.txt

# Create and switch to a non-root user
RUN useradd -m -u 1000 user
USER user

# Ensure ownership of /app directory
COPY --chown=user . /app/

# Environment variables for PyTorch and Whisper
ENV TORCH_DEVICE="cuda"
ENV FORCE_FP32="false"

# Expose port
EXPOSE 7860

# Start the FastAPI application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "5"]
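The image sets TORCH_DEVICE="cuda", but the environment variable alone does not guarantee a GPU; the service below falls back to CPU at runtime when CUDA is missing. A minimal sketch for confirming what the container actually sees, assuming the torch install from this image (the file name check_cuda.py is hypothetical and not part of this commit):

# check_cuda.py -- hypothetical helper, not part of this commit.
# Prints whether torch can reach a CUDA device inside the running container.
import torch

if torch.cuda.is_available():
    print(f"CUDA available: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA not available; the service will fall back to CPU.")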
config.py
ADDED
@@ -0,0 +1,13 @@
from pydantic_settings import BaseSettings
from dotenv import load_dotenv
load_dotenv()
import os

class Settings(BaseSettings):
    WHISPER_MODEL: str = "base"  # Options: tiny, base, small, medium, large, large-v2, turbo
    TEMP_UPLOAD_DIR: str = "temp_uploads"
    TORCH_DEVICE: str = os.getenv("TORCH_DEVICE", "cpu")
    FORCE_FP32: bool = os.getenv("FORCE_FP32", "false").lower() == "true"


settings = Settings()
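The settings resolve from the process environment (plus any .env file picked up by load_dotenv) when config is imported. A minimal sketch of overriding them, assuming the variables are set before config is first imported (the file name settings_demo.py is hypothetical and not part of this commit):

# settings_demo.py -- hypothetical usage, not part of this commit.
import os

# Must be set before `config` is imported, because the defaults are read at class definition time.
os.environ["TORCH_DEVICE"] = "cuda"
os.environ["FORCE_FP32"] = "true"

from config import settings

print(settings.WHISPER_MODEL)  # "base" unless WHISPER_MODEL is set in the environment
print(settings.TORCH_DEVICE)   # "cuda"
print(settings.FORCE_FP32)     # True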
controllers/transcription_controller.py
ADDED
@@ -0,0 +1,15 @@
from fastapi import HTTPException
from services.whisper_service import WhisperService
from models.schema import TranscriptionResponse


class TranscriptionController:
    def __init__(self):
        self.whisper_service = WhisperService()

    async def transcribe_audio(self, audio_file: bytes, output_language: str = None) -> TranscriptionResponse:
        try:
            result = await self.whisper_service.transcribe(audio_file, output_language)
            return TranscriptionResponse(**result)
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
main.py
ADDED
@@ -0,0 +1,13 @@
from fastapi import FastAPI
from routers import transcription

app = FastAPI(title="Whisper Transcription API", docs_url="/")

app.include_router(transcription.router, prefix="/api/v1", tags=["transcription"])

@app.get("/")
async def root():
    return {
        "message": "Welcome to Whisper Transcription API",
        "docs_url": "/docs"
    }
models/schema.py
ADDED
@@ -0,0 +1,11 @@
from pydantic import BaseModel
from typing import Optional


class TranscriptionRequest(BaseModel):
    output_language: str = "en"

class TranscriptionResponse(BaseModel):
    text: str
    language: Optional[str]
    segments: Optional[list]
requirements.txt
ADDED
@@ -0,0 +1,51 @@
accelerate==1.4.0
audioread==3.0.1
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
decorator==5.2.1
filelock==3.13.1
fsspec==2024.6.1
huggingface-hub==0.29.1
idna==3.10
Jinja2==3.1.4
joblib==1.4.2
lazy_loader==0.4
librosa==0.10.2.post1
llvmlite==0.44.0
MarkupSafe==2.1.5
mpmath==1.3.0
msgpack==1.1.0
networkx==3.3
numba==0.61.0
numpy==2.1.2
packaging==24.2
pillow==11.0.0
platformdirs==4.3.6
pooch==1.8.2
psutil==7.0.0
pycparser==2.22
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
safetensors==0.5.2
scikit-learn==1.6.1
scipy==1.15.2
setuptools==70.2.0
soundfile==0.13.1
soxr==0.5.0.post1
sympy==1.13.1
threadpoolctl==3.5.0
tokenizers==0.21.0
torch==2.6.0
torchaudio==2.6.0
torchvision==0.21.0
tqdm==4.67.1
transformers==4.49.0
typing_extensions==4.12.2
urllib3==2.3.0
pydantic~=2.10.6
fastapi~=0.115.8
pydantic-settings~=2.8.0
openai-whisper~=20240930
python-dotenv~=1.0.1
routers/transcription.py
ADDED
@@ -0,0 +1,15 @@
from fastapi import APIRouter, UploadFile, File, Form
from typing import Optional
from controllers.transcription_controller import TranscriptionController
from models.schema import TranscriptionResponse

router = APIRouter()
controller = TranscriptionController()

@router.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe_audio(
    audio_file: UploadFile = File(...),
    output_language: Optional[str] = Form(None)
):
    contents = await audio_file.read()
    return await controller.transcribe_audio(contents, output_language)
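With the app running (for example via the uvicorn command in the Dockerfile), the endpoint can be exercised with any HTTP client. A minimal sketch using requests, assuming a local sample.mp3 and the default port 7860 (the file name client_demo.py is hypothetical and not part of this commit):

# client_demo.py -- hypothetical client, not part of this commit.
import requests

url = "http://localhost:7860/api/v1/transcribe"

with open("sample.mp3", "rb") as f:
    response = requests.post(
        url,
        files={"audio_file": ("sample.mp3", f, "audio/mpeg")},
        data={"output_language": "en"},  # optional; omit to let Whisper detect the language
    )

response.raise_for_status()
payload = response.json()
print(payload["language"])
print(payload["text"])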
services/whisper_service.py
ADDED
@@ -0,0 +1,46 @@
import whisper
import tempfile
import os
import torch
from config import settings


class WhisperService:
    def __init__(self):
        if settings.TORCH_DEVICE == "cuda" and not torch.cuda.is_available():
            print("WARNING: CUDA requested but not available. Falling back to CPU.")
            self.device = "cpu"
        else:
            self.device = settings.TORCH_DEVICE

        self.model = whisper.load_model(settings.WHISPER_MODEL)
        if settings.FORCE_FP32 or self.device == "cpu":
            self.model = self.model.float()

    async def transcribe(self, audio_file: bytes, output_language: str = None) -> dict:
        try:
            # Create a temporary file to store the uploaded audio
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
                temp_audio.write(audio_file)
                temp_audio_path = temp_audio.name

            try:
                # Transcribe the audio
                transcription_options = {"fp16": not settings.FORCE_FP32 and self.device == "cuda"}
                if output_language:
                    transcription_options["language"] = output_language

                result = self.model.transcribe(temp_audio_path, **transcription_options)

                return {
                    "text": result["text"],
                    "language": result.get("language"),
                    "segments": result.get("segments")
                }
            finally:
                # Clean up the temporary file
                if os.path.exists(temp_audio_path):
                    os.remove(temp_audio_path)

        except Exception as e:
            raise Exception(f"Transcription failed: {str(e)}")
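The service can also be driven outside FastAPI, since transcribe only needs raw bytes. A minimal sketch, assuming a local sample.mp3 and that the Whisper model download succeeds on first run (the file name service_demo.py is hypothetical and not part of this commit):

# service_demo.py -- hypothetical usage, not part of this commit.
import asyncio
from services.whisper_service import WhisperService


async def main():
    service = WhisperService()  # loads the model configured in config.Settings
    with open("sample.mp3", "rb") as f:
        audio_bytes = f.read()
    result = await service.transcribe(audio_bytes, output_language="en")
    print(result["text"])


if __name__ == "__main__":
    asyncio.run(main())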