Omkar008 committed on
Commit
9f559c6
·
verified ·
1 Parent(s): f18ccfe

adding the code files

Browse files
Dockerfile ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use Python base image
FROM python:3.12.2

# Prevent timezone prompt during package installation
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=UTC

# Install system dependencies
RUN apt-get update && apt-get install -y \
    ffmpeg \
    git \
    curl \
    build-essential \
    cargo \
    pkg-config \
    libssl-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Rust for setuptools-rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Set working directory
WORKDIR /app

# Copy only the dependency manifest first so the expensive pip layers below
# stay cached when application code changes.
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -U pip setuptools wheel
RUN pip install --no-cache-dir setuptools-rust
# Install the CUDA 11.8 builds of the exact versions pinned in requirements.txt.
# The previous pin (torch 2.0.1+cu118) predates Python 3.12 wheels and
# contradicted the torch==2.6.0 pin that requirements.txt installs right after.
RUN pip install --no-cache-dir torch==2.6.0 torchaudio==2.6.0 torchvision==0.21.0 --index-url https://download.pytorch.org/whl/cu118
RUN pip install --no-cache-dir -r requirements.txt

# Create and switch to a non-root user
RUN useradd -m -u 1000 user
USER user

# Copy application code, owned by the non-root user
COPY --chown=user . /app/

# Environment variables for PyTorch and Whisper
ENV TORCH_DEVICE="cuda"
ENV FORCE_FP32="false"

# Expose port
EXPOSE 7860

# Start the FastAPI application
# NOTE(review): each uvicorn worker is a separate process that imports the app,
# so the Whisper model is presumably loaded once per worker - confirm that
# 5 copies fit in the target GPU/host memory.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "5"]
config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Runtime configuration for the Whisper transcription service."""
import os

from dotenv import load_dotenv
from pydantic_settings import BaseSettings

# Load a local .env file (if present) into os.environ before the class body
# below reads its environment-derived defaults.
load_dotenv()


class Settings(BaseSettings):
    """Typed settings object; a single shared instance is created below."""

    # Whisper checkpoint name.
    # Options: tiny, base, small, medium, large, large-v2, turbo
    WHISPER_MODEL: str = "base"
    # NOTE(review): not referenced by the service code visible in this commit.
    TEMP_UPLOAD_DIR: str = "temp_uploads"
    # Device requested for inference; defaults to "cpu" when the variable is unset.
    TORCH_DEVICE: str = os.environ.get("TORCH_DEVICE", "cpu")
    # True only when FORCE_FP32 is set to "true" (case-insensitive).
    FORCE_FP32: bool = os.environ.get("FORCE_FP32", "false").lower() == "true"


settings = Settings()
controllers/transcription_controller.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import Optional

from fastapi import HTTPException
from services.whisper_service import WhisperService
from models.schema import TranscriptionResponse


class TranscriptionController:
    """Bridge between the HTTP layer and ``WhisperService``."""

    def __init__(self):
        # Constructing the service loads the Whisper model.
        self.whisper_service = WhisperService()

    async def transcribe_audio(
        self, audio_file: bytes, output_language: Optional[str] = None
    ) -> TranscriptionResponse:
        """Transcribe raw audio bytes and wrap the result in the response model.

        Args:
            audio_file: Raw bytes of the uploaded audio.
            output_language: Optional language hint forwarded to the service.

        Returns:
            A ``TranscriptionResponse`` built from the service's result dict.

        Raises:
            HTTPException: 500 with the error text for any unexpected failure;
                an ``HTTPException`` raised further down is passed through
                unchanged.
        """
        try:
            result = await self.whisper_service.transcribe(audio_file, output_language)
            return TranscriptionResponse(**result)
        except HTTPException:
            # Keep deliberate HTTP errors (status code + detail) intact instead
            # of rewrapping them as a generic 500.
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e
main.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import FastAPI
from routers import transcription

# Swagger UI is served at the site root. FastAPI registers the docs route while
# the app is constructed, so any handler added later on the same path ("/")
# would never be reached.
app = FastAPI(title="Whisper Transcription API", docs_url="/")

app.include_router(transcription.router, prefix="/api/v1", tags=["transcription"])


# Previously declared on "/", where it was shadowed by the docs route above and
# pointed clients at a "/docs" URL that this app does not serve.
@app.get("/info")
async def root():
    """Return a short welcome payload with the location of the API docs."""
    return {
        "message": "Welcome to Whisper Transcription API",
        "docs_url": "/",
    }
models/schema.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
from pydantic import BaseModel
from typing import Optional


class TranscriptionRequest(BaseModel):
    """Request options for a transcription.

    NOTE(review): not referenced by the router/controller code in this commit.
    """

    output_language: str = "en"


class TranscriptionResponse(BaseModel):
    """Payload returned by the transcription endpoint."""

    # Full transcribed text.
    text: str
    # In pydantic v2 an Optional annotation alone still makes the field
    # required; the explicit None defaults make these two truly optional.
    language: Optional[str] = None
    segments: Optional[list] = None
requirements.txt ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerate==1.4.0
audioread==3.0.1
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
decorator==5.2.1
filelock==3.13.1
fsspec==2024.6.1
huggingface-hub==0.29.1
idna==3.10
Jinja2==3.1.4
joblib==1.4.2
lazy_loader==0.4
librosa==0.10.2.post1
llvmlite==0.44.0
MarkupSafe==2.1.5
mpmath==1.3.0
msgpack==1.1.0
networkx==3.3
numba==0.61.0
numpy==2.1.2
packaging==24.2
pillow==11.0.0
platformdirs==4.3.6
pooch==1.8.2
psutil==7.0.0
pycparser==2.22
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
safetensors==0.5.2
scikit-learn==1.6.1
scipy==1.15.2
setuptools==70.2.0
soundfile==0.13.1
soxr==0.5.0.post1
sympy==1.13.1
threadpoolctl==3.5.0
tokenizers==0.21.0
torch==2.6.0
torchaudio==2.6.0
torchvision==0.21.0
tqdm==4.67.1
transformers==4.49.0
typing_extensions==4.12.2
urllib3==2.3.0
pydantic~=2.10.6
fastapi~=0.115.8
pydantic-settings~=2.8.0
openai-whisper~=20240930
python-dotenv~=1.0.1
# ASGI server launched by the container CMD (`uvicorn main:app ...`).
uvicorn~=0.34.0
# Required by FastAPI to parse multipart uploads (UploadFile / Form fields).
python-multipart~=0.0.20
routers/transcription.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
from typing import Optional
from controllers.transcription_controller import TranscriptionController
from models.schema import TranscriptionResponse

router = APIRouter()
# Shared controller instance, created once when this module is imported.
controller = TranscriptionController()


@router.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe_audio(
    audio_file: UploadFile = File(...),
    output_language: Optional[str] = Form(None),
):
    """Transcribe an uploaded audio file.

    Args:
        audio_file: Multipart file upload containing the audio.
        output_language: Optional form field forwarded to the controller as
            the language hint.

    Returns:
        The controller's ``TranscriptionResponse``.

    Raises:
        HTTPException: 400 when the uploaded file has no content.
    """
    contents = await audio_file.read()
    if not contents:
        # An empty payload is a client error; reject it here instead of letting
        # it surface as a 500 from the transcription backend.
        raise HTTPException(status_code=400, detail="Uploaded audio file is empty")
    return await controller.transcribe_audio(contents, output_language)
services/whisper_service.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import contextlib
import whisper
import tempfile
import os
import torch
from typing import Optional
from config import settings


class WhisperService:
    """Loads a Whisper model once and transcribes uploaded audio with it."""

    def __init__(self):
        # Honour the configured device, but fall back to CPU when CUDA was
        # requested and is not available on this host.
        if settings.TORCH_DEVICE == "cuda" and not torch.cuda.is_available():
            print("WARNING: CUDA requested but not available. Falling back to CPU.")
            self.device = "cpu"
        else:
            self.device = settings.TORCH_DEVICE

        # Pass the resolved device explicitly; otherwise load_model picks its
        # own default and the configuration/fallback above has no effect.
        self.model = whisper.load_model(settings.WHISPER_MODEL, device=self.device)
        if settings.FORCE_FP32 or self.device == "cpu":
            self.model = self.model.float()

    async def transcribe(self, audio_file: bytes, output_language: Optional[str] = None) -> dict:
        """Transcribe raw audio bytes.

        The bytes are written to a temporary file whose path is handed to the
        model; the file is removed afterwards whether or not transcription
        succeeds.

        Args:
            audio_file: Raw bytes of the audio to transcribe.
            output_language: Optional value passed to the model as its
                ``language`` option; omitted when falsy.

        Returns:
            A dict with the keys ``text``, ``language`` and ``segments``.

        Raises:
            RuntimeError: Any failure, re-raised as
                ``"Transcription failed: <reason>"`` with the original
                exception chained as the cause.
        """
        temp_audio_path = None
        try:
            # Create a temporary file to store the uploaded audio. The path is
            # recorded before writing so a failed write is still cleaned up.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
                temp_audio_path = temp_audio.name
                temp_audio.write(audio_file)

            # Half precision only on CUDA and only when FP32 is not forced.
            transcription_options = {"fp16": not settings.FORCE_FP32 and self.device == "cuda"}
            if output_language:
                transcription_options["language"] = output_language

            # NOTE(review): this is a synchronous call inside a coroutine, so
            # it occupies the event loop for the duration of the transcription.
            result = self.model.transcribe(temp_audio_path, **transcription_options)

            return {
                "text": result["text"],
                "language": result.get("language"),
                "segments": result.get("segments"),
            }
        except Exception as e:
            raise RuntimeError(f"Transcription failed: {str(e)}") from e
        finally:
            # Clean up the temporary file
            if temp_audio_path is not None:
                with contextlib.suppress(FileNotFoundError):
                    os.remove(temp_audio_path)