File size: 1,408 Bytes
4c04be1
4f4670a
4c04be1
 
 
 
2f2c6bc
 
 
ce691da
4c04be1
 
cb5d3fd
 
 
 
 
 
 
 
2f2c6bc
 
 
78e2ff4
4c04be1
 
 
2f2c6bc
4c04be1
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from typing import Dict
from faster_whisper import WhisperModel
import io

class EndpointHandler:
    """Inference endpoint handler that transcribes audio with faster-whisper.

    Loads a Whisper model once at construction time and exposes a callable
    interface that accepts a request payload containing raw audio bytes.
    """

    def __init__(self, model_dir=None):
        """Initialize the Whisper model.

        Args:
            model_dir: Optional model directory or size name. When None,
                the "medium" checkpoint is used.
        """
        model_size = "medium" if model_dir is None else model_dir
        # Prefer GPU inference with float16 for speed, but fall back to
        # CPU/int8 so the handler still starts on hosts without CUDA
        # instead of crashing during construction.
        try:
            self.model = WhisperModel(model_size, device="cuda", compute_type="float16")
        except (RuntimeError, ValueError):
            # int8 keeps CPU inference reasonably fast and low-memory.
            self.model = WhisperModel(model_size, device="cpu", compute_type="int8")

    def __call__(self, data: Dict) -> Dict[str, str]:
        """Transcribe the audio contained in the request payload.

        Args:
            data: Request dict; must contain an "inputs" key holding the
                raw audio file bytes.

        Returns:
            Dict with "text" (transcription), "language" (detected language
            code), and "language_probability" (float confidence — NOTE: the
            declared Dict[str, str] annotation is kept for interface
            compatibility but the probability value is a float).

        Raises:
            ValueError: If the payload lacks the "inputs" key.
        """
        # Validate the payload explicitly rather than surfacing a bare
        # KeyError from a malformed request.
        if "inputs" not in data:
            raise ValueError("Request payload must contain an 'inputs' key with audio bytes.")
        audio_bytes = data["inputs"]

        # Wrap the raw bytes in a file-like object as expected by transcribe().
        audio_file = io.BytesIO(audio_bytes)

        # transcribe() returns a lazy segment generator plus detection info.
        segments, info = self.model.transcribe(audio_file)

        # Join per-segment text, stripping each segment's surrounding whitespace.
        text = " ".join(segment.text.strip() for segment in segments)

        return {
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
        }