# NOTE: the lines below were page-scrape residue (HuggingFace Spaces header,
# commit hashes, and a line-number gutter) that made this file unparseable;
# they carried no program content and have been reduced to this comment.
import os
import torch as th
import whisper
from whisper.audio import SAMPLE_RATE
from tenacity import retry, wait_random
import openai
import requests
# os.environ['OPENAI_API_KEY'] = "sk-<API KEY>"
class WHISPERModel:
    """Speech-to-text wrapper around OpenAI Whisper.

    Transcribes local files or HTTP(S) URLs, auto-detects the spoken
    language, and translates non-English speech to English — either via
    Whisper's built-in ``task='translate'`` or, when ``openai_flag`` is
    set, by post-processing the text through the OpenAI completion API.
    """

    def __init__(self, model_name='small', device='cuda', openai_flag=False):
        """Load a Whisper checkpoint.

        Args:
            model_name: Whisper model size ('tiny', 'small', 'medium', ...).
            device: torch device string the model is loaded onto.
            openai_flag: if True, additionally run translate_text() on
                non-English transcripts.
        """
        self.device = device
        self.openai_flag = openai_flag
        self.model = whisper.load_model(model_name, device=self.device)

    def get_info(self, audio_data, conv_duration=30):
        """Detect the spoken language from the first ``conv_duration`` seconds.

        Only a short clip is transcribed to keep detection cheap.
        Returns the ISO language code reported by Whisper (e.g. 'en').
        """
        clip_audio = whisper.pad_or_trim(audio_data, length=SAMPLE_RATE * conv_duration)
        result = self.model.transcribe(clip_audio)
        return result['language']

    def read_audio(self, audio_path, duration=None):
        """Load audio from a path or URL (ffmpeg-backed).

        Args:
            audio_path: local path or URL readable by whisper.load_audio.
            duration: optional cap in MINUTES; audio is padded/trimmed to
                ``duration * 60`` seconds when given.

        Raises:
            Whatever whisper.load_audio raises on unreadable input.
        """
        audio = whisper.load_audio(audio_path)
        if duration:
            # duration is in minutes; pad_or_trim expects a sample count.
            audio = whisper.pad_or_trim(audio, length=SAMPLE_RATE * duration * 60)
        return audio

    def speech_to_text(self, audio_path, duration=None):
        """Transcribe (and translate to English if needed) an audio source.

        Args:
            audio_path: local file path or http(s) URL.
            duration: optional cap in minutes, forwarded to read_audio().

        Returns:
            dict with keys 'text' (English transcript), 'duration'
            (seconds of audio processed) and 'language' (detected code).

        Raises:
            ConnectionError: when a URL source is not reachable.
        """
        text_data = dict()
        if audio_path.startswith('http'):
            # Reachability probe only — stream=True avoids downloading the
            # whole body here; ffmpeg fetches the URL itself in read_audio.
            r = requests.get(audio_path, stream=True)
            if r.status_code != 200:
                # BUGFIX: original did `raise("...")`, which raises a
                # TypeError (strings are not exceptions).
                raise ConnectionError("Unable to reach for URL {}".format(audio_path))
        # BUGFIX: forward the caller's duration cap (it was dropped before).
        audio = self.read_audio(audio_path, duration=duration)
        conv_language = self.get_info(audio)
        if conv_language != 'en':
            res = self.model.transcribe(audio, task='translate')
            if self.openai_flag:
                res['text'] = self.translate_text(res['text'], orginal_text=conv_language, convert_to='English')
        else:
            res = self.model.transcribe(audio)
        audio_duration = audio.shape[0] / SAMPLE_RATE
        text_data['text'] = res['text']
        text_data['duration'] = audio_duration
        text_data['language'] = conv_language
        return text_data

    @retry(wait=wait_random(min=5, max=10))
    def translate_text(self, text, orginal_text='ar', convert_to='english'):
        """Translate ``text`` via the OpenAI completion API, with retries.

        Args:
            text: source text to translate.
            orginal_text: source language name/code (kept misspelled for
                backward compatibility with existing keyword callers).
            convert_to: target language name.
        """
        # BUGFIX: the final segment was a plain string, so the literal
        # '{convert_to}' leaked into the prompt; it is now interpolated.
        prompt = (
            f'Translate the following {orginal_text} text to {convert_to}:\n\n'
            f'{orginal_text}: {text}\n{convert_to}:'
        )
        response = openai.Completion.create(
            engine='text-davinci-003',
            prompt=prompt,
            max_tokens=100,
            n=1,
            stop=None,
            temperature=0.7
        )
        # Extract the translated text from the first completion choice.
        translation = response.choices[0].text.strip()
        return translation
if __name__ == '__main__':
    # Demo: transcribe a remote recording end-to-end.
    url = "https://prypto-api.aswat.co/surveillance/recordings/5f53c28b-3504-4b8b-9db5-0c8b69a96233.mp3"
    # BUGFIX: the default device is 'cuda', which crashes on CPU-only
    # machines — pick the device based on availability instead.
    device = 'cuda' if th.cuda.is_available() else 'cpu'
    audio2text = WHISPERModel(device=device)
    text = audio2text.speech_to_text(url)
    # The result was previously computed and silently discarded.
    print(text)
# (stray scrape artifact removed)