feat: fix with original whisper
- app.py +13 -5
- flagged/log.csv +2 -0
- requirements.txt +7 -2
- src/__pycache__/infer.cpython-310.pyc +0 -0
- src/__pycache__/utils.cpython-310.pyc +0 -0
- src/infer.py +8 -17
- src/utils.py +7 -14
app.py
CHANGED
@@ -9,19 +9,27 @@ audio_examples = [
     [None, "assets/audio/female-english.wav", None],
 ]
 
+TITLE = "OpenAI Whisper"
+DESCRIPTION = utils.parsing_text("assets/descriptions.md")
+ARTICLE = utils.parsing_text("assets/articles.md")
+
 demo = gr.Interface(
     fn=infer.predict,
     inputs=[
+        gr.Dropdown(
+            label="Model",
+            choices=["tiny","small","base","medium","large","large-v2"],
+            value="base"),
         gr.Radio(label="Language",
                  choices=["indonesian","english"],
                  value="indonesian"),
-        gr.Audio(label="Speak", source="microphone", type="
-        gr.Audio(label="Upload Audio", source="upload", type="
+        gr.Audio(label="Speak", source="microphone", type="filepath"),
+        gr.Audio(label="Upload Audio", source="upload", type="filepath"),
     ],
     outputs=[gr.TextArea(label="Output Text"),],
-    title=
-    description=
-    article=
+    title=TITLE,
+    description=DESCRIPTION,
+    article=ARTICLE,
     # examples=audio_examples,
 )
 
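With type="filepath", both Gradio Audio components now hand infer.predict the path of a temporary audio file instead of a (sample_rate, numpy_array) tuple, and the order of the inputs list has to match the new predict(model_name, language, mic_audio, audio) signature. A minimal sketch of the resulting call, assuming a local sample.wav exists:

from src import infer

# Gradio passes the Dropdown value, the Radio value, then one file path per
# Audio component (None when that component was left empty).
text = infer.predict("base", "indonesian", None, "sample.wav")
print(text)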
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
+audio,state,output 0,state,flag,username,timestamp
+,,,,,,2023-08-11 19:42:07.779875
requirements.txt
CHANGED
@@ -1,3 +1,8 @@
-
+git+https://github.com/huggingface/transformers
+git+https://github.com/openai/whisper.git
 transformers
-
+ffmpeg-python==0.2.0
+gradio==3.38.0
+torchaudio
+altair
+json5
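Note: whisper.load_audio shells out to the ffmpeg command-line tool, so ffmpeg-python on its own is not enough; the ffmpeg binary must also be installed. On Spaces that is usually declared in a packages.txt (an assumption here, not part of this commit):

ffmpeg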
src/__pycache__/infer.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/infer.cpython-310.pyc and b/src/__pycache__/infer.cpython-310.pyc differ

src/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/utils.cpython-310.pyc and b/src/__pycache__/utils.cpython-310.pyc differ
src/infer.py
CHANGED
@@ -2,27 +2,18 @@
 from typing import *
 from src import utils
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import whisper
 
-model_name
-processor: Any = WhisperProcessor.from_pretrained(model_name)
-model: Any = WhisperForConditionalGeneration.from_pretrained(model_name)
-
-sample_rate: int = 16000
-float_factor: float = 32678.0
-
-
-def predict(language, mic_audio=None, audio=None):
+def predict(model_name, language, mic_audio=None, audio=None):
     if mic_audio is not None:
-
+        voice = mic_audio
     elif audio is not None:
-
+        voice = audio
     else:
         return "(please provide audio)"
 
-
+    voice = utils.preprocess_audio(voice)
 
-
-
-
-        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-    return transcription[0]
+    model = whisper.load_model(model_name)
+    result = model.transcribe(voice, language=language)
+    return result["text"]
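As written, predict calls whisper.load_model on every request, so the selected checkpoint is re-loaded (and, on first use, downloaded) for each transcription. One possible refinement, sketched here and not part of this commit, is to memoize the loader and have predict call it instead of whisper.load_model directly:

import functools

import whisper

@functools.lru_cache(maxsize=2)
def get_model(model_name: str):
    # load each Whisper checkpoint at most once per process
    return whisper.load_model(model_name)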
src/utils.py
CHANGED
@@ -2,25 +2,18 @@
 import librosa
 import torch
 from pathlib import Path
+import whisper
 
 sample_rate: int = 16000
 float_factor: float = 32678.0
 
-def preprocess_audio(sampling_rate, waveform):
-    waveform: float = waveform / float_factor
-
-    if len(waveform.shape) > 1:
-        waveform = librosa.to_mono(waveform.T)
-
-    if sampling_rate != sample_rate:
-        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=sample_rate)
-
-    # limit to 30 seconds
-    waveform: float = waveform[:sample_rate * 30]
-
-    waveform: float = torch.tensor(waveform)
-    return waveform
 
+def preprocess_audio(filepath: str):
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(filepath)
+    audio = whisper.pad_or_trim(audio)
+
+    return audio
 
 def parsing_text(filepath: str):
     path = Path(filepath)
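whisper.load_audio decodes the file with ffmpeg into 16 kHz mono float32 samples, and whisper.pad_or_trim pads or cuts the array to exactly 30 seconds, which replaces the manual librosa/torch pipeline (the librosa and torch imports and the sample_rate/float_factor constants are now unused). A quick check of the new helper, assuming a local sample.wav:

from src import utils

audio = utils.preprocess_audio("sample.wav")
print(audio.shape, audio.dtype)  # (480000,) float32, i.e. 30 s at 16 kHz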