Spaces:
Build error
Build error
File size: 4,859 Bytes
e053abc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import shutil
import gradio as gr
import random
import os
import numpy as np
from pydub import AudioSegment
from datasets import load_dataset
from scipy.io.wavfile import write
from modules.diarization.nemo_diarization import diarization
from modules.nlp.nemo_ner import detect_ner
from modules.nlp.nemo_punct_cap import punctuation_capitalization
FOLDER_WAV_DB = "data/database/"
FOLDER_USER_DATA = "data/user_data/"
FOLDER_USER_DATA_WAV = "data/user_data_wav/"
FOLDER_MANIFESTS = "info/configs/manifests/"
SAMPLE_RATE = 16000
dataset = load_dataset("pustozerov/crema_d_diarization", split='validation')
os.makedirs(FOLDER_WAV_DB, exist_ok=True)
os.makedirs(FOLDER_MANIFESTS, exist_ok=True)
def process_audio(uploaded_file=None):
if uploaded_file:
secondary_audio = False
folder_wav = FOLDER_USER_DATA_WAV
os.makedirs(folder_wav, exist_ok=True)
print(uploaded_file)
shutil.move(uploaded_file, os.path.join(FOLDER_USER_DATA, os.path.basename(uploaded_file)))
uploaded_file = os.path.join(FOLDER_USER_DATA, os.path.basename(uploaded_file))
print(uploaded_file)
if ".mp3" in uploaded_file:
sound = AudioSegment.from_mp3(uploaded_file)
elif ".ogg" in uploaded_file:
sound = AudioSegment.from_ogg(uploaded_file)
else:
sound = AudioSegment.from_wav(uploaded_file)
save_path = folder_wav + os.path.basename(uploaded_file)
os.makedirs(folder_wav, exist_ok=True)
sound.export(save_path, format="wav", parameters=["-ac", "1"])
file_name = os.path.basename(save_path).split(".")[0]
result = diarization(save_path)
else:
secondary_audio = True
folder_wav = FOLDER_WAV_DB
os.makedirs(folder_wav, exist_ok=True)
shuffled_dataset = dataset.shuffle(seed=random.randint(0, 100))
file_name = str(shuffled_dataset["file"][0]).split(".")[0]
audio_bytes = np.array(shuffled_dataset["data"][0])
audio_bytes_scaled = np.int16(audio_bytes / np.max(np.abs(audio_bytes)) * 32767)
write(os.path.join(folder_wav, file_name + '.wav'), rate=SAMPLE_RATE, data=audio_bytes_scaled)
result = diarization(os.path.join(folder_wav, file_name + '.wav'))
transcript_path = "info/transcripts/pred_rttms/" + file_name + ".txt"
with open(transcript_path) as f:
transcript = f.read()
sentences = result[file_name]["sentences"]
all_strings = ""
for sentence in sentences:
all_strings = all_strings + sentence["sentence"] + "\n"
all_strings = punctuation_capitalization([all_strings])[0]
tagged_string, tags_summary = detect_ner(all_strings)
transcript = transcript + '\n' + tagged_string
with open(transcript_path, 'w') as f:
f.write(transcript)
output = "<p>Number of speakers: %s" % result[file_name]["speaker_count"] + "<br>" \
+ "Sentences: %s" % len(result[file_name]["sentences"]) + "<br>" \
+ "Words: %s" % len(result[file_name]["words"]) + "<br>" \
+ "Found named entities: %s" % tags_summary + "</p>"
return [audio_output.update(os.path.join(folder_wav, file_name + '.wav'), visible=secondary_audio),
output, file_output.update(transcript_path, visible=True)]
with gr.Blocks() as demo:
gr.HTML('<br><h1><font size="+4">Call Transcription demo</font></h1>')
gr.HTML('<p><font size="+1">This simple demo shows the possibilities of ASR and NLP in the task of automatic '
'speech recognition '
'and diarization. It works with mp3, ogg, and wav files. You can randomly pick an audio file with the '
'dialogue from the built-in database or try uploading your files.</font></p>')
gr.Markdown('<p><font size="+1">Note: this demo shows up a reduced-performance model. To get a full-performance '
'neural network or '
'develop a system adapted to your task – contact <a '
'href="mailto:kirill.lozovoi@exposit.com?subject=Request for '
'information">kirill.lozovoi@exposit.com</a>.</font></p>')
audio_input = gr.Audio(source="upload", type="filepath")
second_btn = gr.Button('Try uploaded audiofile')
gr.Markdown('<center><p>or</p></center>')
first_btn = gr.Button('Try a random sample from the database')
# Output zone
audio_output = gr.Audio(visible=False, interactive=True)
text_output = gr.HTML()
file_output = gr.File(label="Download audio transcript", visible=False)
# noinspection PyTypeChecker
first_btn.click(fn=process_audio, inputs=None,
outputs=[audio_output, text_output, file_output])
# noinspection PyTypeChecker
second_btn.click(fn=process_audio, inputs=audio_input, outputs=[audio_output, text_output, file_output])
demo.launch(share=True)
|