Spaces:
Build error
Build error
import re | |
import functools | |
import requests | |
import pandas as pd | |
import plotly.express as px | |
import torch | |
import gradio as gr | |
from transformers import pipeline, Wav2Vec2ProcessorWithLM | |
from pyannote.audio import Pipeline | |
from librosa import load, resample | |
import whisperx | |
import re | |
# Regex fragments used by split() to recognise periods that do NOT end a
# sentence.  They are raw strings so that regex escapes such as ``\s`` are not
# parsed as (invalid) Python string escapes.

# A single ASCII letter, e.g. an initial.
alphabets = r"([A-Za-z])"
# Titles that are written with a trailing period.
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
# Name/company suffixes that may be followed by a period.
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
# Words treated as the start of a new sentence when they follow an acronym or
# a suffix.
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
# Two- or three-letter dotted upper-case acronyms, e.g. "U.S." or "U.S.A.".
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# Common top-level domains preceded by a dot.
websites = r"[.](com|net|org|io|gov)"
def split(text):
    """Split ``text`` into a list of sentences.

    Periods that do not end a sentence (titles, company suffixes, single
    letter initials, dotted acronyms, website domains and "Ph.D.") are
    temporarily replaced by a ``<prd>`` placeholder.  Every remaining ".",
    "?" and "!" is then tagged with ``<stop>``, the text is cut at those tags
    and the placeholders are turned back into periods.

    Args:
        text: The text to segment.

    Returns:
        The sentences in order of appearance, each stripped of surrounding
        whitespace.  Text after the last terminator (or the whole input when
        it contains no terminator) is kept as a final sentence rather than
        being discarded; empty segments are dropped.
    """
    # Pad with spaces so the patterns that require a neighbouring space also
    # match at the very beginning and end of the input.
    text = " " + text + " "
    text = text.replace("\n", " ")

    # Protect periods that belong to abbreviations rather than sentence ends.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # Likewise for a suffix followed by a sentence starter.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminators outside a closing quote so the quote stays attached to
    # the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Tag the real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Drop only empty segments: slicing off the last segment unconditionally
    # would lose trailing text that has no closing punctuation.
    segments = (segment.strip() for segment in text.split("<stop>"))
    return [segment for segment in segments if segment]
def create_fig(x_min, x_max, to_plot, plot_sentences):
    """Build the "Customer sentiment over time" line chart.

    Args:
        x_min: Lower bound of the data on the x axis; the visible range
            starts 5 units below it.
        x_max: Upper bound of the data on the x axis; the visible range
            ends 5 units above it.
        to_plot: Iterable of ``(x, y)`` pairs; the x values are labelled
            "time (seconds)" and the y values "sentiment".
        plot_sentences: Values for the ``sentence`` column, shown when
            hovering over a point.

    Returns:
        The figure produced by ``px.line`` after the axis and layout updates.
    """
    times, sentiments = zip(*to_plot)

    # Leave a margin on both sides of the plotted time span.
    padding = 5
    visible_range = [x_min - padding, x_max + padding]

    frame = pd.DataFrame(
        data={"x": times, "y": sentiments, "sentence": plot_sentences}
    )

    # Hover shows the sentence and the time, but hides the y value.
    hover_columns = {"sentence": True, "x": True, "y": False}
    axis_labels = {"x": "time (seconds)", "y": "sentiment"}

    figure = px.line(
        frame,
        x="x",
        y="y",
        hover_data=hover_columns,
        labels=axis_labels,
        title="Customer sentiment over time",
        markers=True,
    )
    figure = figure.update_yaxes(categoryorder="category ascending")
    figure = figure.update_layout(font=dict(size=18), xaxis_range=visible_range)
    return figure
def speech_to_text(speech_file, speaker_segmentation, whisper, alignment_model, metadata, whisper_device):
    """Transcribe ``speech_file`` and attribute the words to speaker turns.

    Args:
        speech_file: Audio input handed to the diarization callable, to
            ``whisper.transcribe`` and to ``whisperx.align``.
        speaker_segmentation: Callable returning a diarization result that
            exposes ``itertracks(yield_label=True)``.
        whisper: Object whose ``transcribe`` result contains ``"segments"``.
        alignment_model: Alignment model forwarded to ``whisperx.align``.
        metadata: Alignment metadata forwarded to ``whisperx.align``.
        whisper_device: Device forwarded to ``whisperx.align``.

    Returns:
        A flat list with two tuples per non-empty turn:
        ``(text, speaker)`` followed by ``("from START-END", None)``, where
        ``speaker`` is "Customer" or "Support".
    """
    speaker_output = speaker_segmentation(speech_file)
    result = whisper.transcribe(speech_file)
    chunks = whisperx.align(
        result["segments"], alignment_model, metadata, speech_file, whisper_device
    )["word_segments"]

    diarized_output = []
    # Label of the first turn that contains words; that speaker is treated as
    # the customer and every other label as support.
    customer_label = None
    i = 0
    # itertracks yields every diarized segment, and consecutive segments may
    # carry the same label.  The role is therefore taken from the label rather
    # than alternated per turn, which would swap the roles whenever one
    # speaker produces two segments in a row.
    for turn, _, label in speaker_output.itertracks(yield_label=True):
        # Consume the aligned words that end within this turn.
        # NOTE: words ending after the final turn's end are never emitted.
        words = []
        while i < len(chunks) and chunks[i]["end"] <= turn.end:
            words.append(chunks[i]["text"])
            i += 1
        if not words:
            continue

        if customer_label is None:
            customer_label = label
        speaker = "Customer" if label == customer_label else "Support"

        # Every word is followed by a single space, including the last one.
        diarized = "".join(word + " " for word in words)
        diarized_output.extend(
            [
                (diarized, speaker),
                (f"from {turn.start:.2f}-{turn.end:.2f}", None),
            ]
        )
    return diarized_output