call-sentiment-demo

Build error

App Files Files Community

call-sentiment-demo / utils.py

enoreyes

Update utils.py

0f37532 almost 2 years ago

raw

history blame contribute delete

3.7 kB

	import re
	import functools
	import requests
	import pandas as pd
	import plotly.express as px
	import torch
	import gradio as gr
	from transformers import pipeline, Wav2Vec2ProcessorWithLM
	from pyannote.audio import Pipeline
	from librosa import load, resample
	import whisperx

	import re
	# Regex fragments used by split(). Raw strings keep the backslashes intact, and
	# the alternatives are separated by a bare "|" (an escaped "\|" would match a
	# literal pipe character and disable every rule below).
	alphabets = r"([A-Za-z])"
	prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
	suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
	starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
	acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
	websites = r"[.](com|net|org|io|gov)"


	def split(text):
	    """Split *text* into a list of sentences.

	    Periods that do not end a sentence (titles such as "Dr.", single-letter
	    initials, dotted acronyms, company suffixes, website domains, "Ph.D.")
	    are temporarily replaced by the placeholder ``<prd>``. Every remaining
	    ".", "?" and "!" is then followed by a ``<stop>`` marker, the
	    placeholders are turned back into periods, and the text is cut at the
	    markers.

	    Args:
	        text: The string to split. Newlines are treated as spaces.

	    Returns:
	        A list of whitespace-stripped sentences. Trailing text that has no
	        terminating punctuation is returned as the final element instead of
	        being discarded. An empty or whitespace-only input yields ``[]``.
	    """
	    # Pad with spaces so the patterns anchored on a leading/trailing space
	    # also apply at the very start and end of the input.
	    text = " " + text + " "
	    text = text.replace("\n", " ")

	    # Protect periods that are part of a token rather than a sentence end.
	    text = re.sub(prefixes, r"\1<prd>", text)
	    text = re.sub(websites, r"<prd>\1", text)
	    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
	    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
	    # An acronym directly followed by a typical sentence starter does end
	    # the sentence, so it receives a real stop marker.
	    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
	    text = re.sub(
	        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
	        r"\1<prd>\2<prd>\3<prd>",
	        text,
	    )
	    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
	    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
	    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
	    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

	    # Move terminal punctuation outside a closing quote so the quote stays
	    # attached to the sentence it closes.
	    for quoted_end, moved_end in ((".”", "”."), ('."', '".'), ('!"', '"!'), ('?"', '"?')):
	        text = text.replace(quoted_end, moved_end)

	    for mark in ".?!":
	        text = text.replace(mark, mark + "<stop>")
	    text = text.replace("<prd>", ".")

	    sentences = [piece.strip() for piece in text.split("<stop>")]
	    # The piece after the last marker is empty when the text ends with
	    # punctuation; drop it only in that case so an unterminated final
	    # sentence is kept.
	    if sentences and not sentences[-1]:
	        sentences.pop()
	    return sentences

	def create_fig(x_min, x_max, to_plot, plot_sentences):
	    """Build a line chart of sentiment values over time.

	    Args:
	        x_min: Smallest x value of interest; the visible x range starts 5
	            units below it.
	        x_max: Largest x value of interest; the visible x range ends 5
	            units above it.
	        to_plot: Sequence of ``(x, y)`` pairs, one per plotted point.
	        plot_sentences: One entry per point in ``to_plot``; shown in the
	            hover box of the matching marker.

	    Returns:
	        The figure produced by ``px.line`` after the y-axis and layout
	        updates have been applied.
	    """
	    # zip(*[]) yields nothing to unpack, so an empty input is mapped to two
	    # empty columns instead of raising ValueError.
	    x, y = zip(*to_plot) if to_plot else ((), ())

	    # Pad the visible range so the first and last markers are not drawn on
	    # the plot border.
	    x_min -= 5
	    x_max += 5

	    plot_df = pd.DataFrame(
	        data={
	            "x": x,
	            "y": y,
	            "sentence": plot_sentences,
	        }
	    )

	    fig = px.line(
	        plot_df,
	        x="x",
	        y="y",
	        # Hover shows the sentence and its x value; y is hidden there.
	        hover_data={
	            "sentence": True,
	            "x": True,
	            "y": False,
	        },
	        labels={"x": "time (seconds)", "y": "sentiment"},
	        title="Customer sentiment over time",
	        markers=True,
	    )

	    fig = fig.update_yaxes(categoryorder="category ascending")
	    fig = fig.update_layout(
	        font=dict(
	            size=18,
	        ),
	        xaxis_range=[x_min, x_max],
	    )

	    return fig

	def speech_to_text(speech_file, speaker_segmentation, whisper, alignment_model, metadata, whisper_device):
	    """Transcribe an audio file and group the words by speaker turn.

	    Args:
	        speech_file: Audio input; passed unchanged to ``speaker_segmentation``,
	            ``whisper.transcribe`` and ``whisperx.align``.
	        speaker_segmentation: Callable applied to ``speech_file``; its result
	            must provide ``itertracks(yield_label=True)`` yielding triples
	            whose first item has ``start`` and ``end`` attributes
	            (presumably a pyannote pipeline -- confirm against the caller).
	        whisper: Object with a ``transcribe(speech_file)`` method whose
	            result contains a ``"segments"`` entry.
	        alignment_model: Forwarded to ``whisperx.align``.
	        metadata: Forwarded to ``whisperx.align``.
	        whisper_device: Forwarded to ``whisperx.align``.

	    Returns:
	        A flat list with two tuples per non-empty turn:
	        ``(text, speaker)`` followed by ``("from S-E", None)``, where
	        ``speaker`` is "Customer" or "Support" and S/E are the turn's start
	        and end formatted with two decimals. ``text`` is the turn's words
	        joined by single spaces, with a trailing space.
	    """
	    speaker_output = speaker_segmentation(speech_file)
	    result = whisper.transcribe(speech_file)

	    chunks = whisperx.align(
	        result["segments"], alignment_model, metadata, speech_file, whisper_device
	    )["word_segments"]

	    diarized_output = []
	    i = 0  # index of the next word chunk not yet assigned to a turn
	    speaker_counter = 0

	    # New iteration every time the speaker changes
	    for turn, _, _ in speaker_output.itertracks(yield_label=True):
	        # The track label is ignored: speakers are assumed to alternate,
	        # starting with the customer.
	        speaker = "Customer" if speaker_counter % 2 == 0 else "Support"

	        # Consume every remaining word that ends before this turn does.
	        words = []
	        while i < len(chunks) and chunks[i]["end"] <= turn.end:
	            words.append(chunks[i]["text"])
	            i += 1

	        if not words:
	            continue

	        # Same text as appending "<word> " for each word in turn.
	        diarized = " ".join(words) + " "
	        diarized_output.extend(
	            [
	                (diarized, speaker),
	                ("from {:.2f}-{:.2f}".format(turn.start, turn.end), None),
	            ]
	        )

	        # NOTE(review): the speaker only flips after a turn that produced
	        # text; the flattened original did not show whether this increment
	        # belonged inside the "turn has text" branch -- confirm.
	        speaker_counter += 1

	    return diarized_output