deeppunct-gr / app.py
wldmr's picture
removed filler words
750c85a
raw
history blame
3.44 kB
from deepmultilingualpunctuation import PunctuationModel
import gradio as gr
import re
# https://stackoverflow.com/questions/22800401/how-to-capitalize-the-first-letter-of-every-sentence
def cap(match):
    """Return the matched text capitalized.

    Takes a regex match object and applies ``str.capitalize`` to the whole
    match: the first character is upper-cased and the rest is lower-cased.
    """
    matched_text = match.group()
    return matched_text.capitalize()
# Filler words dropped from the transcript before punctuation is restored.
# Matching is case-insensitive and only applies to whole whitespace-separated
# tokens (so "um," with attached punctuation is kept).
_FILLER_WORDS = frozenset({"um", "uh", "hmm", "ha", "er", "ah"})

# Lazily created PunctuationModel shared by every call to predict(), so the
# model is not rebuilt for each request.
_punctuation_model = None


def _get_model():
    """Return the shared PunctuationModel, creating it on first use."""
    global _punctuation_model
    if _punctuation_model is None:
        _punctuation_model = PunctuationModel()
    return _punctuation_model


def _clean_lines(transcript):
    """Split the transcript into lines of words with filler words removed.

    Blank input lines are skipped. A line that contained only filler words is
    kept as an empty list so the line count of the transcript is preserved.
    """
    cleaned = []
    for raw_line in transcript.split("\n"):
        tokens = raw_line.split()
        if not tokens:
            continue
        cleaned.append(
            [token for token in tokens if token.lower() not in _FILLER_WORDS]
        )
    return cleaned


def _restore_line_breaks(lines, punctuated_words):
    """Lay the punctuated words out on the same lines as the cleaned transcript.

    ``lines`` is the output of ``_clean_lines``; ``punctuated_words`` must
    contain exactly one entry per word in ``lines``.
    """
    rebuilt = []
    start = 0
    for line in lines:
        end = start + len(line)
        rebuilt.append(" ".join(punctuated_words[start:end]))
        start = end
    return "\n".join(rebuilt)


def _capitalize(text):
    """Upper-case the pronoun "i", sentence starts and the first character."""
    text = re.sub(r"\bi\b", "I", text)
    text = re.sub(r"(?<=[.?!;])\s*\w", lambda m: m.group().upper(), text)
    return re.sub(r"^\w", lambda m: m.group().upper(), text)


def predict(brakes, transcript):
    """Restore punctuation and capitalization in a transcript.

    Args:
        brakes: Line-break mode. If it contains ``'timelines'`` the line
            breaks of the input transcript are reproduced in the output; if
            it contains ``'sentences'`` a line break is placed after every
            sentence; anything else (including ``None``) returns the text on
            a single line.
        transcript: The unpunctuated text. ``None`` is treated as empty.

    Returns:
        The punctuated, capitalized text as a string. Returns ``""`` when the
        transcript has no words left after filler removal. In 'timelines'
        mode, if the model output does not have the same number of words as
        the cleaned transcript, an ``"AssertError: ..."`` message string with
        both word counts is returned instead.
    """
    brakes = brakes or ""

    # Remove filler words line by line so the original line structure is
    # still known after cleaning (needed for the 'timelines' mode).
    lines = _clean_lines(transcript or "")
    words = [word for line in lines for word in line]
    if not words:
        # Nothing to punctuate; skip the model entirely.
        return ""

    input_text = " ".join(words)
    output_text = _get_model().restore_punctuation(input_text)

    if 'timelines' in brakes:
        # Goal: the same lines as the input, because each line of an srt
        # transcript corresponds to a frame of the video. This only works
        # when the model returned exactly one output word per input word.
        punctuated_words = output_text.split()
        if len(punctuated_words) != len(words):
            return (
                "AssertError: The length of the transcript and the punctuated file should be the same: "
                + "{} {}".format(len(words), len(punctuated_words))
            )
        pcnt_file_cr = _restore_line_breaks(lines, punctuated_words)
    elif 'sentences' in brakes:
        # One sentence per line: break after every sentence-ending mark.
        pcnt_file_cr = re.sub(r"(?<=[.?!]) ", "\n", output_text)
    else:
        pcnt_file_cr = output_text

    return _capitalize(pcnt_file_cr)
if __name__ == "__main__":
    # Input widgets in the order of predict()'s parameters: the line-break
    # mode selector first, then the free-text transcript box.
    break_mode_selector = gr.Radio(["sentences", "timelines"], label="brakes")
    input_widgets = [break_mode_selector, "text"]

    # Texts and sample row shown on the demo page.
    page_title = "Deep Punkt App"
    page_description = """
<b>Description</b>: <br>
Model restores punctuation and case i.e. of the following punctuations -- [! ? . , - : ; ' ] and also the upper-casing of words. <br>
"""
    sample_rows = [['sentences', "my name is clara i live in berkeley california"]]

    # Build the web UI around predict() and start serving it.
    demo = gr.Interface(
        fn=predict,
        inputs=input_widgets,
        outputs=["text"],
        title=page_title,
        description=page_description,
        examples=sample_rows,
        queue=True,
        allow_flagging="never",
    )
    demo.launch()