# Hugging Face Spaces page header (scrape residue): Space status "Runtime error".
from deepmultilingualpunctuation import PunctuationModel | |
import gradio as gr | |
import re | |
# https://stackoverflow.com/questions/22800401/how-to-capitalize-the-first-letter-of-every-sentence
def cap(match: "re.Match[str]") -> str:
    """Return the matched text capitalized, for use as a ``re.sub`` callback.

    ``str.capitalize`` upper-cases the first character and lower-cases the
    rest, so the pattern should match only text whose case may change.

    Args:
        match: The regular-expression match handed over by ``re.sub``.

    Returns:
        The whole matched string, capitalized.
    """
    return match.group().capitalize()
# Filler words dropped from the transcript before punctuation is restored.
_FILLER_WORDS = frozenset({"um", "uh", "hmm", "ha", "er", "ah"})

# Shared punctuation model, created on first use so that PunctuationModel()
# is not constructed again on every request.
_punctuation_model = None


def _get_punctuation_model():
    """Return the shared PunctuationModel, creating it on first use."""
    global _punctuation_model
    if _punctuation_model is None:
        _punctuation_model = PunctuationModel()
    return _punctuation_model


def _split_clean_lines(transcript):
    """Return the transcript as lines of words, minus fillers and empty lines."""
    cleaned = []
    for raw_line in transcript.splitlines():
        kept = [word for word in raw_line.split() if word.lower() not in _FILLER_WORDS]
        if kept:
            cleaned.append(kept)
    return cleaned


def _capitalize(text):
    """Upper-case the pronoun "i", sentence starts and the very first character."""
    text = re.sub(r"\bi\b", "I", text)
    # Require whitespace after the punctuation mark so that letters inside a
    # token such as "e.g." or "example.com" are left untouched.
    text = re.sub(r"(?<=[.?!;])\s+\w", lambda m: m.group().upper(), text)
    return re.sub(r"^\w", lambda m: m.group().upper(), text)


def predict(brakes, transcript):
    """Restore punctuation and capitalization in a raw transcript.

    Filler words ("um", "uh", ...) are removed, the remaining words are
    punctuated with the shared PunctuationModel, and the text is capitalized.

    Args:
        brakes: Line-break mode. ``"timelines"`` keeps the line breaks of the
            input transcript (lines left empty after filler removal are
            dropped), ``"sentences"`` starts a new line after each ``.``,
            ``?`` or ``!``; anything else (including ``None``) returns a
            single line.
        transcript: The unpunctuated text; ``None`` is treated as empty.

    Returns:
        The punctuated, capitalized text. An empty string when nothing is
        left to punctuate. In ``"timelines"`` mode, an ``"AssertError: ..."``
        message (as one string) if the model returned a different number of
        words than it was given, because the line breaks can then no longer
        be mapped back.
    """
    mode = brakes or ""
    # Split per line *before* removing fillers so the original line
    # structure is still known when the "timelines" mode needs it.
    lines = _split_clean_lines(transcript or "")
    words = [word for line in lines for word in line]
    if not words:
        # Nothing to punctuate: skip creating and calling the model.
        return ""

    punctuated = _get_punctuation_model().restore_punctuation(" ".join(words))

    if "timelines" in mode:
        punctuated_words = punctuated.split()
        if len(punctuated_words) != len(words):
            return (
                "AssertError: The length of the transcript and the punctuated "
                f"file should be the same: {len(words)} {len(punctuated_words)}"
            )
        # Re-cut the punctuated words using the word count of each input line.
        rebuilt = []
        start = 0
        for line in lines:
            end = start + len(line)
            rebuilt.append(" ".join(punctuated_words[start:end]))
            start = end
        text = "\n".join(rebuilt)
    elif "sentences" in mode:
        # Replace the space after sentence-ending punctuation with a newline.
        text = re.sub(r"(?<=[.?!]) ", "\n", punctuated)
    else:
        text = punctuated

    return _capitalize(text)
if __name__ == "__main__":
    # Build and start the Gradio web UI around predict().
    title = "Deep Punkt App"
    description = """
<b>Description</b>: <br>
Model restores punctuation and case i.e. of the following punctuations -- [! ? . , - : ; ' ] and also the upper-casing of words. <br>
"""
    # One example row: [line-break mode, transcript].
    examples = [["sentences", "my name is clara i live in berkeley california"]]

    interface = gr.Interface(
        fn=predict,
        inputs=[gr.Radio(["sentences", "timelines"], label="brakes"), "text"],
        outputs=["text"],
        title=title,
        description=description,
        examples=examples,
        allow_flagging="never",
    )
    # Request queuing is enabled through the queue() method; `queue` is not a
    # constructor argument of gr.Interface.
    interface.queue()
    interface.launch()