# test-diarize / util.py
# jduckles's picture
# testing
# 97598b4
import datetime
from jinja2 import Environment
import tempfile
import pandas as pd
import os
# Lookup table from a short language code to that language's English name.
# Most codes are two letters; "haw" (Hawaiian) is the one three-letter key.
# NOTE(review): the set of codes looks like the language list of a Whisper
# speech-to-text model -- confirm against the caller before adding entries.
source_languages = {
"en": "English",
"zh": "Chinese",
"de": "German",
"es": "Spanish",
"ru": "Russian",
"ko": "Korean",
"fr": "French",
"ja": "Japanese",
"pt": "Portuguese",
"tr": "Turkish",
"pl": "Polish",
"ca": "Catalan",
"nl": "Dutch",
"ar": "Arabic",
"sv": "Swedish",
"it": "Italian",
"id": "Indonesian",
"hi": "Hindi",
"fi": "Finnish",
"vi": "Vietnamese",
"he": "Hebrew",
"uk": "Ukrainian",
"el": "Greek",
"ms": "Malay",
"cs": "Czech",
"ro": "Romanian",
"da": "Danish",
"hu": "Hungarian",
"ta": "Tamil",
"no": "Norwegian",
"th": "Thai",
"ur": "Urdu",
"hr": "Croatian",
"bg": "Bulgarian",
"lt": "Lithuanian",
"la": "Latin",
"mi": "Maori",
"ml": "Malayalam",
"cy": "Welsh",
"sk": "Slovak",
"te": "Telugu",
"fa": "Persian",
"lv": "Latvian",
"bn": "Bengali",
"sr": "Serbian",
"az": "Azerbaijani",
"sl": "Slovenian",
"kn": "Kannada",
"et": "Estonian",
"mk": "Macedonian",
"br": "Breton",
"eu": "Basque",
"is": "Icelandic",
"hy": "Armenian",
"ne": "Nepali",
"mn": "Mongolian",
"bs": "Bosnian",
"kk": "Kazakh",
"sq": "Albanian",
"sw": "Swahili",
"gl": "Galician",
"mr": "Marathi",
"pa": "Punjabi",
"si": "Sinhala",
"km": "Khmer",
"sn": "Shona",
"yo": "Yoruba",
"so": "Somali",
"af": "Afrikaans",
"oc": "Occitan",
"ka": "Georgian",
"be": "Belarusian",
"tg": "Tajik",
"sd": "Sindhi",
"gu": "Gujarati",
"am": "Amharic",
"yi": "Yiddish",
"lo": "Lao",
"uz": "Uzbek",
"fo": "Faroese",
"ht": "Haitian creole",
"ps": "Pashto",
"tk": "Turkmen",
"nn": "Nynorsk",
"mt": "Maltese",
"sa": "Sanskrit",
"lb": "Luxembourgish",
"my": "Myanmar",
"bo": "Tibetan",
"tl": "Tagalog",
"mg": "Malagasy",
"as": "Assamese",
"tt": "Tatar",
"haw": "Hawaiian",
"ln": "Lingala",
"ha": "Hausa",
"ba": "Bashkir",
"jw": "Javanese",
"su": "Sundanese",
}
# Model names offered for selection, listed from "base" up to "large".
# NOTE(review): presumably Whisper checkpoint sizes -- confirm with the caller.
whisper_models = ["base", "small", "medium", "large"]
def zip_files(config):
    """
    Zip a list of files into a dated archive and return the archive's name.

    config is a dictionary like:
    config = {
        "files": ['file1.txt', 'file2.txt', 'file3.txt'],
        "input_name": "recording.mp3"
    }

    The archive is named "<YYYY-MM-DD>-<stem>.zip", where <stem> is the part
    of the input file's base name before its first dot. It is created
    relative to the current working directory, and every file is stored
    flat, under its base name only.

    Returns the archive's file name.
    Raises KeyError if "files" or "input_name" is missing from config.
    """
    from zipfile import ZipFile

    files = config['files']
    today = datetime.date.today().isoformat()
    # Drop any directory part first: an input name such as
    # "/tmp/upload/talk.mp3" would otherwise put path separators into the
    # archive name and point it at a directory that does not exist.
    stem = os.path.basename(config['input_name']).split('.')[0]
    archive_name = f"{today}-{stem}.zip"
    with ZipFile(archive_name, "w") as archive:
        for fname in files:
            # Store under the base name so the archive has no directory tree.
            archive.write(fname, os.path.basename(fname))
    return archive_name
def output_csv(config):
    """
    Write the transcript to "<output_dir>/<outputname>.csv" and return the path.

    config must provide:
      'transcript' -- the data to save; it is passed to pandas.DataFrame
      'outputname' -- the file name without extension
      'output_dir' -- the directory to write into

    The DataFrame index is written as the first column (pandas default).
    Raises KeyError if one of the keys is missing.
    """
    transcript = config['transcript']
    outputname = config['outputname']
    output_dir = config['output_dir']
    csv_file = os.path.join(output_dir, f"{outputname}.csv")
    pd.DataFrame(transcript).to_csv(csv_file)
    # The message previously ran "to" and the path together.
    print("Saved CSV to " + csv_file)
    return csv_file
def output_markdown(config):
    """
    Render the transcript through a Jinja2 template and save it as markdown.

    config must provide:
      'template'   -- a Jinja2 template string, or None to use the built-in one
      'outputname' -- the file name without extension
      'transcript' -- the data to render; it is passed to pandas.DataFrame
      'output_dir' -- the directory to write into

    The template receives the transcript as `transcript`, a list with one
    dictionary per row. The built-in template reads the Speaker, Start, End
    and Text fields of each row.

    Returns the path "<output_dir>/<outputname>.md", written as UTF-8.
    Raises KeyError if one of the keys is missing.
    """
    template = config['template']
    outputname = config['outputname']
    transcript = config['transcript']
    output_dir = config['output_dir']
    # Identity test: None is the "no custom template" marker.
    if template is None:
        template = """
{% for part in transcript -%}
**{{ part.Speaker }}**: *{{ part.Start }} - {{ part.End }}*
{{ part.Text }}
<br>
{% endfor %}
"""
    compiled = Environment().from_string(template)
    # Output a list of dictionaries using 'records'
    rows = pd.DataFrame(transcript).to_dict('records')
    markdown_out = compiled.render(transcript=rows)
    markdown_file = os.path.join(output_dir, f"{outputname}.md")
    with open(markdown_file, "w", encoding="utf-8") as message:
        message.write(markdown_out)
    print(f"...wrote {markdown_file}")
    return markdown_file
def output_docx(config):
    """
    Convert the transcript's markdown file to a Word document with pandoc.

    config must provide:
      'outputname' -- the file name without extension
      'output_dir' -- the directory holding the markdown and receiving the docx
      'markdown'   -- truthy when "<output_dir>/<outputname>.md" already exists
                      and must be kept; falsy to generate it here with
                      output_markdown(config) and delete it afterwards

    Returns the path "<output_dir>/<outputname>.docx". A pandoc failure is
    reported on stdout but not raised, so the returned path may not exist.
    Raises KeyError if one of the keys is missing.
    """
    import subprocess

    outputname = config['outputname']
    output_dir = config['output_dir']
    markdown_is_temporary = not config['markdown']
    if markdown_is_temporary:
        markdown_file = output_markdown(config)
    else:
        markdown_file = os.path.join(output_dir, f"{outputname}.md")
    doc_file = os.path.join(output_dir, f"{outputname}.docx")
    # Pass an argument list, never a shell string: outputname derives from the
    # uploaded file's name, and quotes or $(...) in it must not reach a shell.
    # NOTE(review): "-i" is pandoc's --incremental switch, not an input
    # option; it is kept so the command line matches the previous behaviour.
    try:
        result = subprocess.run(["pandoc", "-i", markdown_file, "-o", doc_file])
    except OSError as err:
        # pandoc is missing or not executable; conversion stays best-effort.
        print(f"...pandoc could not be started: {err}")
    else:
        if result.returncode == 0:
            print(f"...wrote {doc_file}")
        else:
            print(f"...pandoc exited with status {result.returncode}, {doc_file} was not written")
    if markdown_is_temporary:
        os.remove(markdown_file)
    return doc_file
def otheroutputs(transcript, csv=True, markdown=True, docx=True, upload_name="input.mp3"):
    """
    Export a transcript in the requested formats into a fresh temp directory.

    transcript  -- the data to export; each writer passes it to pandas.DataFrame
    csv         -- write "<outputname>.csv" when true
    markdown    -- write "<outputname>.md" when true
    docx        -- write "<outputname>.docx" when true
    upload_name -- name of the original input file; its base name up to the
                   first dot becomes part of the output name

    Returns the config dictionary shared with the writers. Its keys are
    'input_name', 'outputname' ("<YYYY-MM-DD>-<stem>"), 'output_dir',
    'transcript', 'markdown', 'template' and 'files' (the paths written, in
    csv, markdown, docx order).
    """
    today = datetime.date.today().isoformat()
    # Take the base name first: an upload_name that carries a directory would
    # otherwise put path separators into outputname and make every writer
    # target a sub-directory of output_dir that does not exist.
    stem = os.path.basename(upload_name).split('.')[0]
    config = {
        'input_name': upload_name,
        'outputname': f"{today}-{stem}",
        'output_dir': tempfile.mkdtemp(),
        'transcript': transcript,
        'markdown': markdown,
        'template': None,  # Placeholder to pass through custom jinja templates at a later date
    }
    files = []
    if csv:
        files.append(output_csv(config))
    if markdown:
        files.append(output_markdown(config))
    if docx:
        # Runs after the markdown step: output_docx reuses that file when
        # config['markdown'] is true.
        files.append(output_docx(config))
    config['files'] = files
    return config