# Spaces:
# Build error
# Build error
# (The three lines above are page-status text captured with the file, not program code.)
import datetime
import os
import subprocess
import tempfile

import pandas as pd
from jinja2 import Environment
# Language code -> display name lookup.
# NOTE(review): the codes look like the ones the Whisper models accept
# (e.g. "jw" for Javanese, "haw" for Hawaiian) rather than strict ISO 639-1;
# confirm against the model's own list before "normalising" any key.
source_languages = {
    "en": "English",
    "zh": "Chinese",
    "de": "German",
    "es": "Spanish",
    "ru": "Russian",
    "ko": "Korean",
    "fr": "French",
    "ja": "Japanese",
    "pt": "Portuguese",
    "tr": "Turkish",
    "pl": "Polish",
    "ca": "Catalan",
    "nl": "Dutch",
    "ar": "Arabic",
    "sv": "Swedish",
    "it": "Italian",
    "id": "Indonesian",
    "hi": "Hindi",
    "fi": "Finnish",
    "vi": "Vietnamese",
    "he": "Hebrew",
    "uk": "Ukrainian",
    "el": "Greek",
    "ms": "Malay",
    "cs": "Czech",
    "ro": "Romanian",
    "da": "Danish",
    "hu": "Hungarian",
    "ta": "Tamil",
    "no": "Norwegian",
    "th": "Thai",
    "ur": "Urdu",
    "hr": "Croatian",
    "bg": "Bulgarian",
    "lt": "Lithuanian",
    "la": "Latin",
    "mi": "Maori",
    "ml": "Malayalam",
    "cy": "Welsh",
    "sk": "Slovak",
    "te": "Telugu",
    "fa": "Persian",
    "lv": "Latvian",
    "bn": "Bengali",
    "sr": "Serbian",
    "az": "Azerbaijani",
    "sl": "Slovenian",
    "kn": "Kannada",
    "et": "Estonian",
    "mk": "Macedonian",
    "br": "Breton",
    "eu": "Basque",
    "is": "Icelandic",
    "hy": "Armenian",
    "ne": "Nepali",
    "mn": "Mongolian",
    "bs": "Bosnian",
    "kk": "Kazakh",
    "sq": "Albanian",
    "sw": "Swahili",
    "gl": "Galician",
    "mr": "Marathi",
    "pa": "Punjabi",
    "si": "Sinhala",
    "km": "Khmer",
    "sn": "Shona",
    "yo": "Yoruba",
    "so": "Somali",
    "af": "Afrikaans",
    "oc": "Occitan",
    "ka": "Georgian",
    "be": "Belarusian",
    "tg": "Tajik",
    "sd": "Sindhi",
    "gu": "Gujarati",
    "am": "Amharic",
    "yi": "Yiddish",
    "lo": "Lao",
    "uz": "Uzbek",
    "fo": "Faroese",
    "ht": "Haitian creole",
    "ps": "Pashto",
    "tk": "Turkmen",
    "nn": "Nynorsk",
    "mt": "Maltese",
    "sa": "Sanskrit",
    "lb": "Luxembourgish",
    "my": "Myanmar",
    "bo": "Tibetan",
    "tl": "Tagalog",
    "mg": "Malagasy",
    "as": "Assamese",
    "tt": "Tatar",
    "haw": "Hawaiian",
    "ln": "Lingala",
    "ha": "Hausa",
    "ba": "Bashkir",
    "jw": "Javanese",
    "su": "Sundanese",
}

# Model size names, smallest first.
whisper_models = ["base", "small", "medium", "large"]
def zip_files(config):
    """
    Zip a list of files into a dated archive and return the archive's name.

    The archive is created in the current working directory and is named
    ``<YYYY-MM-DD>-<stem>.zip``, where ``<stem>`` is the file-name part of
    ``config['input_name']`` up to its first dot. Each file is stored under
    its base name, so the archive has no internal folders.

    config is a dictionary like:
        config = {
            "files": ['file1.txt', 'file2.txt', 'file3.txt'],
            "input_name": "upload.mp3"
        }

    Returns:
        The archive file name, relative to the working directory.

    Raises:
        KeyError: if "files" or "input_name" is missing from config.
        OSError: if one of the listed files cannot be read.
    """
    from zipfile import ZipFile

    today = datetime.date.today().isoformat()
    # Drop any directory part first: an upload path such as /tmp/x/talk.mp3
    # must not redirect the archive into a (possibly missing) folder.
    stem = os.path.basename(config['input_name']).split('.')[0]
    archive_name = f"{today}-{stem}.zip"

    with ZipFile(archive_name, "w") as archive:
        for fname in config['files']:
            archive.write(fname, os.path.basename(fname))
    return archive_name
def output_csv(config):
    """
    Write the transcript to ``<output_dir>/<outputname>.csv``.

    config keys read:
        "transcript": data accepted by ``pandas.DataFrame``
        "outputname": file name without extension
        "output_dir": existing directory to write into

    The DataFrame index is written as the first column (pandas default).

    Returns:
        The path of the CSV file that was written.
    """
    csv_file = os.path.join(config['output_dir'], f"{config['outputname']}.csv")
    pd.DataFrame(config['transcript']).to_csv(csv_file)
    # The original message ran the path straight into the word "to".
    print(f"Saved CSV to {csv_file}")
    return csv_file
# Jinja template used when the caller supplies none. It reads the Speaker,
# Start, End and Text fields of every transcript record.
_DEFAULT_MARKDOWN_TEMPLATE = """
{% for part in transcript -%}
**{{ part.Speaker }}**: *{{ part.Start }} - {{ part.End }}*
{{ part.Text }}
<br>
{% endfor %}
"""


def output_markdown(config):
    """
    Render the transcript through a Jinja template into a Markdown file.

    config keys read:
        "template":   Jinja template source, or None / absent for the default
        "outputname": file name without extension
        "transcript": data accepted by ``pandas.DataFrame``
        "output_dir": existing directory to write into

    Returns:
        The path of the Markdown file, ``<output_dir>/<outputname>.md``.
    """
    template = config.get('template')
    if template is None:
        template = _DEFAULT_MARKDOWN_TEMPLATE

    compiled = Environment().from_string(template)
    # 'records' yields one dict per row, which the template iterates over.
    records = pd.DataFrame(config['transcript']).to_dict('records')
    rendered = compiled.render(transcript=records)

    markdown_file = os.path.join(config['output_dir'], f"{config['outputname']}.md")
    with open(markdown_file, "w", encoding="utf-8") as handle:
        handle.write(rendered)
    print(f"...wrote {markdown_file}")
    return markdown_file
def output_docx(config):
    """
    Convert the transcript's Markdown file to .docx by running pandoc.

    config keys read:
        "outputname": file name without extension
        "output_dir": directory holding (or receiving) the Markdown file
        "markdown":   truthy if ``<output_dir>/<outputname>.md`` already
                      exists; otherwise a temporary one is rendered with
                      output_markdown() and deleted again afterwards

    pandoc is invoked with an argument list instead of a shell string, so
    characters in the output name (which derives from the uploaded file's
    name) are never interpreted by a shell. The ``-i`` flag used previously
    is pandoc's ``--incremental`` slide-show option, not an input flag, so it
    has been dropped.

    Conversion stays best-effort: if pandoc is missing or exits with an
    error, a message is printed and the intended path is still returned.

    Returns:
        The path ``<output_dir>/<outputname>.docx``.
    """
    outputname = config['outputname']
    output_dir = config['output_dir']
    markdown_file = os.path.join(output_dir, f"{outputname}.md")
    doc_file = os.path.join(output_dir, f"{outputname}.docx")

    needs_temp_markdown = not config['markdown']
    if needs_temp_markdown:
        markdown_file = output_markdown(config)

    try:
        result = subprocess.run(["pandoc", markdown_file, "-o", doc_file], check=False)
    except OSError as err:
        # Typically: the pandoc executable is not installed / not on PATH.
        print(f"...could not run pandoc: {err}")
    else:
        if result.returncode == 0:
            print(f"...wrote {doc_file}")
        else:
            print(f"...pandoc exited with status {result.returncode} for {markdown_file}")
    finally:
        # Only remove the Markdown file if this function created it.
        if needs_temp_markdown:
            os.remove(markdown_file)
    return doc_file
def otheroutputs(transcript, csv=True, markdown=True, docx=True, upload_name="input.mp3"):
    """
    Export a transcript in the requested formats into a fresh temp directory.

    Args:
        transcript: data accepted by ``pandas.DataFrame``.
        csv: write ``<outputname>.csv``.
        markdown: write ``<outputname>.md``.
        docx: write ``<outputname>.docx`` (converted from the Markdown).
        upload_name: name (or path) of the uploaded file. Its file-name part
            up to the first dot becomes the stem of every output file.

    Returns:
        The config dictionary passed to the writers, with keys "input_name",
        "outputname", "output_dir", "transcript", "markdown", "template" and
        "files" (paths of the files produced, in csv / markdown / docx order).
        The temp directory is not removed here.
    """
    today = datetime.date.today().isoformat()
    # Use only the file-name part: a path in upload_name would otherwise put
    # separators into outputname and point every writer at a missing folder.
    stem = os.path.basename(upload_name).split('.')[0]

    config = {
        'input_name': upload_name,
        'outputname': f"{today}-{stem}",
        'output_dir': tempfile.mkdtemp(),
        'transcript': transcript,
        'markdown': markdown,
        'template': None,  # Placeholder to pass through custom jinja templates at a later date
    }

    files = []
    if csv:
        files.append(output_csv(config))
    # Markdown must be produced before docx, which converts the .md file.
    if markdown:
        files.append(output_markdown(config))
    if docx:
        files.append(output_docx(config))
    config['files'] = files
    return config