video-to-subs / clean_text.py
abnerh's picture
german and spanish
0cc2cbd
raw
history blame
947 Bytes
import os, re, string
import subprocess
from textblob_de import TextBlobDE as TextBlob
def clean_english(text):
    """Collapse repeated spaces and capitalize the pronoun "I".

    A lowercase ``i`` is upper-cased when it stands as a whole word or starts
    a contraction (``i'm``, ``i'll``). An ``i`` that merely ends a longer
    word (``naomi's``, ``taxi``) is left untouched.

    Args:
        text: English text to clean.

    Returns:
        The cleaned text. Whitespace other than runs of spaces is preserved.
    """
    # Collapse every run of two or more spaces into a single space.
    clean_text = re.sub(r' {2,}', ' ', text)
    # \b anchors the match at the start of a word, so a trailing "i" of a
    # longer word never matches. The lookahead only checks what follows
    # (whitespace, an apostrophe, or the end of the text) without consuming
    # it, so neighbouring characters are never rewritten.
    clean_text = re.sub(r"\bi(?=[\s'’]|$)", 'I', clean_text)
    return clean_text
def clean_german(text):
    """Strip ASCII punctuation from German text and capitalize its nouns.

    The text is part-of-speech tagged with TextBlobDE. Every token tagged
    ``NN`` that is longer than one character is capitalized wherever it
    occurs in the text as a whole word.

    Args:
        text: German text to clean.

    Returns:
        The text with all ``string.punctuation`` characters removed and the
        detected nouns capitalized. If no noun is detected, the
        punctuation-stripped text is returned as is.
    """
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tag the text and map each detected noun to its capitalized form.
    nouns = {
        word: word.capitalize()
        for word, tag in TextBlob(text).tags
        if tag == 'NN' and len(word) > 1
    }
    if not nouns:
        return text

    # Escape the nouns when building the pattern so they are matched
    # literally, and look replacements up by the raw matched text. The \b
    # anchors restrict matches to whole words, so a short noun is not
    # capitalized in the middle of a longer word.
    alternatives = '|'.join(re.escape(word) for word in nouns)
    pattern = re.compile(r'\b(?:' + alternatives + r')\b')
    return pattern.sub(lambda match: nouns[match.group(0)], text)
def clean_spanish(text):
    """Strip punctuation from Spanish text and collapse repeated spaces.

    Args:
        text: Spanish text to clean.

    Returns:
        The text without ASCII punctuation or the inverted marks ``¿`` and
        ``¡``, with every run of spaces reduced to a single space.
    """
    # string.punctuation only covers ASCII, so the Spanish opening marks are
    # added explicitly; otherwise "¿" and "¡" would survive while "?" and
    # "!" are removed.
    removable = string.punctuation + '¿¡'
    clean_text = text.translate(str.maketrans('', '', removable))
    # Deleting punctuation can leave adjacent spaces behind; collapse them.
    clean_text = re.sub(r' {2,}', ' ', clean_text)
    return clean_text