Spaces:
Paused
Paused
import os, re, string | |
import subprocess | |
from textblob_de import TextBlobDE as TextBlob | |
def clean_english(text):
    """Normalize spacing and capitalize the English pronoun "I".

    Args:
        text: English text to clean.

    Returns:
        The text with runs of spaces collapsed to one space and a
        standalone lowercase "i" uppercased when it is followed by
        whitespace, sits at the very end after whitespace, or starts a
        contraction such as "i'm" / "i'll".
    """
    # Collapse runs of spaces; a single-space -> single-space substitution
    # would be a no-op.
    clean_text = re.sub(r' {2,}', ' ', text)
    # "i" followed by whitespace (the whitespace character becomes a space).
    clean_text = re.sub(r'\bi\s', 'I ', clean_text)
    # "i" as the final word of the text.
    clean_text = re.sub(r'\si$', ' I', clean_text)
    # Contractions: the word boundary keeps words that merely contain "i'"
    # (e.g. "levi's") from being altered.
    clean_text = re.sub(r"\bi'", "I'", clean_text)
    return clean_text
def clean_german(text):
    """Strip ASCII punctuation from German text and capitalize its nouns.

    The text is tagged with ``TextBlob`` (imported in this file from
    ``textblob_de``); every token tagged ``'NN'`` that is longer than one
    character is replaced by its ``str.capitalize()`` form wherever it
    occurs as a whole word.

    Args:
        text: German text to clean.

    Returns:
        The punctuation-free text with the detected nouns capitalized.
    """
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and POS-tag the German text
    blob = TextBlob(text)
    # Map each noun token to its capitalized form
    nouns = {
        word: word.capitalize()
        for word, tag in blob.tags
        if tag == 'NN' and len(word) > 1
    }
    if nouns:
        # Escape the tokens when building the pattern (not when looking them
        # up), and anchor on word boundaries so a short noun is not rewritten
        # inside a longer, unrelated word.
        alternatives = '|'.join(re.escape(word) for word in nouns)
        pattern = re.compile(r'\b(?:' + alternatives + r')\b')
        text = pattern.sub(lambda m: nouns[m.group(0)], text)
    return text
def clean_spanish(text):
    """Remove punctuation from Spanish text and normalize spacing.

    Args:
        text: Spanish text to clean.

    Returns:
        The text without ASCII punctuation or the inverted marks "¿" / "¡",
        with runs of spaces collapsed to a single space.
    """
    # string.punctuation is ASCII-only, so the Spanish inverted marks have to
    # be listed explicitly.
    removal_table = str.maketrans('', '', string.punctuation + '¿¡')
    clean_text = text.translate(removal_table)
    # Dropping punctuation can leave adjacent spaces behind; collapse them.
    clean_text = re.sub(r' {2,}', ' ', clean_text)
    return clean_text