Spaces:
Runtime error
Runtime error
File size: 3,521 Bytes
33d6c4f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import AutoTokenizer
import re
class PersianTextProcessor:
    """
    Clean Persian text and translate it to English with a ParsiNLU MT5 model.

    Attributes:
        model_size (str): Size suffix used to build the checkpoint name.
        model_name (str): Checkpoint identifier passed to ``from_pretrained``.
        tokenizer (MT5Tokenizer): Tokenizer loaded for ``model_name``.
        model (MT5ForConditionalGeneration): Model loaded for ``model_name``.

    Methods:
        clean_persian_text(text): Strip emojis and rewrite ``@`` / ``#``.
        run_model(input_string, **generator_args): Encode, generate, decode.
        translate_text(persian_text): Clean the text, then run the model on it.
    """

    # One emoji code point followed by any run of variation selectors (U+FE0F)
    # or zero-width joiners (U+200D). Removing the trailing run together with
    # the emoji keeps invisible leftovers out of the cleaned text. ZWNJ
    # (U+200C) is deliberately not listed, so Persian half-spaces survive.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U0001F90D"  # white heart
        "\U00002764"  # heavy black heart
        "\U0001F91F"  # love-you gesture
        "\U0000270C"  # victory hand
        "]"
        "[\U0000FE0F\U0000200D]*"
    )

    # Single-pass replacement table: drop "@", spell out "#" as "hashtag_".
    _SYMBOL_TABLE = str.maketrans({"@": None, "#": "hashtag_"})

    def __init__(self, model_size="small"):
        """
        Load the tokenizer and model for the requested checkpoint size.

        Args:
            model_size (str): Size token interpolated into the checkpoint
                name ``persiannlp/mt5-<size>-parsinlu-opus-translation_fa_en``.
        """
        self.model_size = model_size
        self.model_name = f"persiannlp/mt5-{self.model_size}-parsinlu-opus-translation_fa_en"
        self.tokenizer = MT5Tokenizer.from_pretrained(self.model_name)
        self.model = MT5ForConditionalGeneration.from_pretrained(self.model_name)

    def clean_persian_text(self, text: str) -> str:
        """
        Remove emojis from the text and neutralise ``@`` and ``#``.

        Every emoji covered by ``_EMOJI_PATTERN`` is deleted together with any
        variation selectors / zero-width joiners that directly follow it.
        ``@`` is deleted and ``#`` is replaced by ``hashtag_``.

        Args:
            text (str): The input Persian text.

        Returns:
            str: The cleaned text.
        """
        without_emoji = self._EMOJI_PATTERN.sub("", text)
        return without_emoji.translate(self._SYMBOL_TABLE)

    def run_model(self, input_string: str, **generator_args) -> list:
        """
        Run the MT5 model on one input string.

        Args:
            input_string (str): The text to feed to the model.
            **generator_args: Extra keyword arguments forwarded unchanged to
                ``self.model.generate``.

        Returns:
            list: The result of ``self.tokenizer.batch_decode`` on the
                generated ids (decoded strings, special tokens skipped).
        """
        # Encode the input string as a tensor of token ids.
        input_ids = self.tokenizer.encode(input_string, return_tensors="pt")
        # Generate output token ids.
        generated = self.model.generate(input_ids, **generator_args)
        # Decode the generated ids back to text.
        return self.tokenizer.batch_decode(generated, skip_special_tokens=True)

    def translate_text(self, persian_text: str) -> list:
        """
        Translate Persian text to English.

        The text is first passed through ``clean_persian_text`` and the
        cleaned result is handed to ``run_model`` with default generation
        arguments.

        Args:
            persian_text (str): The Persian text to translate.

        Returns:
            list: Whatever ``run_model`` returns for the cleaned text
                (a list of decoded strings, not a bare ``str``).
        """
        text_cleaned = self.clean_persian_text(persian_text)
        return self.run_model(input_string=text_cleaned)
|