Spaces:
Sleeping
Sleeping
from transformers import MarianMTModel, MarianTokenizer | |
from tqdm import tqdm | |
import os | |
import re | |
import argparse | |
# Pretrained MarianMT checkpoint used for English -> Spanish translation.
# The tokenizer and the model are both loaded from this same checkpoint
# identifier once, at import time, and shared by the functions below.
model_name = "Helsinki-NLP/opus-mt-en-es"

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Extract & separate timestamp and text
# Matches a leading "[<float>-<float>]" marker followed by optional whitespace
# and the remaining text. Compiled once so it is not rebuilt for every line.
_TIMESTAMP_LINE_RE = re.compile(r'\[(\d+\.\d+\-\d+\.\d+)\]\s*(.*)')


def extract_timestamp_and_text(line):
    """Split a line into its leading timestamp and the remaining text.

    A timestamp is a bracketed ``start-end`` pair of decimal numbers at the
    very start of the line, e.g. ``[0.00-1.50] Hello``.

    Args:
        line: A single line of input (no embedded newlines expected).

    Returns:
        A ``(timestamp, text)`` tuple. ``timestamp`` is the content between
        the brackets (without the brackets), or ``''`` when the line does not
        start with a timestamp, in which case ``text`` is the whole line.
        ``text`` is ``''`` when the line holds only a timestamp.
    """
    match = _TIMESTAMP_LINE_RE.match(line)
    if match is None:
        return '', line
    # Whitespace after the bracket is optional so that a bare "[0.00-1.50]"
    # is still recognised as a timestamp rather than treated as text.
    return match.group(1), match.group(2)
# Translate text
def translate_text(text):
    """Translate text line by line, keeping any leading timestamps.

    The input is split on newlines and each line is handled independently:

    * Whitespace-only lines become empty lines in the output.
    * A leading ``[start-end]`` timestamp (as recognised by
      ``extract_timestamp_and_text``) is carried over unchanged and only the
      text after it is translated.
    * Lines without a timestamp are translated as-is and are NOT given an
      empty ``[]`` prefix.

    Args:
        text: The source text, with lines separated by ``'\\n'``.

    Returns:
        The translated text, with the same number of lines joined by ``'\\n'``.
    """
    translated_lines = []
    for line in tqdm(text.split('\n'), desc="Translating lines", leave=False):
        if not line.strip():
            translated_lines.append('')
            continue

        timestamp, line_text = extract_timestamp_and_text(line)
        # Only rebuild the bracketed marker when the line actually had one;
        # otherwise plain lines would come out as "[] <translation>".
        prefix = f'[{timestamp}]' if timestamp else ''

        if not line_text.strip():
            # Timestamp with nothing to translate after it.
            translated_lines.append(prefix)
            continue

        # One line per call; the first (only) decoded sequence is the result.
        model_inputs = tokenizer(line_text, return_tensors="pt", truncation=True, padding="longest")
        translated = model.generate(**model_inputs)
        translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]

        if prefix:
            translated_lines.append(f'{prefix} {translated_text}')
        else:
            translated_lines.append(translated_text)
    return '\n'.join(translated_lines)
# Main function to translate a file
def translate_file(src_file_path, dst_file_path):
    """Translate the contents of one text file and write the result to another.

    Reads ``src_file_path``, passes its full contents to ``translate_text``,
    and writes the returned text to ``dst_file_path`` (overwriting it).
    Both files are handled as UTF-8 so that accented and inverted-punctuation
    characters in the Spanish output do not depend on the platform's default
    encoding.

    Any exception raised while reading, translating, or writing is caught and
    reported with a printed message instead of propagating.

    Args:
        src_file_path: Path of the file containing the English text.
        dst_file_path: Path where the translated text is saved.

    Returns:
        None.
    """
    try:
        with open(src_file_path, 'r', encoding='utf-8') as file:
            english_text = file.read()
        spanish_text = translate_text(english_text)
        with open(dst_file_path, 'w', encoding='utf-8') as file:
            file.write(spanish_text)
        print(f"Translation completed: {dst_file_path}")
    except Exception as e:
        # Best-effort by design: report the failure rather than crash.
        print(f"Error processing file: {e}")
if __name__ == "__main__":
    # Command-line entry point: two required positional paths, source first,
    # destination second, handed straight to translate_file.
    cli = argparse.ArgumentParser(description="Translate English text to Spanish")
    for arg_name, arg_help in (
        ("src_file_path", "Path to the source file with English text"),
        ("dst_file_path", "Path to save the translated Spanish text"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()
    translate_file(options.src_file_path, options.dst_file_path)