Translation
Flair
code
samira456 committed on
Commit
039b967
·
verified ·
1 Parent(s): 307868e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +103 -3
README.md CHANGED
@@ -1,3 +1,103 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ datasets:
4
+ - open-thoughts/OpenThoughts-114k
5
+ language:
6
+ - ar
7
+ metrics:
8
+ - code_eval
9
+ base_model:
10
+ - deepseek-ai/DeepSeek-R1
11
+ new_version: deepseek-ai/DeepSeek-R1
12
+ library_name: adapter-transformers
13
+ tags:
14
+ - code
15
+ ---
16
+ ---
17
+ license: mit
18
+ ---
+ # Step 1: Install required libraries
19
+ !pip install transformers datasets torch sentencepiece
20
+
21
+ # Step 2: Import Libraries
22
+ from datasets import load_dataset
23
+ from transformers import MarianMTModel, MarianTokenizer
24
+ import torch
25
+ from transformers import Trainer, TrainingArguments
26
+
27
+ # Step 3: Load the Dataset
28
+ dataset = load_dataset("cfilt/iitb-english-hindi")
29
+
30
+ # Check the structure of the dataset
31
+ print(dataset)
32
+
33
+ # Step 4: Prepare Tokenizer and Model
34
+ model_name = "Helsinki-NLP/opus-mt-en-hi"
35
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
36
+ model = MarianMTModel.from_pretrained(model_name)
37
+
38
+ # Step 5: Preprocess the Dataset
39
+ def preprocess_function(examples):
40
+ # Tokenize the English input and Hindi target
41
+ model_inputs = tokenizer(examples["en"], truncation=True, padding="max_length", max_length=128)
42
+ # Tokenize the Hindi target for training
43
+ with tokenizer.as_target_tokenizer():
44
+ labels = tokenizer(examples["hi"], truncation=True, padding="max_length", max_length=128)
45
+
46
+ model_inputs["labels"] = labels["input_ids"]
47
+ return model_inputs
48
+
49
+ # Apply preprocessing to the dataset
50
+ tokenized_datasets = dataset.map(preprocess_function, batched=True)
51
+
52
+ # Step 6: Training the Model
53
+ training_args = TrainingArguments(
54
+ output_dir="./results", # output directory for results
55
+ evaluation_strategy="epoch", # evaluate after every epoch
56
+ learning_rate=2e-5, # learning rate
57
+ per_device_train_batch_size=16, # batch size for training
58
+ per_device_eval_batch_size=16, # batch size for evaluation
59
+ num_train_epochs=3, # number of training epochs
60
+ logging_dir="./logs", # directory for storing logs
61
+ save_steps=500, # save checkpoint every 500 steps
62
+ )
63
+
64
+ # Initialize the Trainer
65
+ trainer = Trainer(
66
+ model=model, # the pre-trained model
67
+ args=training_args, # training arguments
68
+ train_dataset=tokenized_datasets["train"], # training dataset
69
+ eval_dataset=tokenized_datasets["validation"], # validation dataset
70
+ )
71
+
72
+ # Train the model
73
+ trainer.train()
74
+
75
+ # Step 7: Evaluate the Model
76
+ results = trainer.evaluate(tokenized_datasets["test"])
77
+ print("Evaluation Results:", results)
78
+
79
+ # Step 8: Translate Text Using the Model
80
+ def translate(texts):
81
+ # Tokenize the input English text
82
+ inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
83
+
84
+ # Generate the translation (output of the model)
85
+ with torch.no_grad():
86
+ translated = model.generate(**inputs)
87
+
88
+ # Decode the generated ids back into text
89
+ translations = tokenizer.decode(translated[0], skip_special_tokens=True)
90
+ return translations
91
+
92
+ # Example translation
93
+ english_text = ["Hello, how are you?", "I am learning NLP."]
94
+ translations = translate(english_text)
95
+ print(translations)
96
+
97
+ # Step 9: Save the Model and Tokenizer
98
+ model.save_pretrained("./hindi_translation_model")
99
+ tokenizer.save_pretrained("./hindi_translation_tokenizer")
100
+
101
+ # Step 10: Load the model and tokenizer for future use
102
+ model = MarianMTModel.from_pretrained("./hindi_translation_model")
103
+ tokenizer = MarianTokenizer.from_pretrained("./hindi_translation_tokenizer")