Redmind committed
Commit 0d125e0 · verified · Parent: 9d34860

Update app.py

Files changed (1): app.py (+9, -20)
app.py CHANGED
@@ -1,7 +1,6 @@
-from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
+from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
 from datasets import Dataset, DatasetDict
 import pandas as pd
-import torch
 
 # Load the dataset
 file_path = "hindi_dataset.tsv"  # Update with your actual file path
@@ -24,23 +23,21 @@ model_name = "Helsinki-NLP/opus-mt-en-hi"  # Pre-trained English-to-Hindi model
 tokenizer = MarianTokenizer.from_pretrained(model_name)
 model = MarianMTModel.from_pretrained(model_name)
 
-# Tokenize source (English) text
+# Tokenize source and target text
 def tokenize_function(examples):
-    return tokenizer(examples['source'], truncation=True, padding='max_length', max_length=128)
-
-# Tokenize target (Hindi) text
-def tokenize_target_function(examples):
+    model_inputs = tokenizer(examples['english'], truncation=True, padding='max_length', max_length=128)
     with tokenizer.as_target_tokenizer():
-        return tokenizer(examples['target'], truncation=True, padding='max_length', max_length=128)
+        labels = tokenizer(examples['hindi'], truncation=True, padding='max_length', max_length=128)
+    model_inputs['labels'] = labels['input_ids']
+    return model_inputs
 
 # Apply tokenization to the dataset
 tokenized_datasets = dataset.map(tokenize_function, batched=True)
-tokenized_datasets = tokenized_datasets.map(tokenize_target_function, batched=True)
 
 # Define the training arguments
 training_args = Seq2SeqTrainingArguments(
     output_dir="./results",
-    eval_strategy="epoch",
+    evaluation_strategy="epoch",
     learning_rate=2e-5,
     per_device_train_batch_size=16,
     per_device_eval_batch_size=16,
@@ -53,16 +50,8 @@ training_args = Seq2SeqTrainingArguments(
     save_steps=500
 )
 
-# Data collator to pad sequences to the same length
-def data_collator(features):
-    keys = ["input_ids", "attention_mask", "labels"]
-    max_length = max(len(feature[key]) for feature in features for key in keys if key in feature)
-    for feature in features:
-        for key in keys:
-            if key in feature:
-                padding = [0] * (max_length - len(feature[key]))
-                feature[key].extend(padding)
-    return {key: torch.tensor([f[key] for f in features]) for key in keys}
+# Use the DataCollatorForSeq2Seq for padding
+data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
 
 # Define the Trainer
 trainer = Seq2SeqTrainer(
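
Note on the collator swap: DataCollatorForSeq2Seq pads labels with -100, which the loss ignores, and (with model= set) builds decoder_input_ids from the labels, whereas the removed hand-rolled collator padded labels with 0, a real token id that the loss would score. Since tokenize_function still pads everything to max_length=128, the collator's dynamic padding is effectively a no-op here. A possible follow-up, not part of this commit, would drop the fixed padding and let the collator pad per batch; a minimal sketch, assuming the same 'english'/'hindi' column names:

# Sketch only (not in this commit): tokenize without fixed padding and let
# DataCollatorForSeq2Seq pad each batch to its longest example.
def tokenize_function(examples):
    model_inputs = tokenizer(examples['english'], truncation=True, max_length=128)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['hindi'], truncation=True, max_length=128)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
# The collator pads inputs with the tokenizer's pad token and labels with -100,
# so padded label positions are excluded from the loss.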
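
The last hunk ends inside the Seq2SeqTrainer(...) call, so the full constructor is not shown. A minimal sketch of how the updated pieces plug together; the "train"/"test" split names and the final trainer.train() are assumptions, not taken from the diff:

# Hypothetical completion of the truncated call; split names are assumed.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()

Incidentally, the eval_strategy to evaluation_strategy rename matches the argument name transformers used before v4.41 (newer releases accept either), so it likely fixes a TypeError on an older installation.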