Iker committed on
Commit
57ffa19
1 Parent(s): 95c4d87
Files changed (2) hide show
  1. dataset.py +5 -0
  2. translate.py +2 -2
dataset.py CHANGED
@@ -12,6 +12,8 @@ class DatasetReader(IterableDataset):
12
  self.tokenizer = tokenizer
13
  self.max_length = max_length
14
  self.current_line = 0
 
 
15
 
16
  def preprocess(self, text: str):
17
  self.current_line += 1
@@ -31,6 +33,9 @@ class DatasetReader(IterableDataset):
31
  mapped_itr = map(self.preprocess, file_itr)
32
  return mapped_itr
33
 
 
 
 
34
 
35
  class ParallelTextReader(IterableDataset):
36
  def __init__(self, pred_path: str, gold_path: str):
 
12
  self.tokenizer = tokenizer
13
  self.max_length = max_length
14
  self.current_line = 0
15
+ self.total_lines = count_lines(filename)
16
+ print(f"{self.total_lines} lines in {filename}")
17
 
18
  def preprocess(self, text: str):
19
  self.current_line += 1
 
33
  mapped_itr = map(self.preprocess, file_itr)
34
  return mapped_itr
35
 
36
+ def __len__(self):
37
+ return self.total_lines
38
+
39
 
40
  class ParallelTextReader(IterableDataset):
41
  def __init__(self, pred_path: str, gold_path: str):
translate.py CHANGED
@@ -99,11 +99,11 @@ def main(
99
  "num_return_sequences": 1,
100
  }
101
 
102
- total_lines: int = count_lines(sentences_path)
103
 
104
  if accelerator.is_main_process:
105
  print(
106
- f"** We will translate {total_lines} lines. **\n"
107
  f"Input file: {sentences_path}\n"
108
  f"Output file: {output_path}\n"
109
  f"Source language: {source_lang}\n"
 
99
  "num_return_sequences": 1,
100
  }
101
 
102
+ # total_lines: int = count_lines(sentences_path)
103
 
104
  if accelerator.is_main_process:
105
  print(
106
+ f"** Translation **\n"
107
  f"Input file: {sentences_path}\n"
108
  f"Output file: {output_path}\n"
109
  f"Source language: {source_lang}\n"