m3hrdadfi committed
Commit d19a8a5 · 1 Parent(s): 3733ce3

Fix some bugs

Files changed (2):
  1. src/run.sh +3 -3
  2. src/run_clm_flax.py +2 -0
src/run.sh CHANGED
@@ -9,9 +9,9 @@ export OUTPUT_DIR=/home/m3hrdadfi/code/gpt2-medium-persian
 # export CONFIG_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
 # export TOKENIZER_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
 
-export TRAIN_FILE=/home/m3hrdadfi/data/train.csv
-export VALIDATION_FILE=/home/m3hrdadfi/data/test.csv
-export TEST_FILE=/home/m3hrdadfi/code/data/test.csv
+export TRAIN_FILE=/home/m3hrdadfi/data/train-fixed.csv
+export VALIDATION_FILE=/home/m3hrdadfi/data/test-fixed.csv
+export TEST_FILE=/home/m3hrdadfi/code/data/test-fixed.csv
 # export DATASET_NAME=oscar
 # export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
 export MAX_SEQUENCE_LENGTH=512
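
For context, a minimal sketch of how these exported paths would typically be consumed further down in run.sh (the invocation itself is not part of this commit): --train_file, --validation_file, --output_dir, --block_size, --do_train, and --do_eval are flags of the upstream run_clm_flax.py example; whether this fork also accepts a flag for TEST_FILE, and whether MAX_SEQUENCE_LENGTH is wired to --block_size, are assumptions, so TEST_FILE is omitted here.

# Hypothetical invocation, not shown in this commit; flag names follow the upstream example.
python src/run_clm_flax.py \
  --output_dir="$OUTPUT_DIR" \
  --train_file="$TRAIN_FILE" \
  --validation_file="$VALIDATION_FILE" \
  --block_size="$MAX_SEQUENCE_LENGTH" \  # mapping from MAX_SEQUENCE_LENGTH is an assumption
  --do_train \
  --do_eval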
src/run_clm_flax.py CHANGED
@@ -368,6 +368,7 @@ def main():
     # dataset = dataset.map(normalizer)
     # logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
     dataset = raw_dataset
+    logger.info(f"dataset: {dataset}")
 
     # Load pretrained model and tokenizer
 
@@ -421,6 +422,7 @@
     else:
         column_names = dataset["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]
+    logger.info(f"text_column_name: {text_column_name}")
 
     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
     tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")