import argparse
import os
import sys

from .modules import get_modules
# Dataset registry: (dataset name, transformer module to run, whether the
# module needs a training pass before prediction).
_DATASET_SPECS = [
    ('indicnlp-articles', 'text_classification', True),
    ('wikiann-ner', 'token_classification', True),
    ('wiki-cloze', 'masked_lm', False),
    ('wiki-section-titles', 'multiple_choice', True),
    ('indicnlp-articles-headlines', 'multiple_choice', True),
    ('cvit-mkb', 'xsent_retrieval', False),
    ('bbc-articles', 'text_classification', True),
    ('iitp-movie-reviews', 'text_classification', True),
    ('iitp-product-reviews', 'text_classification', True),
    ('soham-articles', 'text_classification', True),
    ('inltk-headlines', 'text_classification', True),
    ('actsa', 'text_classification', True),
    ('midas-discourse', 'text_classification', True),
    ('wnli-translated', 'text_classification', True),
    ('copa-translated', 'multiple_choice', True),
    ('amrita-paraphrase-exact', 'text_classification', True),
    ('amrita-paraphrase-fuzzy', 'text_classification', True),
]

# Maps dataset name -> [<transformer module>, <do train?>], preserving the
# declaration order above.
ALL_DATASETS = {name: [module, do_train] for name, module, do_train in _DATASET_SPECS}
def add_generic_args(parser, root_dir):
    """Register the CLI arguments shared by every evaluation module.

    Args:
        parser: argparse.ArgumentParser to register arguments on (mutated in place).
        root_dir: Currently unused; kept for interface compatibility with the
            per-module ``add_model_specific_args(parser, root_dir)`` hooks.

    Returns:
        None. The parser is modified in place.
    """
    # task-specific args START
    parser.add_argument(
        '--dataset',
        type=str,
        required=True,
        help='The evaluation dataset to use'
    )
    parser.add_argument(
        '--lang',
        default=None,
        type=str,
        required=True,
        help='ISO code of test language',
    )
    parser.add_argument(
        '--train_lang',
        default=None,
        type=str,
        # Typo fix: "langauges" -> "language" in the user-facing help text.
        help='ISO code of train language. If not specified, it is assumed to be the same as the test language',
    )
    # task-specific args END

    # model structural parameters START
    parser.add_argument(
        '--model',
        default=None,
        type=str,
        required=True,
        help='Path to pretrained model or model identifier from huggingface.co/models',
    )
    parser.add_argument(
        '--config_name', default='', type=str, help='Pretrained config name or path if not the same as model_name'
    )
    parser.add_argument(
        '--tokenizer_name',
        default='',
        type=str,
        help='Pretrained tokenizer name or path if not the same as model_name',
    )
    parser.add_argument(
        '--max_seq_length',
        default=128,
        type=int,
        help='The maximum total input sequence length after tokenization. Sequences longer '
        'than this will be truncated, sequences shorter will be padded.',
    )
    # model structural parameters END

    # data I/O args START
    parser.add_argument(
        '--iglue_dir',
        default=None,
        type=str,
        required=True,
        help='The input data dir',
    )
    parser.add_argument(
        '--overwrite_cache', action='store_true', help='Overwrite the cached training and evaluation sets'
    )
    parser.add_argument(
        '--output_dir',
        default=None,
        type=str,
        required=True,
        help='The output directory where the model predictions and checkpoints will be written.',
    )
    parser.add_argument(
        '--cache_dir',
        default=None,
        type=str,
        help='Where do you want to store the pre-trained models downloaded from s3',
    )
    # data I/O args END

    # model training and inference parameters START
    parser.add_argument(
        '--fp16',
        action='store_true',
        help='Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit',
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help='For fp16: Apex AMP optimization level selected in ["O0", "O1", "O2", and "O3"].'
        'See details at https://nvidia.github.io/apex/amp.html',
    )
    parser.add_argument('--n_gpu', type=int, default=1)
    parser.add_argument('--n_tpu_cores', type=int, default=0)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, help='Max gradient norm.')
    parser.add_argument('--do_train', action='store_true', help='Whether to run training.')
    parser.add_argument('--do_predict', action='store_true', help='Whether to run predictions on the test set.')
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help='Number of updates steps to accumulate before performing a backward/update pass.',
    )
    parser.add_argument('--seed', type=int, default=2, help='random seed for initialization')
    parser.add_argument('--learning_rate', default=2e-5, type=float, help='The initial learning rate for Adam.')
    parser.add_argument('--weight_decay', default=0.0, type=float, help='Weight decay if we apply some.')
    parser.add_argument('--adam_epsilon', default=1e-8, type=float, help='Epsilon for Adam optimizer.')
    parser.add_argument('--warmup_steps', default=0, type=int, help='Linear warmup over warmup_steps.')
    parser.add_argument(
        '--num_train_epochs', default=3, type=int, help='Total number of training epochs to perform.'
    )
    parser.add_argument('--train_batch_size', default=32, type=int)
    parser.add_argument('--eval_batch_size', default=32, type=int)
    # model training and inference parameters END
def main(argvec=None):
    """Parse CLI arguments, resolve the dataset's module, and run it.

    Args:
        argvec: Optional list of argument tokens; when None, argparse reads
            from ``sys.argv``.

    Returns:
        The module instance after ``run_module()`` has completed, so callers
        and tests can inspect its state.
    """
    parser = argparse.ArgumentParser()
    add_generic_args(parser, os.getcwd())
    for module in get_modules():
        module.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(argvec)
    hparams = vars(args)

    # high-level command line parameters
    dataset = hparams['dataset']
    # Validate BEFORE any ALL_DATASETS lookup; previously the lookup happened
    # first, so an unknown dataset crashed with KeyError instead of reaching
    # this message. Exit nonzero so shell scripts can detect the failure.
    if dataset not in ALL_DATASETS:
        print('Unrecognized dataset')
        sys.exit(1)

    test_lang = hparams['lang']
    # vars(args) always contains 'train_lang' (default None), so dict.get()
    # would never fall back to the test language; test the value explicitly.
    train_lang = hparams['train_lang'] if hparams['train_lang'] is not None else test_lang
    model = hparams['model']
    iglue_dir = hparams['iglue_dir']

    data_dir = os.path.join(iglue_dir, dataset)
    output_dir = os.path.join(hparams['output_dir'], dataset,
                              'train-{}'.format(train_lang),
                              'model-{}'.format(model.replace('/', '-')))

    # Rewrite hparams into the shape the modules expect.
    hparams['model_name_or_path'] = model
    hparams['train_lang'] = train_lang
    hparams['test_lang'] = test_lang
    hparams['data_dir'] = data_dir
    hparams['output_dir'] = output_dir
    hparams['do_train'] = ALL_DATASETS[dataset][1]
    hparams['do_predict'] = True

    os.makedirs(output_dir, exist_ok=True)

    module_name = ALL_DATASETS[dataset][0]
    module_class = get_modules(module_name)
    module = module_class(hparams)
    module.run_module()
    return module
# Script entry point: run the CLI driver when executed directly.
if __name__ == '__main__':
    main()