#!/usr/bin/env python
# coding: utf-8

# In[7]:

from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional

from transformers.file_utils import ExplicitEnum

task_to_keys = {
    "mimic3-50": "mimic3-50",
    "mimic3-full": "mimic3-full",
}


class TransformerLayerUpdateStrategy(ExplicitEnum):
    NO = "no"
    LAST = "last"
    ALL = "all"


class DocumentPoolingStrategy(ExplicitEnum):
    FLAT = "flat"
    MAX = "max"
    MEAN = "mean"
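

# Note: a minimal illustration, assuming the ExplicitEnum implementation shipped with
# transformers. These subclasses behave like ordinary string-valued Enums, so a member can
# be recovered from its value, e.g.
#     TransformerLayerUpdateStrategy("last")  # -> TransformerLayerUpdateStrategy.LAST
#     DocumentPoolingStrategy("mean")         # -> DocumentPoolingStrategy.MEAN
# and ExplicitEnum raises a ValueError listing the valid choices when an unknown value is
# passed, which makes these types convenient as command-line options.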


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input into our model for training and evaluation.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them
    on the command line (a usage sketch appears at the end of this file).
    """

    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    validation_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})
    # Customized data arguments
    label_dictionary_file: Optional[str] = field(
        default=None, metadata={"help": "A file containing the label dictionary."}
    )
    code_max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization for code long titles."
        },
    )
    code_batch_size: int = field(
        default=8,
        metadata={
            "help": "The batch size for generating code representations."
        },
    )
    ignore_keys_for_eval: Optional[List[str]] = field(
        default=None, metadata={"help": "The list of keys to be ignored during the evaluation process."}
    )
    use_cached_datasets: bool = field(
        default=True,
        metadata={"help": "Whether to use cached datasets to save preprocessing time. The cached datasets were "
                          "preprocessed and saved into the data folder."})
    data_segmented: bool = field(
        default=False,
        metadata={"help": "Whether the dataset is segmented or not."})
    lazy_loading: bool = field(
        default=False,
        metadata={"help": "If the dataset is larger than 500MB, please use lazy_loading."})

    def __post_init__(self):
        if self.task_name is not None:
            self.task_name = self.task_name.lower()
            if self.task_name not in task_to_keys.keys():
                raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
        elif self.dataset_name is not None:
            pass
        elif self.train_file is None or self.validation_file is None:
            raise ValueError("Need a training/validation file")
        elif self.label_dictionary_file is None:
            raise ValueError("label dictionary must be provided")
        else:
            train_extension = self.train_file.split(".")[-1]
            assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            validation_extension = self.validation_file.split(".")[-1]
            assert (
                validation_extension == train_extension
            ), "`validation_file` should have the same extension (csv or json) as `train_file`."


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )
    # Customized model arguments
    d_model: int = field(default=768, metadata={"help": "Hidden size of the model. Should be the same as the base "
                                                        "transformer model."})
    dropout: float = field(default=0.1, metadata={"help": "Dropout of the transformer layer."})
    dropout_att: float = field(default=0.1, metadata={"help": "Dropout of the label-wise attention layer."})
    num_chunks_per_document: int = field(default=10, metadata={"help": "Number of chunks per document."})
    transformer_layer_update_strategy: TransformerLayerUpdateStrategy = field(
        default="all",
        metadata={"help": "Which transformer layers to update during training."})
    use_code_representation: bool = field(
        default=True,
        metadata={"help": "Whether to use code representations as the initial parameters of the code vectors in the "
                          "attention layer."})
    multi_head_attention: bool = field(
        default=True,
        metadata={"help": "Whether to use multi-head attention for different chunks."})
    chunk_attention: bool = field(
        default=True,
        metadata={"help": "Whether to use chunk attention for each label."})
    multi_head_chunk_attention: bool = field(
        default=True,
        metadata={"help": "Whether to use multi-head chunk attention for each label."})
    num_hidden_layers: int = field(
        default=2, metadata={"help": "Number of hidden layers in the Longformer."}
    )
    linear_init_mean: float = field(default=0.0, metadata={"help": "Mean value for initializing linear layer weights."})
    linear_init_std: float = field(default=0.03, metadata={"help": "Standard deviation for initializing linear layer "
                                                                   "weights."})
    document_pooling_strategy: DocumentPoolingStrategy = field(
        default="flat",
        metadata={"help": "How to pool the document representation after the label-wise attention layer for each "
                          "label."})