from datasets import load_dataset

import config


def load_raw_rewriting_dataset_as_pandas():
    """Load the raw commit-message rewriting dataset from the HF Hub as a pandas DataFrame."""
    return load_dataset(config.HF_RAW_DATASET_NAME,
                        split=config.HF_RAW_DATASET_SPLIT,
                        token=config.HF_TOKEN,
                        cache_dir=config.CACHE_DIR).to_pandas()


def load_full_commit_dataset_as_pandas():
    """Load the full-commit dataset and rename its 'message' column to 'reference'."""
    return load_dataset(path=config.HF_FULL_COMMITS_DATASET_NAME,
                        name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
                        split=config.HF_FULL_COMMITS_DATASET_SPLIT,
                        cache_dir=config.CACHE_DIR).to_pandas().rename(
        columns={'message': 'reference'})


def load_processed_rewriting_dataset_as_pandas():
    """Join the manually rewritten messages with commit modifications on (hash, repo)."""
    manual_rewriting = load_raw_rewriting_dataset_as_pandas()[
        ["hash", "repo", "commit_msg_start", "commit_msg_end", "session"]]
    manual_rewriting.set_index(["hash", "repo"], inplace=True)

    mods_dataset = load_full_commit_dataset_as_pandas()[["hash", "repo", "mods"]]
    mods_dataset.set_index(["hash", "repo"], inplace=True)

    return manual_rewriting.join(other=mods_dataset, how='left').reset_index()


def load_synthetic_dataset_as_pandas():
    """Load the synthetic rewriting dataset from the HF Hub as a pandas DataFrame."""
    return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
                        split=config.HF_SYNTHETIC_DATASET_SPLIT,
                        token=config.HF_TOKEN,
                        cache_dir=config.CACHE_DIR).to_pandas()
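

# A minimal usage sketch, not part of the original module: it assumes a local
# `config` module defines the HF_* constants, CACHE_DIR, and HF_TOKEN referenced
# above, and that the left join leaves NaN in 'mods' for unmatched (hash, repo) pairs.
if __name__ == "__main__":
    processed = load_processed_rewriting_dataset_as_pandas()
    print(processed[["hash", "repo", "commit_msg_start", "commit_msg_end"]].head())
    # Count rows whose commit modifications were not found in the full-commit dataset.
    print(f"{processed['mods'].isna().sum()} rows without modification data")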