|
from datasets import load_dataset |
|
|
|
import config |
|
|
|
|
|
def load_raw_rewriting_dataset_as_pandas():
    """Load the raw rewriting dataset and convert it to pandas.

    The dataset name, split, access token and cache directory are all read
    from the ``config`` module.

    Returns:
        The result of calling ``to_pandas()`` on the loaded split.
    """
    raw_dataset = load_dataset(
        config.HF_RAW_DATASET_NAME,
        split=config.HF_RAW_DATASET_SPLIT,
        token=config.HF_TOKEN,
        cache_dir=config.CACHE_DIR,
    )
    return raw_dataset.to_pandas()
|
|
|
|
|
def load_full_commit_dataset_as_pandas():
    """Load the full commits dataset as pandas, renaming ``message``.

    The dataset path, configuration name, split and cache directory are read
    from the ``config`` module. No access token is passed for this dataset.

    Returns:
        The ``to_pandas()`` result with the ``message`` column renamed to
        ``reference``.
    """
    commits = load_dataset(
        path=config.HF_FULL_COMMITS_DATASET_NAME,
        name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
        split=config.HF_FULL_COMMITS_DATASET_SPLIT,
        cache_dir=config.CACHE_DIR,
    )
    commits_df = commits.to_pandas()
    # Downstream code refers to the commit message as the "reference".
    return commits_df.rename(columns={'message': 'reference'})
|
|
|
|
|
def load_processed_rewriting_dataset_as_pandas():
    """Combine the raw rewriting records with the ``mods`` of their commits.

    Both source frames are indexed by the ``(hash, repo)`` pair and
    left-joined, so every row of the rewriting dataset is kept even when the
    full commits dataset has no matching entry.

    Returns:
        A frame with the columns ``hash``, ``repo``, ``commit_msg_start``,
        ``commit_msg_end``, ``session`` and ``mods``, with the join keys
        restored as ordinary columns.
    """
    join_keys = ["hash", "repo"]

    rewriting_columns = ["hash", "repo", "commit_msg_start", "commit_msg_end", "session"]
    rewriting_df = load_raw_rewriting_dataset_as_pandas()[rewriting_columns].set_index(join_keys)

    mods_df = load_full_commit_dataset_as_pandas()[["hash", "repo", "mods"]].set_index(join_keys)

    joined = rewriting_df.join(other=mods_df, how='left')
    return joined.reset_index()
|
|
|
|
|
def load_synthetic_dataset_as_pandas():
    """Load the synthetic dataset and convert it to pandas.

    The dataset name, split, access token and cache directory are all read
    from the ``config`` module.

    Returns:
        The result of calling ``to_pandas()`` on the loaded split.
    """
    load_options = {
        "split": config.HF_SYNTHETIC_DATASET_SPLIT,
        "token": config.HF_TOKEN,
        "cache_dir": config.CACHE_DIR,
    }
    synthetic = load_dataset(config.HF_SYNTHETIC_DATASET_NAME, **load_options)
    return synthetic.to_pandas()
|
|