# prepare_evaluation_data.py
import os
import requests
import zipfile
import tarfile
from huggingface_hub import hf_hub_download


def download_and_extract(url, extract_path):
    """Download an archive from `url` (if not already cached) and extract it."""
    filename = url.split('/')[-1]
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        r = requests.get(url, stream=True)
        r.raise_for_status()
        # Stream to disk in chunks rather than holding the archive in memory.
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    else:
        print(f"{filename} already exists.")
    if filename.endswith('.zip'):
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    elif filename.endswith(('.tar.gz', '.tgz')):
        with tarfile.open(filename, 'r:gz') as tar_ref:
            tar_ref.extractall(extract_path)
    else:
        print(f"Cannot extract {filename}.")
def prepare_ptb():
    """Fetch the word-level Penn Treebank splits from Tom Sercu's lstm repo."""
    url = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt'
    os.makedirs('data/ptb', exist_ok=True)
    for split in ['train', 'valid', 'test']:
        split_url = url.replace('train', split)
        r = requests.get(split_url)
        r.raise_for_status()  # avoid silently saving a 404 page as a split
        with open(f'data/ptb/{split}.txt', 'w') as f:
            f.write(r.text)
    print("PTB dataset prepared.")
def prepare_wikitext2():
    """Download the WikiText-2 parquet shards from the Hugging Face Hub."""
    repo_id = "Salesforce/wikitext"
    files = [
        "wikitext-2-v1/train-00000-of-00001.parquet",
        "wikitext-2-v1/validation-00000-of-00001.parquet",
        "wikitext-2-v1/test-00000-of-00001.parquet",
    ]
    extract_path = 'data/'
    os.makedirs(extract_path, exist_ok=True)
    print("Downloading WikiText-2 dataset from Hugging Face...")
    for file_path in files:
        # hf_hub_download preserves the repo's directory layout under local_dir,
        # so the file lands at e.g. data/wikitext-2-v1/train-00000-of-00001.parquet;
        # check that full path, not just the basename, or the check never matches.
        local_path = os.path.join(extract_path, file_path)
        if not os.path.exists(local_path):
            hf_hub_download(repo_id=repo_id, filename=file_path, local_dir=extract_path, repo_type="dataset")
            print(f"Downloaded {file_path} to {extract_path}.")
        else:
            print(f"{file_path} already exists in {extract_path}.")
    print("WikiText-2 dataset preparation complete.")
def prepare_wikitext103():
    """Download the WikiText-103 parquet shards from the Hugging Face Hub."""
    repo_id = "Salesforce/wikitext"
    files = [
        "wikitext-103-v1/train-00000-of-00002.parquet",
        "wikitext-103-v1/train-00001-of-00002.parquet",
        "wikitext-103-v1/validation-00000-of-00001.parquet",
        "wikitext-103-v1/test-00000-of-00001.parquet",
    ]
    extract_path = 'data/'
    os.makedirs(extract_path, exist_ok=True)
    print("Downloading WikiText-103 dataset from Hugging Face...")
    for file_path in files:
        # As above, hf_hub_download mirrors the repo layout under local_dir.
        local_path = os.path.join(extract_path, file_path)
        if not os.path.exists(local_path):
            hf_hub_download(repo_id=repo_id, filename=file_path, local_dir=extract_path, repo_type="dataset")
            print(f"Downloaded {file_path} to {extract_path}.")
        else:
            print(f"{file_path} already exists in {extract_path}.")
    print("WikiText-103 dataset preparation complete.")
def prepare_lambada():
    """Fetch the LAMBADA test set (JSONL) from the cybertronai/bflm repo."""
    url = 'https://raw.githubusercontent.com/cybertronai/bflm/refs/heads/master/lambada_test.jsonl'
    os.makedirs('data/lambada', exist_ok=True)
    r = requests.get(url)
    r.raise_for_status()
    with open('data/lambada/lambada_test.jsonl', 'wb') as f:
        f.write(r.content)
    print("LAMBADA dataset prepared.")
if __name__ == '__main__':
    prepare_ptb()
    prepare_wikitext2()
    prepare_wikitext103()
    prepare_lambada()