"""Download and prepare the PTB, WikiText-2, WikiText-103, and LAMBADA datasets."""

import os
import tarfile
import zipfile

import requests
from huggingface_hub import hf_hub_download

def download_and_extract(url, extract_path):
    """Download an archive into the working directory and extract it into extract_path."""
    filename = url.split('/')[-1]
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        r = requests.get(url)
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
    else:
        print(f"{filename} already exists.")

    # Extract the archive regardless of whether it was just downloaded or already cached.
    if filename.endswith('.zip'):
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    elif filename.endswith(('.tar.gz', '.tgz')):
        with tarfile.open(filename, 'r:gz') as tar_ref:
            tar_ref.extractall(extract_path)
    else:
        print(f"Cannot extract {filename}.")

def prepare_ptb():
    """Fetch the Penn Treebank train/valid/test splits as plain-text files."""
    url = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt'
    os.makedirs('data/ptb', exist_ok=True)
    for split in ['train', 'valid', 'test']:
        split_url = url.replace('train', split)
        r = requests.get(split_url)
        r.raise_for_status()
        with open(f'data/ptb/{split}.txt', 'w', encoding='utf-8') as f:
            f.write(r.text)
    print("PTB dataset prepared.")

def prepare_wikitext2():
    """Download the WikiText-2 parquet splits from the Hugging Face Hub."""
    repo_id = "Salesforce/wikitext"
    files = [
        "wikitext-2-v1/train-00000-of-00001.parquet",
        "wikitext-2-v1/validation-00000-of-00001.parquet",
        "wikitext-2-v1/test-00000-of-00001.parquet"
    ]
    extract_path = 'data/'
    os.makedirs(extract_path, exist_ok=True)

    print("Downloading WikiText-2 dataset from Hugging Face...")
    for file_path in files:
        # hf_hub_download keeps the repo-relative path under local_dir,
        # so check for the file at that location rather than by basename only.
        local_path = os.path.join(extract_path, file_path)
        if not os.path.exists(local_path):
            hf_hub_download(repo_id=repo_id, filename=file_path, local_dir=extract_path, repo_type="dataset")
            print(f"Downloaded {os.path.basename(file_path)} to {local_path}.")
        else:
            print(f"{os.path.basename(file_path)} already exists at {local_path}.")
    print("WikiText-2 dataset preparation complete.")

def prepare_wikitext103():
    """Download the WikiText-103 parquet splits from the Hugging Face Hub."""
    repo_id = "Salesforce/wikitext"
    files = [
        "wikitext-103-v1/train-00000-of-00002.parquet",
        "wikitext-103-v1/train-00001-of-00002.parquet",
        "wikitext-103-v1/validation-00000-of-00001.parquet",
        "wikitext-103-v1/test-00000-of-00001.parquet"
    ]
    extract_path = 'data/'
    os.makedirs(extract_path, exist_ok=True)

    print("Downloading WikiText-103 dataset from Hugging Face...")
    for file_path in files:
        # Same as in prepare_wikitext2: the file lands under its repo-relative path.
        local_path = os.path.join(extract_path, file_path)
        if not os.path.exists(local_path):
            hf_hub_download(repo_id=repo_id, filename=file_path, local_dir=extract_path, repo_type="dataset")
            print(f"Downloaded {os.path.basename(file_path)} to {local_path}.")
        else:
            print(f"{os.path.basename(file_path)} already exists at {local_path}.")
    print("WikiText-103 dataset preparation complete.")

def prepare_lambada():
    """Fetch the LAMBADA test set (JSON Lines) used for last-word prediction."""
    url = 'https://raw.githubusercontent.com/cybertronai/bflm/refs/heads/master/lambada_test.jsonl'
    os.makedirs('data/lambada', exist_ok=True)
    r = requests.get(url)
    r.raise_for_status()
    with open('data/lambada/lambada_test.jsonl', 'wb') as f:
        f.write(r.content)
    print("LAMBADA dataset prepared.")

if __name__ == '__main__':
    prepare_ptb()
    prepare_wikitext2()
    prepare_wikitext103()
    prepare_lambada()