Spaces:

CarperAI
/

pilev2_pipeline

Runtime error

ncoop57

Made more readyable and added plots

cf5eed6 almost 2 years ago

10.1 kB

	import gradio as gr
	import matplotlib.pyplot as plt
	import numpy as np
	from functools import partial

	# ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AI4Code")
	# amps_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AMPS")
	# apache_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/ASFPublicMail")
	# books3_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Books3")
	# cp_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/CPDataset")
	# dmmath_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/DMMath")
	# discourse_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Discourse")
	# wiki_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Enwiki")
	# euro_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/EuroParliamentProceedings")
	# freelaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/FreeLaw_Options")
	# ghdiffs_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubDiff")
	# ghissues_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubIssues")
	# gutenberg_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Gutenberg")
	# leet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/LeetCode")
	# pileoflaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PileOfLaw")
	# pubmed_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PubMed")
	# s2orc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/S2ORC")
	# se_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/StackExchange")
	# usenet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USENET")
	# uspto_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USPTO")
	# ubuntuirc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/UbuntuIRC")
	# arxiv_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/arXiv")

	dataset_data = {
	"AI4Code": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"AMPS": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"ASFPublicMail": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"Books3": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"CPDataset": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"DMMath": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"Discourse": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"Enwiki": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"EuroParliamentProceedings": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"FreeLaw_Options": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"GitHubDiff": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"GitHubIssues": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"Gutenberg": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"LeetCode": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"PileOfLaw": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"PubMed": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"S2ORC": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"StackExchange": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"USENET": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"USPTO": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"UbuntuIRC": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	"arXiv": {
	# create fake data for the different ratios
	"word_rep_ratios": np.random.randn(1000),
	"char_rep_ratios": np.random.randn(1000),
	"flagged_word_ratios": np.random.randn(1000),
	"num_words": np.random.randint(0, 1000, 1000),
	},
	}

	def plt_plot(ratio, dataset, threshold):
	x = dataset_data[dataset][ratio]
	# calculate percentage of data that will be removed given threshold
	perc = np.sum(x < threshold) / len(x)
	# create a figure
	fig = plt.figure()
	# add a subplot
	ax = fig.add_subplot(111)
	# plot some data using black
	ax.hist(x, bins=50, color="black")
	# plot red dashed line at threshold
	ax.axvline(threshold, color='r', linestyle='dashed', linewidth=2)
	# set title
	# add percentage of data removed
	ax.set_title(f"{dataset} (removed {perc:.2%})")
	plt.xlabel("Value")
	plt.ylabel("Frequency")
	# make it look nice
	plt.tight_layout()
	return fig

	with gr.Blocks() as demo:
	dataset = gr.Radio(list(dataset_data.keys()), label="Dataset", value="arXiv")
	print(dataset.value)

	with gr.Tab("Character Repetition Ratio"):
	# plot some random data
	plot = gr.Plot()
	threshold = gr.Slider(minimum=0, maximum=100, label="Threshold")
	calculate = gr.Button("Calculate")
	plot_fn = partial(plt_plot, "word_rep_ratios")
	calculate.click(plot_fn, [dataset, threshold], plot)

	with gr.Tab("Word Repetition Ratio"):# plot some random data
	plot = gr.Plot()
	threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
	calculate = gr.Button("Calculate")
	plot_fn = partial(plt_plot, "char_rep_ratios")
	calculate.click(plot_fn, [dataset, threshold], plot)

	with gr.Tab("Flagged Word Ratio"):# plot some random data
	plot = gr.Plot()
	threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
	calculate = gr.Button("Calculate")
	plot_fn = partial(plt_plot, "flagged_word_ratios")
	calculate.click(plot_fn, [dataset, threshold], plot)

	if __name__ == "__main__":
	demo.launch(share=True)