hf-qa-demo

Runtime error

hf-qa-demo/data/stackoverflow_python_dataset.py

initial commit

c69cba4 about 1 year ago

1.71 kB

	from datetime import datetime
	from datasets import load_dataset
	from bs4 import BeautifulSoup


	def preprocess_dataset(
	    *,
	    question_score_threshold: int = 100,
	    answer_score_threshold: int = 5,
	    answer_year_threshold: int = 2010,
	):
	    """
	    Preprocesses the 'koutch/stackoverflow_python' dataset.

	    Loads the 'train' split, keeps only rows that pass the score and date
	    filters, adds plain-text 'question' and 'answer' columns extracted from
	    the 'question_body' and 'answer_body' markup, and drops the raw body
	    and metadata columns listed at the end of the function.

	    Args:
	        question_score_threshold (int): Exclusive lower bound; a row is
	            kept only if its 'question_score' is strictly greater.
	        answer_score_threshold (int): Exclusive lower bound; a row is
	            kept only if its 'answer_score' is strictly greater.
	        answer_year_threshold (int): Exclusive lower bound; a row is kept
	            only if the year of its 'answer_date' is strictly greater.

	    Returns:
	        datasets.arrow_dataset.Dataset: The preprocessed dataset.

	    Raises:
	        ValueError: If an 'answer_date' that reaches the date check does
	            not match the '%Y-%m-%dT%H:%M:%SZ' format.
	    """
	    date_format = '%Y-%m-%dT%H:%M:%SZ'

	    def keep_example(example):
	        # The date is parsed last so that, thanks to short-circuiting,
	        # only rows that already passed both score checks are parsed.
	        return (
	            example['question_score'] > question_score_threshold
	            and example['answer_score'] > answer_score_threshold
	            and datetime.strptime(example['answer_date'], date_format).year
	            > answer_year_threshold
	        )

	    def html2text(html: str) -> str:
	        # Concatenate every string node of the parsed markup, dropping tags.
	        soup = BeautifulSoup(html, 'html.parser')
	        # find_all is the supported spelling; findAll is a deprecated alias.
	        return ''.join(soup.find_all(string=True))

	    def transforms(example):
	        example['answer'] = html2text(example['answer_body'])
	        example['question'] = html2text(example['question_body'])
	        return example

	    dataset = load_dataset('koutch/stackoverflow_python', split='train')
	    dataset = dataset.filter(keep_example)
	    dataset = dataset.map(transforms)
	    dataset = dataset.remove_columns([
	        'question_score', 'question_date', 'question_id',
	        'answer_date', 'answer_id', 'answer_score', 'tags',
	        'question_body', 'answer_body',
	    ])
	    return dataset


	def show_info(dataset):
	    """
	    Print a short summary of the dataset to standard output.

	    Prints, in order: the dataset's ``info`` attribute followed by a blank
	    line, the number of rows, and the 'question' and 'answer' fields of
	    the first row.

	    Args:
	        dataset (datasets.arrow_dataset.Dataset): The dataset.
	    """
	    summary = dataset.info
	    print(summary, '\n')

	    row_count = len(dataset)
	    print(f'dataset len: {row_count}')

	    # Fetched only after the size line so the output order is unchanged
	    # even if indexing fails.
	    first_row = dataset[0]
	    print(f"example question: {first_row['question']}")
	    print(f"example answer: {first_row['answer']}")


	if __name__ == '__main__':
	    # Script entry point: build the preprocessed dataset, push it to the
	    # Hub repository below with private=False, then print a summary.
	    hub_repo_id = 'KonradSzafer/stackoverflow_python_preprocessed'
	    preprocessed = preprocess_dataset()
	    preprocessed.push_to_hub(hub_repo_id, private=False)
	    show_info(preprocessed)