hf-qa-demo / data /stackoverflow_python_dataset.py
KonradSzafer's picture
initial commit
c69cba4
raw
history blame
1.71 kB
from datetime import datetime
from datasets import load_dataset
from bs4 import BeautifulSoup
def preprocess_dataset():
    """
    Preprocesses the 'koutch/stackoverflow_python' dataset.

    Loads the 'train' split, keeps only rows whose 'question_score' is above
    100, whose 'answer_score' is above 5 and whose 'answer_date' year is
    after 2010, stores the text content of the HTML in 'question_body' and
    'answer_body' in new 'question' and 'answer' columns, and finally drops
    the score, date, id, tags and raw-body columns.

    Returns:
        datasets.arrow_dataset.Dataset: The preprocessed dataset.
    """
    dataset = load_dataset('koutch/stackoverflow_python', split='train')

    def is_relevant(example):
        # The `and` chain short-circuits, so the date is only parsed for rows
        # that already passed both score thresholds. 'answer_date' must be in
        # the '%Y-%m-%dT%H:%M:%SZ' form, otherwise strptime raises ValueError.
        return (
            example['question_score'] > 100
            and example['answer_score'] > 5
            and datetime.strptime(
                example['answer_date'], '%Y-%m-%dT%H:%M:%SZ'
            ).year > 2010
        )

    def html2text(html):
        # Concatenate every string node of the parsed markup, with no
        # separator, so only the text content survives.
        soup = BeautifulSoup(html, 'html.parser')
        # `find_all` is the current name of the legacy `findAll` alias.
        return ''.join(soup.find_all(string=True))

    def transforms(example):
        example['answer'] = html2text(example['answer_body'])
        example['question'] = html2text(example['question_body'])
        return example

    dataset = dataset.filter(is_relevant)
    dataset = dataset.map(transforms)
    dataset = dataset.remove_columns([
        'question_score', 'question_date', 'question_id',
        'answer_date', 'answer_id', 'answer_score', 'tags',
        'question_body', 'answer_body',
    ])
    return dataset
def show_info(dataset):
    """
    Print information about the dataset.

    Prints the dataset's ``info`` attribute followed by a blank line, then
    its length and, when the dataset is not empty, the 'question' and
    'answer' fields of its first row.

    Args:
        dataset (datasets.arrow_dataset.Dataset): The dataset.
    """
    print(dataset.info, '\n')
    num_rows = len(dataset)
    print(f'dataset len: {num_rows}')
    if num_rows == 0:
        # There is no first row to show; indexing row 0 would fail here.
        return
    # Fetch the first row once and reuse it for both fields.
    first_row = dataset[0]
    print(f"example question: {first_row['question']}")
    print(f"example answer: {first_row['answer']}")
if __name__ == '__main__':
    # Script entry point: build the preprocessed dataset, upload it to the
    # Hub as a public (private=False) repository, then print a short summary.
    preprocessed = preprocess_dataset()
    preprocessed.push_to_hub(
        'KonradSzafer/stackoverflow_python_preprocessed',
        private=False,
    )
    show_info(preprocessed)