Spaces:
Runtime error
Runtime error
File size: 1,709 Bytes
c69cba4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
from datetime import datetime
from datasets import load_dataset
from bs4 import BeautifulSoup
def preprocess_dataset():
    """
    Preprocesses the 'koutch/stackoverflow_python' dataset.

    Loads the 'train' split and keeps only the rows where the question
    score is above 100, the answer score is above 5 and the year of the
    answer date is later than 2010. The HTML in 'question_body' and
    'answer_body' is then reduced to its text content and stored in new
    'question' and 'answer' columns, after which the score, date, id, tag
    and body columns listed at the end of this function are removed.

    Returns:
        datasets.arrow_dataset.Dataset: The preprocessed dataset.
    """
    # Timestamp layout expected in the 'answer_date' column.
    answer_date_format = '%Y-%m-%dT%H:%M:%SZ'

    def is_relevant(example):
        # Row filter: popular question, well-received answer, answered
        # after 2010. The date is only parsed when both score checks pass.
        return (
            example['question_score'] > 100
            and example['answer_score'] > 5
            and datetime.strptime(
                example['answer_date'], answer_date_format
            ).year > 2010
        )

    def html2text(markup):
        # Join every string node of the parsed markup, dropping the tags.
        soup = BeautifulSoup(markup, 'html.parser')
        # find_all is the supported name; findAll is a deprecated alias.
        return ''.join(soup.find_all(string=True))

    def transforms(example):
        # Add plain-text versions of the answer and question bodies.
        example['answer'] = html2text(example['answer_body'])
        example['question'] = html2text(example['question_body'])
        return example

    dataset = load_dataset('koutch/stackoverflow_python', split='train')
    dataset = dataset.filter(is_relevant)
    # Pass the function directly; wrapping it in a lambda adds nothing.
    dataset = dataset.map(transforms)
    dataset = dataset.remove_columns([
        'question_score', 'question_date', 'question_id',
        'answer_date', 'answer_id', 'answer_score', 'tags',
        'question_body', 'answer_body',
    ])
    return dataset
def show_info(dataset):
    """
    Print a short summary of the dataset.

    Writes the dataset's ``info`` attribute, the number of rows, and the
    'question' and 'answer' fields of the first row to standard output.

    Args:
        dataset (datasets.arrow_dataset.Dataset): The dataset.
    """
    print(dataset.info, '\n')
    row_count = len(dataset)
    print(f'dataset len: {row_count}')
    # Look up the first row only after the summary lines are printed, so
    # an empty dataset still reports its info and length before failing.
    first_row = dataset[0]
    question_text = first_row['question']
    print(f"example question: {question_text}")
    answer_text = first_row['answer']
    print(f"example answer: {answer_text}")
if __name__ == '__main__':
    # Script entry point: build the preprocessed dataset, upload it to the
    # Hub as a public repository, then print a summary of the result.
    processed = preprocess_dataset()
    processed.push_to_hub(
        'KonradSzafer/stackoverflow_python_preprocessed',
        private=False,
    )
    show_info(processed)
|