from datetime import datetime

from datasets import load_dataset
from bs4 import BeautifulSoup


def preprocess_dataset():
    """
    Preprocesses the 'koutch/stackoverflow_python' dataset.

    Loads the 'train' split, keeps only rows whose question score is above
    100, whose answer score is above 5 and whose answer year is later than
    2010, adds plain-text 'question' and 'answer' columns derived from the
    HTML bodies, and finally drops the score/date/id/tag/body columns.

    Returns:
        datasets.arrow_dataset.Dataset: The preprocessed dataset.
    """
    dataset = load_dataset('koutch/stackoverflow_python', split='train')

    def is_relevant(example):
        """Return True for rows that pass the score and answer-year cut."""
        return (
            example['question_score'] > 100
            and example['answer_score'] > 5
            # The date is only parsed for rows that already passed both
            # score checks, thanks to short-circuit evaluation.
            and datetime.strptime(
                example['answer_date'], '%Y-%m-%dT%H:%M:%SZ'
            ).year > 2010
        )

    def html2text(markup):
        """Concatenate every string node of the parsed HTML, no separator."""
        soup = BeautifulSoup(markup, 'html.parser')
        # find_all is the supported spelling; findAll is a deprecated alias.
        return ''.join(soup.find_all(string=True))

    def transforms(example):
        """Add plain-text 'answer' and 'question' fields to one row."""
        example['answer'] = html2text(example['answer_body'])
        example['question'] = html2text(example['question_body'])
        return example

    dataset = dataset.filter(is_relevant)
    dataset = dataset.map(transforms)
    # The raw HTML bodies and the columns used only for filtering are no
    # longer needed once the text columns exist.
    dataset = dataset.remove_columns([
        'question_score',
        'question_date',
        'question_id',
        'answer_date',
        'answer_id',
        'answer_score',
        'tags',
        'question_body',
        'answer_body',
    ])
    return dataset


def show_info(dataset):
    """
    Print information about the dataset.

    Prints the dataset info, its length and, when at least one row exists,
    the question and answer of the first row.

    Args:
        dataset (datasets.arrow_dataset.Dataset): The dataset.
    """
    num_rows = len(dataset)
    print(dataset.info, '\n')
    print(f'dataset len: {num_rows}')
    if num_rows == 0:
        # Nothing to sample from; indexing row 0 would raise IndexError.
        return
    first_row = dataset[0]
    print(f"example question: {first_row['question']}")
    print(f"example answer: {first_row['answer']}")


if __name__ == '__main__':
    dataset = preprocess_dataset()
    dataset.push_to_hub(
        'KonradSzafer/stackoverflow_python_preprocessed',
        private=False,
    )
    show_info(dataset)