import requests
from bs4 import BeautifulSoup
import pandas as pd
import pysbd
import re
#####################
# Utility functions #
#####################
def html_to_plaintext(html_content):
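    """Convert an HTML fragment to plain text: mark block-level tags (p, li, dd,
    dl, ul) and <br> elements with newlines, extract the text, and drop empty lines."""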
soup = BeautifulSoup(html_content, 'html.parser')
for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
tag.insert_before('\n')
tag.insert_after('\n')
for br in soup.find_all('br'):
br.replace_with('\n')
text = soup.get_text(separator=' ', strip=True)
text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
return text
def split_text_into_sentences(text):
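    """Segment text into sentences with pysbd and re-join them with single spaces."""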
seg = pysbd.Segmenter(language="en", clean=False)
sentences = seg.segment(text)
return ' '.join(sentences)
def process_html_to_plaintext(df):
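    """Add a 'discussion_cleaned' column holding the plain-text version of 'discussion'."""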
if df.empty:
return df
if 'discussion' in df.columns:
df['discussion_cleaned'] = df['discussion'].apply(html_to_plaintext)
return df
def process_split_text_into_sentences(df):
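    """Re-segment 'discussion_cleaned' into space-joined sentences via pysbd."""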
if df.empty:
return df
if 'discussion_cleaned' in df.columns:
df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
return df
###########################
# Year-based extraction #
###########################
def extract_outcome_from_div(div):
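    """Scan the <dd> tags of a discussion div for a bolded consensus keyword
    (optionally signalled by an X_mark.svg icon). Returns the keyword,
    'no consensus' if none is found, or 'unknown' on error."""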
try:
consensus_keywords = ['Deleted', 'Delete', 'delete', 'deleted', 'kept', 'keep', 'Keep', 'Kept', 'merge', 'Merge', 'Not done', 'No consensus', 'no consensus']
dd_tags = div.find_all('dd')
for dd in dd_tags:
b_tag = dd.find('b')
if b_tag and b_tag.text.strip() in consensus_keywords:
return b_tag.text.strip()
img_tag = dd.find('img')
if img_tag and 'X_mark.svg' in img_tag.get('src', ''):
next_b_tag = dd.find_next('b')
if next_b_tag and next_b_tag.text.strip() in consensus_keywords:
return next_b_tag.text.strip()
return 'no consensus'
except Exception as e:
print(f"Error extracting outcome: {e}")
return 'unknown'
def extract_cleaned_discussion(div):
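    """Join the text of every <li>/<dd> item in the div after removing span,
    img, and anchor tags."""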
discussion_parts = []
discussion_items = div.find_all(['li', 'dd'])
for item in discussion_items:
for tag in item.find_all(['span', 'img', 'a']):
tag.decompose()
cleaned_text = item.get_text(separator=' ', strip=True)
discussion_parts.append(cleaned_text)
return ' '.join(discussion_parts)
def extract_div_contents_with_additional_columns(url):
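    """Scrape all archived deletion-discussion divs from a page and return a
    DataFrame with title, text_url, deletion_discussion (raw HTML), label,
    confirmation, discussion (cleaned text), and verdict columns."""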
response = requests.get(url)
if response.status_code != 200:
        return pd.DataFrame(columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
soup = BeautifulSoup(response.content, 'html.parser')
divs = soup.find_all('div', class_='boilerplate metadata discussion-archived mw-archivedtalk')
if len(divs) == 0:
print(f"No discussions found in {url}. Please check the structure.")
data = []
for i, div in enumerate(divs):
try:
heading_div = div.find_previous('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
if heading_div:
h2_tag = heading_div.find('h2')
if h2_tag:
                    section_id = h2_tag.get('id')
                    if section_id:
                        text_url = url + '#' + section_id
                        title = section_id.replace('(page does not exist)', '').strip()
                    else:
                        title = "Unknown Title"
                        text_url = "Unknown URL"
else:
title = "Unknown Title"
text_url = "Unknown URL"
else:
# fallback for rare cases
title = "Unknown Title"
text_url = "Unknown URL"
deletion_discussion = div.prettify()
label = extract_outcome_from_div(div)
cleaned_discussion = extract_cleaned_discussion(div)
            parts = deletion_discussion.split('\n')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '\n' + parts[1] if len(parts) > 1 else ''
data.append([title, text_url, deletion_discussion, label, '', cleaned_discussion, verdict])
except Exception as e:
print(f"Error processing div #{i} in {url}: {e}")
continue
df = pd.DataFrame(data, columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
return df
def scrape_wikidata_deletions(wikidata_url):
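    """Scrape a yearly archive: try the twelve month sub-pages first, and fall
    back to the year page itself if no month-specific pages are found."""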
months_data = []
month_found = False
for month in range(1, 13):
month_url = f"{wikidata_url}/{month}"
print(f"Processing month: {month}")
response = requests.get(month_url)
if response.status_code == 200:
df = extract_div_contents_with_additional_columns(month_url)
if not df.empty:
df = process_html_to_plaintext(df)
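                # Drop the first pysbd segment (typically the nomination/header line) and keep the rest.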
df['discussion_cleaned'] = df['discussion_cleaned'].apply(lambda x: ' '.join(pysbd.Segmenter(language="en", clean=False).segment(x)[1:]) if x else x)
months_data.append(df)
month_found = True
else:
print(f"No month-specific page found for {month_url}.")
if month_found and months_data:
all_data = pd.concat(months_data, ignore_index=True)
return all_data
print(f"Attempting year-based extraction for base URL: {wikidata_url}")
df = extract_div_contents_with_additional_columns(wikidata_url)
if not df.empty:
df = process_html_to_plaintext(df)
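        # Drop the first pysbd segment (typically the nomination/header line) and keep the rest.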
df['discussion_cleaned'] = df['discussion_cleaned'].apply(lambda x: ' '.join(pysbd.Segmenter(language="en", clean=False).segment(x)[1:]) if x else x)
return df
print("No data found using month-specific or year-based extraction.")
return pd.DataFrame()
############################
# Title-based extraction #
############################
def extract_outcome_from_text_elements(elements):
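    """Return the first bolded consensus keyword found in the elements, or '' if none."""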
consensus_keywords = [
'Deleted', 'Delete', 'delete', 'deleted',
'kept', 'keep', 'Keep', 'Kept',
'merge', 'Merge', 'Not done', 'No consensus', 'no consensus'
]
for el in elements:
b_tags = el.find_all('b')
for b in b_tags:
if b.text.strip() in consensus_keywords:
return b.text.strip()
return ''
def clean_discussion_tag(tag):
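    """Remove span, img, anchor, and div descendants from a tag and return its text."""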
for unwanted in tag.find_all(['span', 'img', 'a', 'div'], recursive=True):
unwanted.decompose()
return tag.get_text(separator=' ', strip=True)
def extract_discussion_section(soup, title):
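    """Collect the discussion that follows the heading with the given id, up to the
    next section heading. Returns (discussion_html, label, cleaned_discussion), with
    empty strings when the section cannot be located."""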
h2_tag = soup.find('h2', id=title)
if not h2_tag:
print(f"No heading found with id={title}")
return '', '', ''
heading_div = h2_tag.find_parent('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
if not heading_div:
print(f"No heading div found for {title}")
return '', '', ''
next_heading_div = heading_div.find_next('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
discussion_nodes = []
for sibling in heading_div.next_siblings:
if sibling == next_heading_div:
break
discussion_nodes.append(sibling)
discussion_tags = []
for node in discussion_nodes:
if getattr(node, 'name', None) in ['p', 'ul', 'dl']:
if node.find('span', id=title) or node.get('style', '').lower() == 'visibility:hidden;display:none':
continue
discussion_tags.append(node)
if not discussion_tags:
return '', '', ''
label = extract_outcome_from_text_elements(discussion_tags)
discussion_html_parts = [str(tag) for tag in discussion_tags]
cleaned_parts = []
for tag in discussion_tags:
text = clean_discussion_tag(tag)
if text:
cleaned_parts.append(text)
cleaned_discussion = ' '.join(cleaned_parts)
discussion_html = '\n'.join(discussion_html_parts)
return discussion_html, label, cleaned_discussion
def extract_div_from_title(url, title):
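    """Scrape a single discussion section by its heading id and return a one-row
    DataFrame with title, text_url, discussion_url, discussion_cleaned, and label."""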
response = requests.get(url)
if response.status_code != 200:
print(f"Could not fetch {url}")
return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
soup = BeautifulSoup(response.content, 'html.parser')
discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)
text_url = 'https://www.wikidata.org/wiki/Wikidata:Properties_for_deletion'
discussion_url = text_url + '#' + title
data = [[title, text_url, discussion_url, cleaned_discussion, label]]
df = pd.DataFrame(data, columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
return df
############################
# Unified collect function #
############################
def collect_wikidata(mode='year', title='', url='', years=[]):
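    """Unified entry point for collecting Wikidata property-deletion discussions.

    mode='title': scrape one discussion section from the Properties_for_deletion page.
    mode='url':   scrape every archived discussion on an explicit page URL.
    mode='year':  scrape yearly archives; 'years' is a single year or a [start, end] pair.

    Labels are normalized to delete / keep / merge / no_consensus.
    """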
    if mode not in ['title', 'year', 'url']:
        raise ValueError("mode must be one of 'title', 'year', or 'url'.")
if mode == 'title':
if not title or years:
raise ValueError("For 'title' mode, 'title' must be provided and 'years' must be empty.")
url = 'https://www.wikidata.org/wiki/Wikidata:Properties_for_deletion#' + title
df = extract_div_from_title(url, title)
if not df.empty and 'label' in df.columns and df['label'].notnull().any():
df['label'] = df['label'].replace({
'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete',
'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep',
'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus',
'No consensus':'no_consensus', 'no consensus':'no_consensus'
})
df = df.rename(columns={'discussion_cleaned':'discussion'})
return df
elif mode == 'url':
        if not url or title or years:
            raise ValueError("For 'url' mode, 'url' must be provided, and 'title' and 'years' must be empty.")
df = extract_div_contents_with_additional_columns(url)
if not df.empty and 'label' in df.columns and df['label'].notnull().any():
df['label'] = df['label'].replace({
'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete',
'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep',
'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus',
'No consensus':'no_consensus', 'no consensus':'no_consensus'
})
else:
            raise ValueError("No data found for the provided URL.")
df = df.rename(columns={'discussion_cleaned':'discussion'})
return df
elif mode == 'year':
if title or not years:
raise ValueError("For 'year' mode, 'years' must be provided and 'title' must be empty.")
if isinstance(years, list) and len(years) == 2:
start_year, end_year = years
years = list(range(start_year, end_year + 1))
elif isinstance(years, int):
years = [years]
df = pd.DataFrame()
for year in years:
wikidata_url = f'https://www.wikidata.org/wiki/Wikidata:Properties_for_deletion/Archive/{year}'
deletions_df = scrape_wikidata_deletions(wikidata_url)
if deletions_df.empty:
continue
columns_to_drop = ['confirmation', 'discussion', 'verdict', 'deletion_discussion']
deletions_df = deletions_df.drop(columns=[col for col in columns_to_drop if col in deletions_df.columns], errors='ignore')
if 'label' in deletions_df.columns:
deletions_df['label'] = deletions_df['label'].replace({
'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete',
'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep',
'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus',
'No consensus':'no_consensus', 'no consensus':'no_consensus'
})
if 'text_url' in deletions_df.columns:
deletions_df.rename(columns={'text_url':'discussion_url'}, inplace=True)
deletions_df['text_url'] = wikidata_url
if 'label' not in deletions_df.columns:
deletions_df['label'] = ''
for col in ['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label']:
if col not in deletions_df.columns:
deletions_df[col] = ''
deletions_df = deletions_df[['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label']]
deletions_df['year'] = year
df = pd.concat([df, deletions_df], ignore_index=True)
df = df.rename(columns={'discussion_cleaned':'discussion'})
return df
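
# ---------------------------------------------------------------------------
# Example usage (illustrative sketch only): the year, archive URL, and property
# title below are placeholders, not values required by the scraper.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Year mode: scrape the monthly (or yearly) archives for a placeholder year.
    year_df = collect_wikidata(mode='year', years=[2023])
    print(year_df.head())

    # URL mode: scrape one archive page directly (placeholder URL).
    url_df = collect_wikidata(
        mode='url',
        url='https://www.wikidata.org/wiki/Wikidata:Properties_for_deletion/Archive/2023'
    )
    print(url_df.head())

    # Title mode: scrape a single ongoing discussion by its section id (placeholder title).
    title_df = collect_wikidata(mode='title', title='P1234')
    print(title_df.head())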