import requests
from bs4 import BeautifulSoup
import pandas as pd
import pysbd
import re

########################
## Year based search  ##
########################
BASE_URL = "https://www.wikidata.org/wiki/Wikidata:Requests_for_deletions/Archive"

def get_soup(url):
    """Fetch a page and return it as a parsed BeautifulSoup document."""
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def get_year_urls():
    """Collect the per-year archive index pages, keyed by year (2012-2024)."""
    soup = get_soup(BASE_URL)
    year_urls = {}
    for link in soup.select('a[href^="/wiki/Wikidata:Requests_for_deletions/Archive/"]'):
        year_url = link['href']
        # Keep only links whose path ends in a plain year, e.g. ".../Archive/2023".
        if year_url.endswith(tuple(str(year) for year in range(2012, 2025))):
            year = year_url.split('/')[-1]
            full_year_url = "https://www.wikidata.org" + year_url
            year_urls[year] = full_year_url
    return year_urls
def get_month_day_urls(year_url):
    """Collect every month/day archive page linked from a year archive page."""
    soup = get_soup(year_url)
    month_day_urls = []
    for link in soup.select('a[href^="/wiki/Wikidata:Requests_for_deletions/Archive/"]'):
        date_url = link['href']
        # Paths with at least 7 segments point at ".../Archive/<year>/<month>/<day>" pages.
        if len(date_url.split('/')) >= 7:
            full_date_url = "https://www.wikidata.org" + date_url
            if full_date_url not in month_day_urls:
                month_day_urls.append(full_date_url)
    return month_day_urls
def extract_outcome_from_dd(dd):
    """Read the bolded closing statement (e.g. 'Deleted') out of the final <dd>."""
    try:
        result_tag = dd.find('b')
        if result_tag:
            return result_tag.get_text().strip()
        return 'unknown'
    except Exception:
        return 'unknown'
def extract_discussions(url):
    """Parse one archived day page into a list of per-entity discussion records."""
    soup = get_soup(url)
    discussions = []
    for h2 in soup.find_all('h2'):
        title_tag = h2.find('a')
        if title_tag and 'Q' in title_tag.get_text():
            title = title_tag.get_text().strip()
            discussion_parts = []
            last_dd = None
            # Walk forward until the next section heading, collecting the discussion body.
            for sibling in h2.find_all_next():
                if sibling.name == 'h2':
                    break
                if sibling.name == 'p':
                    discussion_parts.append(sibling.get_text(separator=' ', strip=True))
                if sibling.name == 'dl':
                    dds = sibling.find_all('dd')
                    if dds:
                        for dd in dds[:-1]:
                            discussion_parts.append(dd.get_text(separator=' ', strip=True))
                        # The last <dd> usually carries the closing outcome.
                        last_dd = dds[-1]
            discussion_text = ' '.join(discussion_parts) if discussion_parts else 'No discussion found'
            outcome = extract_outcome_from_dd(last_dd) if last_dd else 'Outcome not found'
            entity_url = url + '#' + title
            discussions.append({
                "title": title,
                "discussion": discussion_text,
                "outcome": outcome,
                "url": entity_url,
                "date": url.split('Archive/')[-1]
            })
    return discussions
def remove_first_sentence_if_q_number(text):
    """Drop a leading sentence that is just the bare Q-identifier (e.g. 'Q12345')."""
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    if sentences and sentences[0].startswith('Q') and sentences[0][1:].isdigit():
        return ' '.join(sentences[1:])
    return text
def process_discussions_by_url_list(url_list):
    """Scrape a list of archive day URLs and return all discussions as a DataFrame."""
    all_discussions = []
    for url in url_list:
        discussions = extract_discussions(url)
        all_discussions.extend(discussions)
    df = pd.DataFrame(all_discussions)
    if not df.empty:
        df['discussion'] = df['discussion'].apply(remove_first_sentence_if_q_number)
    return df
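# Usage sketch (untested): build a discussion DataFrame for one archived day.
# The URL below only illustrates the expected ".../Archive/<year>/<month>/<day>"
# format; the actual result depends on what is currently archived on wikidata.org.
def _demo_year_based_scrape():
    sample_day_url = ("https://www.wikidata.org/wiki/"
                      "Wikidata:Requests_for_deletions/Archive/2023/01/01")
    day_df = process_discussions_by_url_list([sample_day_url])
    print(day_df[['title', 'outcome', 'date']].head())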
########################
## Title based search ##
########################
def html_to_plaintext(html_content):
    """Convert a discussion HTML fragment to plain text, one block element per line."""
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    # Do not strip here, otherwise the newline markers inserted above are discarded.
    text = soup.get_text(separator=' ')
    text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
    return text
def split_text_into_sentences(text):
    """Segment text into sentences with pysbd and re-join them with single spaces."""
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    return ' '.join(sentences)
def clean_discussion_tag(tag):
    """Strip markup noise (links, icons, nested templates) and return the tag's text."""
    for unwanted in tag.find_all(['span', 'img', 'a', 'div'], recursive=True):
        unwanted.decompose()
    return tag.get_text(separator=' ', strip=True)
def extract_outcome_from_text_elements(elements):
    """Return the first bolded closing keyword (e.g. 'Deleted', 'Kept') found in the elements."""
    consensus_keywords = [
        'Deleted', 'Delete', 'delete', 'deleted',
        'kept', 'keep', 'Keep', 'Kept',
        'merge', 'Merge', 'Not done', 'No consensus', 'no consensus'
    ]
    for el in elements:
        b_tags = el.find_all('b')
        for b in b_tags:
            if b.text.strip() in consensus_keywords:
                return b.text.strip()
    return ''
def extract_discussion_section(soup, title):
    """Locate the section whose heading id matches `title` and return
    (raw HTML, outcome label, cleaned text) for that discussion."""
    h2_tag = soup.find('h2', id=title)
    if not h2_tag:
        print(f"No heading found with id={title}")
        return '', '', ''
    heading_div = h2_tag.find_parent('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
    if not heading_div:
        print(f"No heading div found for {title}")
        return '', '', ''
    # Everything between this heading and the next one belongs to the discussion.
    next_heading_div = heading_div.find_next('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
    discussion_nodes = []
    for sibling in heading_div.next_siblings:
        if sibling == next_heading_div:
            break
        discussion_nodes.append(sibling)
    discussion_tags = []
    for node in discussion_nodes:
        if getattr(node, 'name', None) in ['p', 'ul', 'dl']:
            # Skip navigation templates, hidden nodes, and the anchor span itself.
            if node.has_attr('class') and 'plainlinks' in node['class']:
                continue
            if node.get('style', '').lower() == 'visibility:hidden;display:none':
                continue
            if node.find('span', id=title):
                continue
            discussion_tags.append(node)
    if not discussion_tags:
        return '', '', ''
    label = extract_outcome_from_text_elements(discussion_tags)
    # Serialise the HTML before cleaning, since clean_discussion_tag mutates the tree.
    discussion_html_parts = [str(tag) for tag in discussion_tags]
    cleaned_parts = []
    for tag in discussion_tags:
        text = clean_discussion_tag(tag)
        if text:
            cleaned_parts.append(text)
    cleaned_discussion = ' '.join(cleaned_parts)
    discussion_html = '\n'.join(discussion_html_parts)
    return discussion_html, label, cleaned_discussion
def extract_div_from_title(title, url=''):
    """Fetch the given page (or the live Requests_for_deletions page) and extract
    one discussion section by its heading id, returning a one-row DataFrame."""
    if not url:
        base_url = 'https://www.wikidata.org/wiki/Wikidata:Requests_for_deletions'
        url = base_url + '#' + title
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Could not fetch {url}")
        return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
    if title == '':
        title = url.split('#')[-1]
    soup = BeautifulSoup(response.content, 'html.parser')
    discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)
    text_url = url.split('#')[0]
    discussion_url = url
    df = pd.DataFrame([[title, text_url, discussion_url, cleaned_discussion, label]],
                      columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
    # Normalise the bolded closing keyword into one of the four outcome labels.
    if label:
        df['label'] = df['label'].replace({
            'Deleted': 'delete', 'Delete': 'delete', 'delete': 'delete', 'deleted': 'delete',
            'kept': 'keep', 'keep': 'keep', 'Keep': 'keep', 'Kept': 'keep',
            'merge': 'merge', 'Merge': 'merge', 'Not done': 'no_consensus',
            'No consensus': 'no_consensus', 'no consensus': 'no_consensus'
        })
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
    return df
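# Usage sketch (untested): fetch a single open deletion request by its section
# title. 'Q4115189' (the Wikidata sandbox item) is only a placeholder id; whether
# a matching section exists on the live Requests_for_deletions page depends on
# the current state of the wiki.
def _demo_title_lookup():
    df = extract_div_from_title('Q4115189')
    print(df[['title', 'label', 'discussion_url']].to_string(index=False))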
########################
## Collection function ##
########################
def collect_wikidata_entity(mode='year', title='', url='', years=[]):
    """Collect Wikidata deletion discussions for a single entity title, a specific
    discussion URL, or every archived request in a range of years."""
    if mode not in ['title', 'year', 'url']:
        raise ValueError("mode must be 'title', 'year', or 'url'")

    if mode == 'title':
        if not title or years:
            raise ValueError("For 'title' mode, 'title' must be provided and 'years' must be empty.")
        df = extract_div_from_title(title)
        df = df.rename(columns={'label': 'outcome', 'discussion_cleaned': 'discussion'})
        return df

    elif mode == 'url':
        if not url or title or years:
            raise ValueError("For 'url' mode, 'url' must be provided and 'title' and 'years' must be empty.")
        if 'Archive' in url:
            # Archived discussions live on per-day pages; scrape the page and keep
            # only the section matching the URL fragment.
            archived_url = url.split('#')[0]
            title = url.split('#')[-1].strip()
            disc_df = process_discussions_by_url_list([archived_url])
            disc_df['title'] = disc_df['title'].str.strip()
            df = disc_df[disc_df['title'] == title]
            print(f"Found {len(df)} discussions for title {title}")
            if df.empty:
                return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
            return df
        df = extract_div_from_title('', url)
        df = df.rename(columns={'label': 'outcome', 'discussion_cleaned': 'discussion'})
        return df

    elif mode == 'year':
        if title or not years:
            raise ValueError("For 'year' mode, 'years' must be provided and 'title' must be empty.")
        # A two-element list is treated as an inclusive [start, end] range.
        if isinstance(years, list) and len(years) == 2:
            start_year, end_year = years
            years = list(range(start_year, end_year + 1))
        elif isinstance(years, int):
            years = [years]
        df = pd.DataFrame()
        year_urls = get_year_urls()
        for year in years:
            print(f"Processing year: {year}")
            if str(year) not in year_urls:
                print(f"No URL found for year {year}")
                continue
            year_url = year_urls[str(year)]
            month_day_urls = get_month_day_urls(year_url)
            print(f"Found {len(month_day_urls)} month-day URLs for {year}")
            discussions_df = process_discussions_by_url_list(month_day_urls)
            if discussions_df.empty:
                continue
            discussions_df.rename(columns={'url': 'discussion_url', 'outcome': 'label', 'discussion': 'discussion_cleaned'}, inplace=True)
            discussions_df['text_url'] = year_url
            discussions_df['label'] = discussions_df['label'].replace({
                'Deleted': 'delete', 'Delete': 'delete', 'delete': 'delete', 'deleted': 'delete',
                'kept': 'keep', 'keep': 'keep', 'Keep': 'keep', 'Kept': 'keep',
                'merge': 'merge', 'Merge': 'merge', 'Not done': 'no_consensus',
                'No consensus': 'no_consensus', 'no consensus': 'no_consensus'
            })
            desired_columns = ['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label']
            for col in desired_columns:
                if col not in discussions_df.columns:
                    discussions_df[col] = ''
            discussions_df = discussions_df[desired_columns]
            df = pd.concat([df, discussions_df], ignore_index=True)
        df = df.rename(columns={'label': 'outcome', 'discussion_cleaned': 'discussion'})
        return df
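# Minimal usage sketch for the three collection modes. The Q-id, archive URL,
# and year range below are placeholders for illustration only; real output
# depends on what is currently published and archived on wikidata.org.
if __name__ == '__main__':
    # Title mode: one open request, looked up by its section title.
    # df_title = collect_wikidata_entity(mode='title', title='Q4115189')

    # URL mode: point directly at an archived discussion section.
    # df_url = collect_wikidata_entity(
    #     mode='url',
    #     url='https://www.wikidata.org/wiki/Wikidata:Requests_for_deletions/Archive/2023/01/01#Q4115189')

    # Year mode: scrape every archived day in an inclusive year range.
    df_years = collect_wikidata_entity(mode='year', years=[2023, 2023])
    print(df_years[['title', 'outcome', 'discussion_url']].head())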