import re

import requests
import pandas as pd
from bs4 import BeautifulSoup

#################### Spanish Wikipedia ####################

# Shared output schema for every extractor below.
COLUMNS = ['title', 'discussion_uncleaned', 'discussion', 'result_sentence',
           'result', 'text_url', 'discussion_url']

###############
# Title based #
###############

def extract_result_resultado(sentence):
    """Return the outcome word following 'RESULTADO:' or 'El resultado fue'."""
    match = re.search(r"(RESULTADO:|El resultado fue)\s*(\w+)", sentence, flags=re.IGNORECASE)
    return match.group(2).strip() if match else None

def extract_result(sentence):
    """Return the outcome word following 'se decidió' (e.g. 'se decidió BORRAR')."""
    match = re.search(r"se\s+decidió\s+(\w+)", sentence, flags=re.IGNORECASE)
    return match.group(1).strip() if match else None

def clean_comments_with_no_text_after_timestamp(content_div):
    """Drop signed list-item comments that carry no text after their
    (CET)/(CEST) signature timestamp."""
    for ol in content_div.find_all('ol'):
        for li in ol.find_all('li'):
            li_text = li.get_text(strip=True)
            if "(CEST)" in li_text or "(CET)" in li_text:
                # CES?T covers both (CET) and (CEST) signatures.
                match = re.search(r"\(CES?T\)\s*(.*)", li_text)
                if match:
                    after_timestamp = match.group(1).strip()
                    if not after_timestamp:
                        li.decompose()
                else:
                    li.decompose()
    return content_div
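# Illustrative checks for the two regex helpers; the sample sentences are
# invented, but mirror the closing phrasings the patterns target:
# extract_result_resultado("El resultado fue BORRAR.")    # -> 'BORRAR'
# extract_result_resultado("RESULTADO: MANTENER")         # -> 'MANTENER'
# extract_result("Tras debatir se decidió MANTENER.")     # -> 'MANTENER'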
def extract_cleaned_spanish_discussion_and_result(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=COLUMNS)

    soup = BeautifulSoup(response.content, 'html.parser')
    title = url.split('/')[-1].replace('_', ' ').replace(':', '')
    text_url = f"https://es.wikipedia.org/wiki/{url.split('/')[-1]}"
    discussion_url = url

    content_div = soup.find('div', class_='mw-content-ltr mw-parser-output')
    if not content_div:
        print("Error: Main discussion container not found")
        return pd.DataFrame(columns=COLUMNS)

    discussion_uncleaned = content_div.prettify()
    discussion = ''
    result_sentence = ''
    result = None

    try:
        # First look for an explicit closing paragraph such as
        # "El resultado fue ..." or "RESULTADO: ...".
        result_p = next(
            (p for p in content_div.find_all('p')
             if "El resultado fue" in p.get_text() or "RESULTADO:" in p.get_text()),
            None
        )
        if result_p:
            result_sentence = result_p.get_text(strip=True)
            bold_tag = result_p.find('b')
            if bold_tag:
                result = bold_tag.get_text(strip=True)
            else:
                match = re.search(r"(El resultado fue|RESULTADO:)\s*(.+?)\.",
                                  result_sentence, flags=re.IGNORECASE)
                result = match.group(2).strip() if match else None

        content_div = clean_comments_with_no_text_after_timestamp(content_div)
        discussion = "\n".join(
            part.get_text(strip=True)
            for part in content_div.find_all(recursive=False)
        )

        # Fall back to the result line inside a messagebox div.
        if not result:
            result_div = content_div.find('div', class_='messagebox')
            if result_div:
                result_dl = result_div.find('dl')
                if result_dl:
                    result_sentence = result_dl.get_text(strip=True)
                    result = extract_result(result_sentence)

        # Last resort: any paragraph mentioning "RESULTADO:" or "se decidió".
        # This also covers the bare "se decidió" case, so no separate pass
        # for it is needed.
        if not result and not result_sentence:
            result_p = next(
                (p for p in content_div.find_all('p')
                 if "RESULTADO:" in p.get_text() or "se decidió" in p.get_text()),
                None
            )
            if result_p:
                result_sentence = result_p.get_text(strip=True)
                result = extract_result(result_sentence)

        # Keep only the text after the "Votación" (voting) heading.
        if "Votación" in discussion:
            discussion = discussion.split("Votación", 1)[1].strip()
    except Exception as e:
        print(f"Error processing discussion: {e}")

    data = [[title, discussion_uncleaned, discussion, result_sentence, result,
             text_url, discussion_url]]
    df = pd.DataFrame(data, columns=COLUMNS)
    # If the extracted result is still a multi-word phrase, reduce it to the
    # single outcome word after "RESULTADO:"/"El resultado fue".
    df['result'] = df['result'].apply(
        lambda x: extract_result_resultado(x)
        if isinstance(x, str) and len(x.split()) > 1 else x
    )
    return df

# url = 'https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/!Hispahack'
# url = 'https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/:Country_Club_La_Planicie'
# df = extract_cleaned_spanish_discussion_and_result(url)
# df

###############
# Date based #
###############

def extract_result_date(sentence):
    """Return the outcome word following 'El resultado fue' or 'RESULTADO:'.
    Named distinctly so it does not shadow the title-based extract_result above."""
    match = re.search(r"(El resultado fue|RESULTADO:)\s*(\w+)", sentence, flags=re.IGNORECASE)
    return match.group(2).strip() if match else None

def extract_multiple_discussions(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=COLUMNS)

    soup = BeautifulSoup(response.content, 'html.parser')
    content_div = soup.find('div', class_='mw-content-ltr mw-parser-output')
    if not content_div:
        print("Error: Main discussion container not found")
        return pd.DataFrame(columns=COLUMNS)

    data = []
    headings = content_div.find_all('div', class_='mw-heading mw-heading3')
    for idx, heading in enumerate(headings):
        try:
            title_tag = heading.find('a', class_='new') or heading.find('a')
            if title_tag:
                title = title_tag.text.strip()
                text_url = f"https://es.wikipedia.org{title_tag['href']}"
            else:
                title = f"{url.split('/')[-1]}_{idx + 1}"
                text_url = f"https://es.wikipedia.org/wiki/{title}"

            # Walk backwards to the closest preceding "El resultado fue ..." line.
            previous_sibling = heading.find_previous_sibling()
            result_sentence = None
            result = None
            while previous_sibling:
                if previous_sibling.name == 'p' and "El resultado fue" in previous_sibling.get_text():
                    result_sentence = previous_sibling.get_text(separator=" ", strip=True)
                    result = extract_result_date(result_sentence)
                    break
                previous_sibling = previous_sibling.find_previous_sibling()

            if not result_sentence:
                # string= only matches paragraphs whose sole child is text.
                result_p = content_div.find('p', string=lambda text: text and "RESULTADO:" in text)
                if result_p:
                    result_sentence = result_p.get_text(strip=True)
                    result = extract_result_date(result_sentence)

            # Collect everything between this heading and the next mw-heading3
            # div; 'class' is multi-valued in BeautifulSoup, so membership is
            # tested against the parsed class list.
            discussion_html = ""
            current = heading.find_next_sibling()
            while current and not (current.name == 'div' and 'mw-heading3' in current.get('class', [])):
                discussion_html += str(current)
                current = current.find_next_sibling()

            discussion_uncleaned = discussion_html
            discussion = BeautifulSoup(discussion_html, 'html.parser').get_text(strip=True)
            data.append([title, discussion_uncleaned, discussion, result_sentence,
                         result, text_url, url])
        except Exception as e:
            print(f"Error processing heading: {e}")

    return pd.DataFrame(data, columns=COLUMNS)

# url = 'https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/Registro/10_de_septiembre_de_2009'
# df = extract_multiple_discussions(url)
# df
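# Minimal sketch of the sibling walk above on hand-written markup (the HTML
# here is invented for illustration; real pages carry more structure):
#
# html = '''<div class="mw-content-ltr mw-parser-output">
#   <div class="mw-heading mw-heading3"><a href="/wiki/A">A</a></div>
#   <p>comentario 1</p>
#   <div class="mw-heading mw-heading3"><a href="/wiki/B">B</a></div>
#   <p>comentario 2</p>
# </div>'''
# heading = BeautifulSoup(html, 'html.parser').find('div', class_='mw-heading mw-heading3')
# current = heading.find_next_sibling()
# while current and not (current.name == 'div' and 'mw-heading3' in current.get('class', [])):
#     print(current.get_text(strip=True))  # prints only 'comentario 1'
#     current = current.find_next_sibling()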
###############
# Collect ES #
###############

def collect_es(mode='title', title='', url='', date=''):
    if mode not in ['title', 'year', 'url']:
        raise ValueError("mode must be one of 'title', 'url', or 'year'")

    if mode == 'title':
        if not title or date or url:
            raise ValueError("For 'title' mode, 'title' must be provided and 'url'/'date' must be empty.")
        url = f"https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/{title}"
        df = extract_cleaned_spanish_discussion_and_result(url)
        if df.empty:
            print(f"No data found for url: {url}")
        return df

    elif mode == 'url':
        if not url or title or date:
            raise ValueError("For 'url' mode, 'url' must be provided and 'title'/'date' must be empty.")
        return extract_cleaned_spanish_discussion_and_result(url)

    elif mode == 'year':
        if not date or title or url:
            raise ValueError("For 'year' mode, 'date' must be provided and 'title'/'url' must be empty.")
        month_map = {
            '01': 'enero', '02': 'febrero', '03': 'marzo', '04': 'abril',
            '05': 'mayo', '06': 'junio', '07': 'julio', '08': 'agosto',
            '09': 'septiembre', '10': 'octubre', '11': 'noviembre', '12': 'diciembre'
        }
        match = re.match(r'(\d{2})/(\d{2})/(\d{4})$', date)
        if not match:
            raise ValueError("Date must be in the format dd/mm/yyyy")
        day, month, year = match.groups()
        if month not in month_map:
            raise ValueError("Invalid month in date")
        # Build e.g. '10_de_septiembre_de_2009' for the daily log page.
        date_str = f"{int(day)}_de_{month_map[month]}_de_{year}"
        url = f"https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/Registro/{date_str}"
        return extract_multiple_discussions(url)
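# Example calls, one per mode (reusing the sample pages referenced above):
# collect_es(mode='title', title='!Hispahack')
# collect_es(mode='url', url='https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/!Hispahack')
# collect_es(mode='year', date='10/09/2009')  # -> .../Registro/10_de_septiembre_de_2009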