# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.
import asyncio
import json

import aiohttp
import requests
from bs4 import BeautifulSoup


async def fetch(session, url):
"""Asynchronous function to fetch a URL using aiohttp."""
async with session.get(url) as response:
return await response.text()
async def async_match_acl_id_to_s2_paper(acl_id):
"""
Fetches the paper information from the Semantic Scholar API for the given ACL ID.
Args:
acl_id (str): The ACL ID of the paper to fetch.
Returns:
dict: A dictionary containing the paper information.
"""
url = f"https://api.semanticscholar.org/graph/v1/paper/ACL:{acl_id}"
async with aiohttp.ClientSession() as session:
res_text = await fetch(session, url)
return json.loads(res_text)
def extract_paper_info(paper_url):
"""
Extracts information about a paper from its ACL Anthology URL.
Args:
paper_url (str): The URL of the paper on the ACL Anthology website.
Returns:
dict: A dictionary containing the title, authors, and ACL Anthology ID of the paper.
"""
    html_doc = requests.get(paper_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")
    title = soup.find("h2", id="title").text.strip()
    authors = [
        a.text
        for a in soup.find_all("a")
        if a.parent.name == "p" and a.parent.get("class") == ["lead"]
    ]
    # Works with or without a trailing slash on the paper URL.
    acl_id = paper_url.rstrip("/").split("/")[-1]
    return {"title": title, "authors": authors, "acl_id": acl_id}


def extract_author_info(author_url):
"""
Extracts author information from the given author URL.
Args:
author_url (str): The URL of the author's page on ACL Anthology.
Returns:
dict: A dictionary containing the author's name and a list of their papers.
Each paper is represented as a dictionary with keys "title" and "url".
"""
    html_doc = requests.get(author_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")
    author_name = soup.find("h2", id="title").text.strip()
    paper_elements = soup.find_all("p")
    papers = []
    for paper in paper_elements:
        links = paper.find_all("a")
        # Skip the "pdf", "bib", and "abs" action links; the first remaining
        # link is the paper title.
        links = [
            link for link in links if link.text.strip() not in ["pdf", "bib", "abs"]
        ]
        if not links:
            continue
        title = links[0].text.strip()
        url = "https://aclanthology.org" + links[0]["href"]
        papers.append({"title": title, "url": url})
    return {"author": author_name, "papers": papers}


def extract_venue_info(venue_url):
"""
Extracts venue information from the given URL.
Args:
venue_url (str): The URL of the venue to extract information from.
Returns:
dict: A dictionary containing the venue name and a list of papers with their titles and URLs.
"""
    html_doc = requests.get(venue_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")
    venue_name = soup.find("h2", id="title").text.strip()
    paper_elements = soup.find_all("p")
    papers = []
    for paper in paper_elements:
        links = paper.find_all("a")
        # Skip the "pdf", "bib", and "abs" action links; the first remaining
        # link is the paper title.
        links = [
            link for link in links if link.text.strip() not in ["pdf", "bib", "abs"]
        ]
        if not links:
            continue
        title = links[0].text.strip()
        url = "https://aclanthology.org" + links[0]["href"]
        papers.append({"title": title, "url": url})
    return {"venue": venue_name, "papers": papers}


def determine_page_type(url):
"""
Determine the type of ACL Anthology page given its URL.
Args:
url (str): The URL to be checked.
Returns:
str: "paper", "author", or "venue". Returns None if the type can't be determined.
"""
    # Extract the non-empty path segments from the URL
    segments = [segment for segment in url.split("/") if segment]
    # Check if the URL points to an event (venue)
    if "events" in url or "volumes" in url:
        return "venue"
    # If the last segment looks like "2023.acl-long.1" it's a paper
    if segments and "." in segments[-1] and segments[-1].split(".")[0].isnumeric():
        return "paper"
    if "people" in url:
        return "author"
    # If none of the above rules apply, fetch the page and check its content
    try:
        html_doc = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html_doc, "html.parser")
        # Check for unique elements specific to each page type
        if soup.find("h2", id="title"):
            return (
                "author"
                if soup.find("a", href=True, string="Google Scholar")
                else "paper"
            )
        elif soup.find("h1", string="Anthology Volume"):
            return "venue"
    except Exception as e:
        print(f"Error determining page type: {e}")
    return None


if __name__ == "__main__":
    # asyncio.get_event_loop() is deprecated outside a running loop, so create one explicitly.
    loop = asyncio.new_event_loop()
    urls = [
        "https://aclanthology.org/2023.acl-long.1/",
        "https://aclanthology.org/people/a/anna-rogers/",
        "https://aclanthology.org/events/acl-2022/",
    ]
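    # Classify each URL once, then fetch the matching Semantic Scholar record(s).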
    for url in urls:
        page_type = determine_page_type(url)
        if page_type == "paper":
            print(f"Paper: {url}")
            res = extract_paper_info(url)
            paper = loop.run_until_complete(
                async_match_acl_id_to_s2_paper(res["acl_id"])
            )
            print(paper)
        elif page_type == "author":
            print(f"Author: {url}")
            res = extract_author_info(url)
            # Anthology hrefs end with "/", so the ACL ID is the second-to-last segment.
            tasks = [
                async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2])
                for paper in res["papers"]
            ]
            s2_papers = loop.run_until_complete(asyncio.gather(*tasks))
            for paper, s2_paper in zip(res["papers"], s2_papers):
                print(paper["title"], s2_paper.get("paperId"))
        elif page_type == "venue":
            print(f"Venue: {url}")
            res = extract_venue_info(url)
            tasks = [
                async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2])
                for paper in res["papers"]
            ]
            s2_papers = loop.run_until_complete(asyncio.gather(*tasks))
            for paper, s2_paper in zip(res["papers"], s2_papers):
                print(paper["title"], s2_paper.get("paperId"))
    loop.close()