"""Gradio Space that recommends similar papers via the Semantic Scholar API."""

import json
import os
from functools import lru_cache
from pathlib import Path

import gradio as gr
import requests
from bs4 import BeautifulSoup
from cachetools import TTLCache, cached
from dotenv import load_dotenv
from httpx import Client
from huggingface_hub import CommitScheduler

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")

# Cache API responses for six hours
CACHE_TIME = 60 * 60 * 6

# Reusable HTTP client for fetching Hugging Face paper pages
client = Client()

REPO_ID = "librarian-bots/paper-recommendations-v2"

# Commit logged comments from the local `comments` folder to the dataset
# repo every five minutes
scheduler = CommitScheduler(
    repo_id=REPO_ID,
    repo_type="dataset",
    folder_path="comments",
    path_in_repo="data",
    every=5,
    token=HF_TOKEN,
)


def parse_arxiv_id_from_paper_url(url: str) -> str:
    """Extract the arXiv ID from a Hugging Face paper URL."""
    return url.split("/")[-1]

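# Example: parse_arxiv_id_from_paper_url("https://huggingface.co/papers/2309.12307")
# returns "2309.12307".

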
@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(semantic_scholar_id: str):
    """Fetch up to ten recommended papers from the Semantic Scholar API."""
    try:
        r = requests.post(
            "https://api.semanticscholar.org/recommendations/v1/papers/",
            json={
                "positivePaperIds": [semantic_scholar_id],
            },
            params={"fields": "externalIds,title,year", "limit": 10},
        )
        return r.json()["recommendedPapers"]
    except KeyError as e:
        raise gr.Error(
            "Error getting recommendations; if this is a new paper, it may not yet"
            " have been indexed by Semantic Scholar."
        ) from e

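# Each entry in `recommendedPapers` carries the fields requested above
# (externalIds, title, year); `externalIds` maps source names such as
# "ArXiv" to identifiers.

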
def filter_recommendations(recommendations, max_paper_count=5):
    """Keep only recommendations that have an arXiv ID, capped at max_paper_count."""
    arxiv_papers = [
        r for r in recommendations if r["externalIds"].get("ArXiv") is not None
    ]
    return arxiv_papers[:max_paper_count]

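# Recommendations whose `externalIds` lack an "ArXiv" key cannot be linked
# to a Hugging Face paper page, so they are dropped before formatting.

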
@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_paper_title_from_arxiv_id(arxiv_id):
    """Look up a paper's title via the Hugging Face papers API."""
    try:
        return requests.get(f"https://huggingface.co/api/papers/{arxiv_id}").json()[
            "title"
        ]
    except Exception as e:
        print(f"Error getting paper title for {arxiv_id}: {e}")
        raise gr.Error(f"Error getting paper title for {arxiv_id}: {e}") from e


def format_recommendation_into_markdown(arxiv_id, recommendations):
    """Render the recommendations as a Markdown list of Hub paper links."""
    comment = "The following papers were recommended by the Semantic Scholar API\n\n"
    for r in recommendations:
        hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
        comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
    return comment


def format_comment(result: str):
    result = (
        "This is an automated message from the [Librarian Bot](https://huggingface.co/librarian-bots). I found the following papers similar to this paper. \n\n"
        + result
    )
    result += "\n\n Please give a thumbs up to this comment if you found it helpful!"
    result += "\n\n If you want recommendations for any paper on Hugging Face, check out [this](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) Space"
    result += "\n\n You can directly ask Librarian Bot for paper recommendations by tagging it in a comment: `@librarian-bot recommend`"
    return result


def post_comment(
    paper_url: str, comment: str, token: str | None = None, base_url: str | None = None
) -> bool:
    """Post a comment to a Hugging Face paper page; return True on success."""
    if not base_url:
        base_url = "https://huggingface.co"
    paper_id = paper_url.split("/")[-1]
    url = f"{base_url}/api/papers/{paper_id}/comment"
    comment_data = {"comment": comment}
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    response = requests.post(url, json=comment_data, headers=headers)
    if response.status_code == 201:
        print(f"Comment posted successfully for {paper_url}!")
        return True
    print(f"Failed to post comment! (Status Code: {response.status_code})")
    print(response.text)
    return False

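# Posting requires a token with write access; the Hub responds with
# HTTP 201 when the comment is created.

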
@lru_cache(maxsize=500)
def is_comment_from_librarian_bot(html: str) -> bool:
    """
    Checks if the given HTML contains a comment from the librarian-bot.

    Args:
        html (str): The HTML content to check.

    Returns:
        bool: True if a comment from the librarian-bot is found, False otherwise.
    """
    soup = BeautifulSoup(html, "lxml")
    librarian_bot_links = soup.find_all("a", string="librarian-bot")
    return bool(librarian_bot_links)

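# lru_cache keys on the full HTML string, so repeated checks of an
# unchanged page skip re-parsing.

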
def check_if_lib_bot_comment_exists(paper_url: str) -> bool:
    """
    Checks if a comment from the librarian bot exists for a given paper URL.

    Args:
        paper_url (str): The URL of the paper.

    Returns:
        bool: True if a comment from the librarian bot exists, False otherwise.
    """
    try:
        resp = client.get(paper_url)
        return is_comment_from_librarian_bot(resp.text)
    except Exception as e:
        print(f"Error checking if comment exists for {paper_url}: {e}")
        # Assume a comment exists on error so we never risk double-posting
        return True


def log_comments(paper_url: str, comment: str):
    """
    Logs comments for a given paper URL.

    Args:
        paper_url (str): The URL of the paper.
        comment (str): The comment to be logged.

    Returns:
        None
    """
    paper_id = paper_url.split("/")[-1]
    file_path = Path(f"comments/{paper_id}.json")
    if not file_path.exists():
        # Hold the scheduler lock so a scheduled commit can't run mid-write
        with scheduler.lock:
            with open(file_path, "w") as f:
                data = {"paper_url": paper_url, "comment": comment}
                json.dump(data, f)

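# Files written to `comments/` are picked up by the CommitScheduler and
# committed to the dataset repo on its next scheduled run.

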
def return_recommendations(url: str, post_to_paper: bool = True) -> str:
    """Get recommendations for a paper URL and optionally post them as a comment."""
    arxiv_id = parse_arxiv_id_from_paper_url(url)
    recommendations = get_recommendations_from_semantic_scholar(f"ArXiv:{arxiv_id}")
    filtered_recommendations = filter_recommendations(recommendations)
    if post_to_paper:
        if check_if_lib_bot_comment_exists(url):
            gr.Info("Librarian Bot has already commented on this paper; skipping comment")
        else:
            comment = format_comment(
                format_recommendation_into_markdown(arxiv_id, filtered_recommendations)
            )
            if post_comment(url, comment, token=HF_TOKEN):
                log_comments(url, comment)
                gr.Info("Comment posted successfully")
            else:
                gr.Info("Failed to post comment")
    return format_recommendation_into_markdown(arxiv_id, filtered_recommendations)


title = "Semantic Scholar Paper Recommender"
description = (
    "Paste a link to a paper on Hugging Face Papers and get recommendations for similar"
    " papers from Semantic Scholar. **Note**: Some papers may not have recommendations"
    " yet if they are new or have not been indexed by Semantic Scholar."
)
examples = [
    ["https://huggingface.co/papers/2309.12307", False],
    ["https://huggingface.co/papers/2211.10086", False],
]
interface = gr.Interface(
    return_recommendations,
    [
        gr.Textbox(lines=1),
        gr.Checkbox(label="Post recommendations to Paper page?", value=False),
    ],
    gr.Markdown(),
    examples=examples,
    title=title,
    description=description,
)
interface.queue()
interface.launch()