from functools import lru_cache, partial
from typing import Optional

import gradio as gr
import httpx
from cytoolz import groupby
from rich import print


def query_author(author_name: str):
    """Search the Semantic Scholar API for authors matching `author_name`."""
    url = "https://api.semanticscholar.org/graph/v1/author/search"
    params = {
        "query": author_name,
        "fields": "name,url,externalIds,papers.externalIds,papers.title,papers.year",
    }
    # passing params separately lets httpx URL-encode names with spaces etc.
    resp = httpx.get(url, params=params)
    resp.raise_for_status()
    return resp.json()["data"]


def get_arxiv_paper(papers):
    """Keep only papers that carry an ArXiv ID in their external IDs."""
    papers_with_externalIds = [paper for paper in papers if paper.get("externalIds")]
    return [
        paper for paper in papers_with_externalIds if paper["externalIds"].get("ArXiv")
    ]


def check_arxiv_in_papers(arxiv_ids, papers):
    """Return True if any paper in `papers` has an ArXiv ID in `arxiv_ids`."""
    papers_with_arxiv_ids = get_arxiv_paper(papers)
    return any(
        paper
        for paper in papers_with_arxiv_ids
        if paper["externalIds"].get("ArXiv") in arxiv_ids
    )


def get_author_from_options(potential_authors, positive_arxiv_ids):
    """Pick the first candidate author who wrote one of the given ArXiv papers."""
    return next(
        (
            author
            for author in potential_authors
            if check_arxiv_in_papers(set(positive_arxiv_ids), author["papers"])
        ),
        None,
    )


def sort_by_date(papers):
    # `year` can be missing in Semantic Scholar data; treat None as 0 so the sort never fails
    return sorted(papers, key=lambda paper: paper["year"] or 0, reverse=True)


@lru_cache()
def lookup_hf_paper(arxiv_id):
    """Fetch a paper from the Hugging Face papers API (cached per ArXiv ID)."""
    url = f"https://huggingface.co/api/papers/{arxiv_id}"
    resp = httpx.get(url)
    return resp.json()


def check_if_index_hf_paper(paper):
    """Return True if the paper is already indexed on Hugging Face papers."""
    arxiv_id = paper["externalIds"]["ArXiv"]
    data = lookup_hf_paper(arxiv_id)
    # the API responds with an "error" key when the paper is not indexed
    return not data.get("error")


def groupby_indexed_by_hf_papers(papers):
    """Split papers into {True: indexed on Hugging Face, False: not indexed}."""
    return groupby(check_if_index_hf_paper, papers)


def check_hf_user_in_authors(paper, hf_user_name):
    """Return True if `hf_user_name` appears among the paper's claimed authors."""
    authors = paper["authors"]
    authors = [author for author in authors if author.get("user")]
    return any(author["user"]["user"] == hf_user_name for author in authors)


def groupby_hf_user_papers(papers, hf_user_name):
    """Split indexed papers into {True: already claimed by the user, False: unclaimed}."""
    check_hf_user_in_authors_partial = partial(
        check_hf_user_in_authors, hf_user_name=hf_user_name
    )
    return groupby(check_hf_user_in_authors_partial, papers)


def get_papers(
    author_name: str, positive_arxiv_ids: str, hf_user_name: Optional[gr.OAuthProfile]
):
    if not hf_user_name:
        raise gr.Error("You must be logged in to use this Space")
    if not positive_arxiv_ids:
        raise gr.Error("You must enter at least one ArXiv ID")
    hf_user_name = hf_user_name.preferred_username
    # parse the comma-separated ArXiv IDs, stripping whitespace
    positive_arxiv_ids = [arxiv_id.strip() for arxiv_id in positive_arxiv_ids.split(",")]
    potential_authors = query_author(author_name)
    if not potential_authors:
        raise gr.Error("No authors found with that name")
    author = get_author_from_options(potential_authors, positive_arxiv_ids)
    if author is None:
        raise gr.Error("Couldn't match an author to those ArXiv IDs")
    papers = get_arxiv_paper(author["papers"])
    papers = sort_by_date(papers)
    papers_indexed_by_hf = groupby_indexed_by_hf_papers(papers)
    # print(papers_indexed_by_hf[True])
    indexed_papers = [
        lookup_hf_paper(paper["externalIds"]["ArXiv"])
        for paper in papers_indexed_by_hf.get(True, [])
    ]
    already_claimed = groupby_hf_user_papers(indexed_papers, hf_user_name)
    if already_claimed.get(False):
        results = (
            "# Papers already indexed by Hugging Face which you haven't claimed\n"
            + "These papers are already indexed by Hugging Face, but you haven't"
            " claimed them yet. You can claim them by clicking on the link to the"
            " paper and then clicking on your name in the author list.\n"
        )
        for paper in already_claimed[False]:
            url = f"https://huggingface.co/papers/{paper['id']}"
            results += f"- [{paper['title']}]({url})\n"
    else:
        results = "You have claimed all papers indexed by Hugging Face!\n"
    if papers_indexed_by_hf.get(False):
        results += "# Papers not yet indexed by Hugging Face which you can claim\n"
        for paper in papers_indexed_by_hf[False]:
            paper_title = paper["title"]
            arxiv_id = paper["externalIds"]["ArXiv"]
            url = f"https://huggingface.co/papers/{arxiv_id}"
            results += f"- [{paper_title}]({url})\n"
    return results


def get_name(hf_user_name: Optional[gr.OAuthProfile] = None):
    return hf_user_name.name if hf_user_name else ""


with gr.Blocks() as demo:
    gr.HTML(
        "
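# The source breaks off inside the gr.HTML call above. A minimal sketch of how
# the remaining Blocks UI could be wired is shown below; the components,
# labels, and layout are assumptions for illustration, not the original
# Space's interface. Only get_papers and get_name come from the source.
#
#     gr.LoginButton()  # hypothetical: OAuth login so gr.OAuthProfile is populated
#     author_name = gr.Textbox(label="Author name on Semantic Scholar")  # hypothetical
#     arxiv_ids = gr.Textbox(label="ArXiv IDs you authored (comma separated)")  # hypothetical
#     results = gr.Markdown()
#     # Gradio injects gr.OAuthProfile arguments automatically for logged-in
#     # users, so only the two textboxes are passed as inputs.
#     gr.Button("Find my papers").click(
#         get_papers, inputs=[author_name, arxiv_ids], outputs=results
#     )
#
# demo.launch()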