import json
from typing import Any, Dict, List, Optional, Tuple, Union

import gradio as gr
import httpx
from cachetools import TTLCache, cached
from gradio_client import Client
from toolz import groupby

CACHE_TIME = 60 * 60 * 1  # 1 hour

client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")


@cached(cache=TTLCache(maxsize=500, ttl=10))
def get_arxiv_ids_from_slug(
    slug: str,
) -> Dict[str, Optional[Dict[str, Dict[str, List[str]]]]]:
    """Fetch the arXiv IDs associated with a collection via the extractor Space."""
    result = client.predict(slug, api_name="/predict")
    with open(result) as f:
        return json.load(f)


def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
    return f"ArXiv:{arxiv_id}"


def format_ids(
    data: Dict[str, Any], exclude_keys: Optional[List[str]] = None
) -> List[str]:
    """Flatten the extractor output into Semantic Scholar paper IDs."""
    arxiv_ids = []
    if exclude_keys is not None:
        data = {k: v for k, v in data.items() if k not in exclude_keys}
        # the exclusions may have removed everything
        if not data:
            return []
    for repo in data.values():
        if repo is None:
            continue
        for item in repo.values():
            arxiv_ids.extend(item["arxiv_ids"])
    # format for Semantic Scholar, e.g. "ArXiv:<id>"
    return [format_arxiv_id_for_semantic_scholar(arxiv_id) for arxiv_id in arxiv_ids]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(paper_ids: Tuple[str, ...]):
    """Request up to 10 recommendations for the given papers.

    Takes a tuple rather than a list so the argument is hashable for caching.
    """
    paper_ids = list(paper_ids)
    print(paper_ids)
    r = httpx.post(
        "https://api.semanticscholar.org/recommendations/v1/papers/",
        json={
            "positivePaperIds": paper_ids,
        },
        params={"fields": "externalIds,title,year", "limit": 10},
        timeout=30,
    )
    print(r.text)
    return r.json()
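
# For reference: judging only by how the response is consumed below, a
# successful reply from the recommendations endpoint looks roughly like the
# following (field values are illustrative placeholders, not real data):
#
#   {
#       "recommendedPapers": [
#           {
#               "externalIds": {"ArXiv": "<arxiv id>"},
#               "title": "<paper title>",
#               "year": 2023,
#           },
#           ...
#       ]
#   }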

def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    return recommendation["externalIds"].get("ArXiv") is not None


def group_by_is_arxiv_paper(
    recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
    return groupby(is_arxiv_paper, recommendations)


def format_recommendation_into_markdown(
    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
) -> str:
    comment = "The following papers were recommended by the Semantic Scholar API \n\n"
    arxiv_papers = grouped_recommendations.get(True)
    if arxiv_papers:
        comment += "## Papers available on Hugging Face Papers:\n\n"
        for r in arxiv_papers:
            hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
            comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
    other_papers = grouped_recommendations.get(False)
    if other_papers:
        comment += "\n\n## Other papers:\n\n"
        for r in other_papers:
            comment += f"* {r['title']} ({r['year']})\n"
    return comment


def map_repo_name_to_api_key(repo_name: str) -> str:
    # map the UI repo-type names onto the keys used in the extractor output
    return {
        "datasets": "dataset papers",
        "models": "model papers",
        "papers": "papers",
    }[repo_name]


def get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[List[str]] = None
):
    # convert the (unhashable) list to a tuple so the cached helper can be called;
    # guard against None, which tuple() would reject
    excluded_repo_types = tuple(excluded_repo_types) if excluded_repo_types else None
    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded_repo_types)


@cached(cache=TTLCache(maxsize=500, ttl=60))
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[Tuple[str, ...]] = None
):
    data = get_arxiv_ids_from_slug(slug)
    if excluded_repo_types:
        excluded_repo_types = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
        print(f"excluded_repo_types_remapped={excluded_repo_types}")
    ids = format_ids(data, exclude_keys=excluded_repo_types)
    if not ids:
        exclusions = ", ".join(excluded_repo_types) if excluded_repo_types else "none"
        return (
            f"Based on your collection and exclusions ({exclusions}), there are no"
            " papers to recommend. Try removing some excluded repo types or adding"
            " more items to your collection."
        )
    recommendations = get_recommendations_from_semantic_scholar(tuple(ids))
    recommendations = recommendations.get("recommendedPapers")
    if recommendations is None:
        raise gr.Error("Something went wrong with the Semantic Scholar API")
    grouped = group_by_is_arxiv_paper(recommendations)
    return format_recommendation_into_markdown(grouped)


title = """📚 Collections Reading List Generator 📚"""
description = """Hugging Face Collections allow you to curate models, datasets, spaces, and papers from the Hugging Face Hub.

This Space will generate a reading list based on the items in your collection. This can be a great way to find papers related to the models and datasets in your collection and to dive more deeply into a topic!

The Space works by:

- finding any papers in your collection
- finding papers related to the models and datasets in your collection
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers

You can optionally exclude certain repo types from consideration when generating the reading list.
"""

slug_input = gr.Textbox(
    lines=1,
    label="Collection Slug",
    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)

example_slugs = [
    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]

gr.Interface(
    get_recommendations_from_slug,
    inputs=[
        slug_input,
        gr.Dropdown(
            label="Repos to exclude from contributing to recommendations",
            choices=["datasets", "models", "papers"],
            multiselect=True,
        ),
    ],
    outputs="markdown",
    description=description,
    title=title,
    allow_flagging="never",
    examples=example_slugs,
).launch(debug=True)
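
# A minimal sketch of calling the generator directly, handy for testing without
# launching the UI; the slug is one of the examples above and the exclusion
# list is illustrative:
#
#   markdown = get_recommendations_from_slug(
#       "merve/video-classification-models-6509edd0a6f657faa425e8c3",
#       excluded_repo_types=["datasets"],
#   )
#   print(markdown)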