Spaces:
Runtime error
Runtime error
File size: 6,195 Bytes
32c6187 0741973 32c6187 0741973 32c6187 0741973 32c6187 0741973 32c6187 cbd65c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import json
from typing import Any, Dict, List, Optional, Union
import gradio as gr
import httpx
from cachetools import TTLCache, cached
from gradio_client import Client
from toolz import groupby
# Time-to-live (seconds) for the recommendation caches below.
CACHE_TIME = 60 * 60 * 1 # 1 hour
# Client for the Space that extracts arXiv ids from a Hub collection slug.
client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")
# NOTE(review): ttl=10 seconds looks like a leftover debug value — the other
# caches in this file use CACHE_TIME (1 hour); confirm the short TTL is intended.
@cached(cache=TTLCache(maxsize=500, ttl=10))
def get_arxiv_ids_from_slug(
    slug: str,
) -> Dict[str, Union[None, Dict[str, Dict[str, Union[List[str], List[str]]]]]]:
    """Fetch the arXiv ids found in a collection via the extractor Space.

    `client.predict` returns a local file path; the file contains the JSON
    payload, presumably keyed by repo type (cf. `map_repo_name_to_api_key`)
    — verify against the extractor Space's output.
    """
    result = client.predict(slug, api_name="/predict")
    with open(result) as f:
        data = json.load(f)
    return data
def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
    """Prefix an arXiv id so the Semantic Scholar API can resolve it."""
    return "ArXiv:" + arxiv_id
def format_ids(data, exclude_keys: Optional[list[str]] = None) -> list[str]:
    """Collect arXiv ids from the extractor payload for Semantic Scholar.

    Args:
        data: Mapping of repo-type key -> (None or mapping of repo name ->
            {"arxiv_ids": [...]}) as produced by `get_arxiv_ids_from_slug`.
        exclude_keys: Repo-type keys to drop before collecting ids.

    Returns:
        "ArXiv:"-prefixed ids; empty list when nothing remains after the
        exclusions.
    """
    if exclude_keys is not None:
        data = {k: v for k, v in data.items() if k not in exclude_keys}
    # everything may have been excluded (or the payload was empty)
    if not data:
        return []
    arxiv_ids: list[str] = []
    for repo in data.values():
        # repo types with no papers are reported as None
        if repo is None:
            continue
        for item in repo.values():
            arxiv_ids.extend(item["arxiv_ids"])
    # fix: original comprehension used `id` as the loop variable,
    # shadowing the builtin
    return [format_arxiv_id_for_semantic_scholar(arxiv_id) for arxiv_id in arxiv_ids]
@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(paper_ids: tuple[str, ...]):
    """Request up to 10 paper recommendations from Semantic Scholar.

    Args:
        paper_ids: Tuple of "ArXiv:<id>" strings (a tuple so the TTL cache
            can hash the argument). Fix: annotation was `tuple[str]`, which
            means a 1-tuple; `tuple[str, ...]` is the variadic form.

    Returns:
        The parsed JSON response; on success it contains "recommendedPapers".
    """
    # debug print() calls from the original were removed
    r = httpx.post(
        "https://api.semanticscholar.org/recommendations/v1/papers/",
        json={
            "positivePaperIds": list(paper_ids),
        },
        params={"fields": "externalIds,title,year", "limit": 10},
        timeout=30,
    )
    # NOTE(review): no raise_for_status() here — the caller treats a missing
    # "recommendedPapers" key as failure; confirm non-200 bodies parse as JSON.
    return r.json()
def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    """Return True when a recommendation carries an ArXiv external id."""
    external_ids = recommendation["externalIds"]
    return external_ids.get("ArXiv") is not None
def group_by_is_arxiv_paper(
    recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
    """Partition recommendations by whether each one has an ArXiv id.

    Equivalent to `toolz.groupby(is_arxiv_paper, recommendations)`: a plain
    dict of lists, preserving input order within each group.
    """
    grouped: Dict[bool, List[Dict[str, Any]]] = {}
    for recommendation in recommendations:
        grouped.setdefault(is_arxiv_paper(recommendation), []).append(recommendation)
    return grouped
def format_recommendation_into_markdown(
    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
):
    """Render grouped recommendations as a markdown reading list.

    Papers with an ArXiv id (key True) are linked to their Hugging Face
    Papers page; the rest (key False) are listed as plain titles.
    """
    parts = ["The following papers were recommended by the Semantic Scholar API \n\n"]
    hub_papers = grouped_recommendations.get(True)
    if hub_papers:
        parts.append("## Papers available on Hugging Face Papers:\n\n")
        for paper in hub_papers:
            url = f"https://huggingface.co/papers/{paper['externalIds']['ArXiv']}"
            parts.append(f"* [{paper['title']}]({url}) ({paper['year']})\n")
    remaining = grouped_recommendations.get(False)
    if remaining:
        parts.append("\n\n## Other papers:\n\n")
        for paper in remaining:
            parts.append(f"* {paper['title']} ({paper['year']})\n")
    return "".join(parts)
def map_repo_name_to_api_key(repo_name: str) -> str:
    """Translate a UI repo-type name to the extractor payload's key.

    Raises KeyError for unknown repo types.
    """
    mapping = {
        "datasets": "dataset papers",
        "models": "model papers",
        "papers": "papers",
    }
    return mapping[repo_name]
def get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[list[str]] = None
):
    """Entry point wired to the Gradio interface.

    Converts `excluded_repo_types` to a hashable tuple so the TTL-cached
    helper can be memoized on it.

    Fix: the original called `tuple(excluded_repo_types)` unconditionally,
    which raises TypeError when the Gradio multiselect is left empty and
    passes None. An empty/None selection now maps to an empty tuple, which
    the helper treats the same as "no exclusions".
    """
    excluded = tuple(excluded_repo_types or ())
    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded)
@cached(cache=TTLCache(maxsize=500, ttl=60))
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[tuple[str, ...]] = None
):
    """Build the markdown reading list for a collection slug.

    Args:
        slug: Hub collection slug, e.g. "user/name-<id>".
        excluded_repo_types: UI repo-type names ("datasets", "models",
            "papers") to exclude, as a tuple so this function is cacheable.

    Returns:
        Markdown string: either the recommendation list or an explanatory
        message when no papers were found.

    Raises:
        gr.Error: when the Semantic Scholar response has no
            "recommendedPapers" key.
    """
    data = get_arxiv_ids_from_slug(slug)
    if excluded_repo_types:
        # map UI names ("models") to the payload's keys ("model papers")
        excluded_repo_types = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
        print(f"excluded_repo_types_remapped={excluded_repo_types}")
    ids = format_ids(data, exclude_keys=excluded_repo_types)
    if not ids:
        # Fix: the original did ','.join(excluded_repo_types) unguarded;
        # excluded_repo_types is None when nothing was excluded, and
        # ','.join(None) raises TypeError.
        excluded_description = ",".join(excluded_repo_types or [])
        return (
            "Based on your collection and exclusions"
            f" ({excluded_description}), there are no papers to recommend. Try"
            " removing some excluded repo types or adding more items to your"
            " collection."
        )
    recommendations = get_recommendations_from_semantic_scholar(tuple(ids))
    recommended_papers = recommendations.get("recommendedPapers")
    if recommended_papers is None:
        raise gr.Error("Something went wrong with the Semantic Scholar API")
    grouped = group_by_is_arxiv_paper(recommended_papers)
    return format_recommendation_into_markdown(grouped)
# UI copy shown above the interface.
# NOTE(review): the "π" characters look like mojibake for an emoji in the
# original title — confirm the intended glyph before changing it.
title = """π Collections Reading List Generator π"""
# Fix: corrected the user-facing typo "fromm" -> "from" in the last line.
description = """<img src="https://huggingface.co/datasets/librarian-bots/images/raw/main/Mascot%20Bookie.svg"
alt="Mascot Bookie" width="200" style="float:left; margin-right:20px; margin-bottom:20px;">
\n\n
Hugging Face Collections allow you to curate models, datasets, spaces,
and papers from the Hugging Face Hub.
This Space will generate a reading list based on the items in your collection.
This can be a great way to find related papers to the models and datasets in your collection and dive more deeply into a topic!
The Space works by:
- finding any papers in your collection
- finding papers related to the models and datasets in your collection
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers.
You can optionally exclude certain repo types from consideration when generating the reading list.
"""
# Textbox for the collection slug, e.g. "<user>/<collection-name>-<hash>".
slug_input = gr.Textbox(
    lines=1,
    label="Collection Slug",
    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)
# Example rows for the interface: [slug, excluded repo types].
example_slugs = [
    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]
# Wire the UI to get_recommendations_from_slug and launch the app.
# launch() blocks; this runs at import time, as is conventional for a Space.
gr.Interface(
    get_recommendations_from_slug,
    inputs=[
        slug_input,
        # Multiselect of repo types whose papers are ignored; may be None/empty.
        gr.Dropdown(
            label="Repos to exclude from contributing to recommendations",
            choices=["datasets", "models", "papers"],
            multiselect=True,
        ),
    ],
    outputs="markdown",
    description=description,
    title=title,
    # NOTE(review): allow_flagging is deprecated in Gradio 4.x in favor of
    # flagging_mode — confirm the pinned Gradio version.
    allow_flagging="never",
    examples=example_slugs,
).launch()
|