Spaces:
Runtime error
Runtime error
File size: 6,195 Bytes
32c6187 0741973 32c6187 0741973 32c6187 0741973 32c6187 0741973 32c6187 cbd65c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import json
from typing import Any, Dict, List, Optional, Union
import gradio as gr
import httpx
from cachetools import TTLCache, cached
from gradio_client import Client
from toolz import groupby
# Time-to-live (seconds) for the recommendation caches below.
CACHE_TIME = 60 * 60 * 1 # 1 hour
# Client for the Space that extracts arXiv ids from a Hub collection slug.
client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")
# NOTE(review): ttl=10 seconds looks like a leftover debug value — the other
# caches in this file use CACHE_TIME (1 hour); confirm the short TTL is intended.
@cached(cache=TTLCache(maxsize=500, ttl=10))
def get_arxiv_ids_from_slug(
    slug: str,
) -> Dict[str, Union[None, Dict[str, Dict[str, Union[List[str], List[str]]]]]]:
    """Fetch the arXiv ids found in a collection via the extractor Space.

    `client.predict` returns a local file path; the file contains the JSON
    payload, presumably keyed by repo type (cf. `map_repo_name_to_api_key`)
    — verify against the extractor Space's output.
    """
    result = client.predict(slug, api_name="/predict")
    with open(result) as f:
        data = json.load(f)
    return data
def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
    """Prefix an arXiv id so the Semantic Scholar API can resolve it."""
    return "ArXiv:" + arxiv_id
def format_ids(data, exclude_keys: Optional[list[str]] = None) -> list[str]:
    """Collect arXiv ids from the extractor payload for Semantic Scholar.

    Args:
        data: Mapping of repo-type key -> (None or mapping of repo name ->
            {"arxiv_ids": [...]}) as produced by `get_arxiv_ids_from_slug`.
        exclude_keys: Repo-type keys to drop before collecting ids.

    Returns:
        "ArXiv:"-prefixed ids; empty list when nothing remains after the
        exclusions.
    """
    if exclude_keys is not None:
        data = {k: v for k, v in data.items() if k not in exclude_keys}
    # everything may have been excluded (or the payload was empty)
    if not data:
        return []
    arxiv_ids: list[str] = []
    for repo in data.values():
        # repo types with no papers are reported as None
        if repo is None:
            continue
        for item in repo.values():
            arxiv_ids.extend(item["arxiv_ids"])
    # fix: original comprehension used `id` as the loop variable,
    # shadowing the builtin
    return [format_arxiv_id_for_semantic_scholar(arxiv_id) for arxiv_id in arxiv_ids]
@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(paper_ids: tuple[str, ...]):
    """Request up to 10 paper recommendations from Semantic Scholar.

    Args:
        paper_ids: Tuple of "ArXiv:<id>" strings (a tuple so the TTL cache
            can hash the argument). Fix: annotation was `tuple[str]`, which
            means a 1-tuple; `tuple[str, ...]` is the variadic form.

    Returns:
        The parsed JSON response; on success it contains "recommendedPapers".
    """
    # debug print() calls from the original were removed
    r = httpx.post(
        "https://api.semanticscholar.org/recommendations/v1/papers/",
        json={
            "positivePaperIds": list(paper_ids),
        },
        params={"fields": "externalIds,title,year", "limit": 10},
        timeout=30,
    )
    # NOTE(review): no raise_for_status() here — the caller treats a missing
    # "recommendedPapers" key as failure; confirm non-200 bodies parse as JSON.
    return r.json()
def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    """Return True when a recommendation carries an ArXiv external id."""
    external_ids = recommendation["externalIds"]
    return external_ids.get("ArXiv") is not None
def group_by_is_arxiv_paper(
    recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
    """Partition recommendations by whether each one has an ArXiv id.

    Equivalent to `toolz.groupby(is_arxiv_paper, recommendations)`: a plain
    dict of lists, preserving input order within each group.
    """
    grouped: Dict[bool, List[Dict[str, Any]]] = {}
    for recommendation in recommendations:
        grouped.setdefault(is_arxiv_paper(recommendation), []).append(recommendation)
    return grouped
def format_recommendation_into_markdown(
    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
):
    """Render grouped recommendations as a markdown reading list.

    Papers with an ArXiv id (key True) are linked to their Hugging Face
    Papers page; the rest (key False) are listed as plain titles.
    """
    parts = ["The following papers were recommended by the Semantic Scholar API \n\n"]
    hub_papers = grouped_recommendations.get(True)
    if hub_papers:
        parts.append("## Papers available on Hugging Face Papers:\n\n")
        for paper in hub_papers:
            url = f"https://huggingface.co/papers/{paper['externalIds']['ArXiv']}"
            parts.append(f"* [{paper['title']}]({url}) ({paper['year']})\n")
    remaining = grouped_recommendations.get(False)
    if remaining:
        parts.append("\n\n## Other papers:\n\n")
        for paper in remaining:
            parts.append(f"* {paper['title']} ({paper['year']})\n")
    return "".join(parts)
def map_repo_name_to_api_key(repo_name: str) -> str:
    """Translate a UI repo-type name to the extractor payload's key.

    Raises KeyError for unknown repo types.
    """
    mapping = {
        "datasets": "dataset papers",
        "models": "model papers",
        "papers": "papers",
    }
    return mapping[repo_name]
def get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[list[str]] = None
):
    """Entry point wired to the Gradio interface.

    Converts `excluded_repo_types` to a hashable tuple so the TTL-cached
    helper can be memoized on it.

    Fix: the original called `tuple(excluded_repo_types)` unconditionally,
    which raises TypeError when the Gradio multiselect is left empty and
    passes None. An empty/None selection now maps to an empty tuple, which
    the helper treats the same as "no exclusions".
    """
    excluded = tuple(excluded_repo_types or ())
    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded)
@cached(cache=TTLCache(maxsize=500, ttl=60))
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[tuple[str, ...]] = None
):
    """Build the markdown reading list for a collection slug.

    Args:
        slug: Hub collection slug, e.g. "user/name-<id>".
        excluded_repo_types: UI repo-type names ("datasets", "models",
            "papers") to exclude, as a tuple so this function is cacheable.

    Returns:
        Markdown string: either the recommendation list or an explanatory
        message when no papers were found.

    Raises:
        gr.Error: when the Semantic Scholar response has no
            "recommendedPapers" key.
    """
    data = get_arxiv_ids_from_slug(slug)
    if excluded_repo_types:
        # map UI names ("models") to the payload's keys ("model papers")
        excluded_repo_types = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
        print(f"excluded_repo_types_remapped={excluded_repo_types}")
    ids = format_ids(data, exclude_keys=excluded_repo_types)
    if not ids:
        # Fix: the original did ','.join(excluded_repo_types) unguarded;
        # excluded_repo_types is None when nothing was excluded, and
        # ','.join(None) raises TypeError.
        excluded_description = ",".join(excluded_repo_types or [])
        return (
            "Based on your collection and exclusions"
            f" ({excluded_description}), there are no papers to recommend. Try"
            " removing some excluded repo types or adding more items to your"
            " collection."
        )
    recommendations = get_recommendations_from_semantic_scholar(tuple(ids))
    recommended_papers = recommendations.get("recommendedPapers")
    if recommended_papers is None:
        raise gr.Error("Something went wrong with the Semantic Scholar API")
    grouped = group_by_is_arxiv_paper(recommended_papers)
    return format_recommendation_into_markdown(grouped)
# UI copy shown above the interface.
# NOTE(review): the "π" characters look like mojibake for an emoji in the
# original title — confirm the intended glyph before changing it.
title = """π Collections Reading List Generator π"""
# Fix: corrected the user-facing typo "fromm" -> "from" in the last line.
description = """<img src="https://huggingface.co/datasets/librarian-bots/images/raw/main/Mascot%20Bookie.svg"
alt="Mascot Bookie" width="200" style="float:left; margin-right:20px; margin-bottom:20px;">
\n\n
Hugging Face Collections allow you to curate models, datasets, spaces,
and papers from the Hugging Face Hub.
This Space will generate a reading list based on the items in your collection.
This can be a great way to find related papers to the models and datasets in your collection and dive more deeply into a topic!
The Space works by:
- finding any papers in your collection
- finding papers related to the models and datasets in your collection
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers.
You can optionally exclude certain repo types from consideration when generating the reading list.
"""
# Textbox for the collection slug, e.g. "<user>/<collection-name>-<hash>".
slug_input = gr.Textbox(
    lines=1,
    label="Collection Slug",
    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)
# Example rows for the interface: [slug, excluded repo types].
example_slugs = [
    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]
# Wire the UI to get_recommendations_from_slug and launch the app.
# launch() blocks; this runs at import time, as is conventional for a Space.
gr.Interface(
    get_recommendations_from_slug,
    inputs=[
        slug_input,
        # Multiselect of repo types whose papers are ignored; may be None/empty.
        gr.Dropdown(
            label="Repos to exclude from contributing to recommendations",
            choices=["datasets", "models", "papers"],
            multiselect=True,
        ),
    ],
    outputs="markdown",
    description=description,
    title=title,
    # NOTE(review): allow_flagging is deprecated in Gradio 4.x in favor of
    # flagging_mode — confirm the pinned Gradio version.
    allow_flagging="never",
    examples=example_slugs,
).launch()
|