import gradio as gr
import requests
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed

API_URL = "https://huggingface.co/api/daily_papers"
REPOS_API_URL_TEMPLATE = "https://huggingface.co/api/arxiv/{arxiv_id}/repos"


class PaperManager:
    def __init__(self, papers_per_page=30):
        self.papers_per_page = papers_per_page
        self.current_page = 1
        self.papers = []
        self.total_pages = 1
        self.sort_method = "hot"  # Default sort method
        self.raw_papers = []  # To store fetched data

    def calculate_score(self, paper):
        """
        Calculate the score of a paper based on upvotes and age.
        This mimics the "hotness" algorithm used by platforms like Hacker News.
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
        try:
            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
        except ValueError:
            # If parsing fails, use current time to minimize the impact on sorting
            published_time = datetime.now(timezone.utc)

        time_diff = datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600  # Convert time difference to hours

        # Avoid division by zero and apply the hotness formula
        score = upvotes / ((time_diff_hours + 2) ** 1.5)
        return score
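    # A quick, illustrative check of the formula above (the numbers are
    # hypothetical, not from the source). The "+ 2" offset keeps very fresh
    # papers from producing a near-zero denominator, and the 1.5 exponent lets
    # age eventually outweigh raw upvotes, as in Hacker News-style ranking:
    #   >>> 120 / ((2 + 2) ** 1.5)               # 120 upvotes, 2 hours old
    #   15.0
    #   >>> round(120 / ((24 + 2) ** 1.5), 2)    # same paper a day later
    #   0.91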
    def fetch_repos_counts(self, arxiv_id):
        """
        Fetch the repositories (models, datasets, Spaces) associated with a given arxiv_id.
        Returns a dictionary with counts for each type.
        """
        if not arxiv_id:
            print("Empty arxiv_id provided.")
            return {'models': 0, 'datasets': 0, 'spaces': 0}
        try:
            print(f"Fetching repositories for arxiv_id: {arxiv_id}")
            # Timeout added so a stalled connection cannot hang a worker thread.
            response = requests.get(REPOS_API_URL_TEMPLATE.format(arxiv_id=arxiv_id), timeout=30)
            response.raise_for_status()
            data = response.json()

            # Debugging: Print the fetched data
            print(f"Repositories data for {arxiv_id}: {data}")

            counts = {'models': 0, 'datasets': 0, 'spaces': 0}
            for repo in data:
                repo_type = repo.get('type', '').strip().lower()
                print(f"Repo type found: {repo_type}")  # Debugging
                # Accept singular and plural spellings defensively, since the
                # exact value of 'type' in this endpoint's payload may vary.
                if repo_type in ('model', 'models'):
                    counts['models'] += 1
                elif repo_type in ('dataset', 'datasets'):
                    counts['datasets'] += 1
                elif repo_type in ('space', 'spaces'):
                    counts['spaces'] += 1
                else:
                    print(f"Unknown repo type: {repo_type}")  # Debugging unknown types
            print(f"Counts for {arxiv_id}: {counts}")  # Debugging
            return counts
        except requests.RequestException as e:
            print(f"HTTP error fetching repos for arxiv_id {arxiv_id}: {e}")
            return {'models': 0, 'datasets': 0, 'spaces': 0}
        except ValueError as e:
            print(f"JSON decoding error for arxiv_id {arxiv_id}: {e}")
            return {'models': 0, 'datasets': 0, 'spaces': 0}
        except Exception as e:
            print(f"Unexpected error fetching repos for arxiv_id {arxiv_id}: {e}")
            return {'models': 0, 'datasets': 0, 'spaces': 0}

    def fetch_papers(self):
        try:
            response = requests.get(f"{API_URL}?limit=100", timeout=30)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # Store raw data

            def get_arxiv_id(paper):
                # Fall back to the paper's 'id' field when 'arxiv_id' is absent;
                # 'id' is what builds the https://huggingface.co/papers/... URL
                # in format_paper below, so it appears to hold the arXiv identifier.
                meta = paper.get('paper', {})
                return meta.get('arxiv_id') or meta.get('id', '')

            # Debugging: Print some arxiv_ids
            for paper in self.raw_papers[:5]:
                print(f"Sample arxiv_id: {get_arxiv_id(paper)}")

            # Fetch repos counts concurrently
            with ThreadPoolExecutor(max_workers=20) as executor:
                future_to_paper = {
                    executor.submit(self.fetch_repos_counts, get_arxiv_id(paper)): paper
                    for paper in self.raw_papers
                }
                for future in as_completed(future_to_paper):
                    paper = future_to_paper[future]
                    counts = future.result()
                    paper['models'] = counts['models']
                    paper['datasets'] = counts['datasets']
                    paper['spaces'] = counts['spaces']

            self.sort_papers()
            self.total_pages = max((len(self.papers) + self.papers_per_page - 1) // self.papers_per_page, 1)
            self.current_page = 1
            return True
        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False
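    # The total_pages expression above is ceiling division, sketched here with
    # hypothetical counts:
    #   >>> (100 + 30 - 1) // 30          # 100 papers, 30 per page -> 4 pages
    #   4
    #   >>> max((0 + 30 - 1) // 30, 1)    # an empty list still yields 1 page
    #   1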
    def sort_papers(self):
        if self.sort_method == "hot":
            self.papers = sorted(
                self.raw_papers,
                key=lambda x: self.calculate_score(x),
                reverse=True
            )
        elif self.sort_method == "new":
            self.papers = sorted(
                self.raw_papers,
                key=lambda x: x.get('publishedAt', ''),
                reverse=True
            )
        elif self.sort_method == "most_models":
            self.papers = sorted(
                self.raw_papers,
                key=lambda x: x.get('models', 0),
                reverse=True
            )
        elif self.sort_method == "most_datasets":
            self.papers = sorted(
                self.raw_papers,
                key=lambda x: x.get('datasets', 0),
                reverse=True
            )
        elif self.sort_method == "most_spaces":
            self.papers = sorted(
                self.raw_papers,
                key=lambda x: x.get('spaces', 0),
                reverse=True
            )
        else:
            # Default to hot if unknown sort method
            self.papers = sorted(
                self.raw_papers,
                key=lambda x: self.calculate_score(x),
                reverse=True
            )
        print(f"Papers sorted by {self.sort_method}")  # Debug

    def set_sort_method(self, method):
        valid_methods = ["hot", "new", "most_models", "most_datasets", "most_spaces"]
        if method not in valid_methods:
            method = "hot"
        print(f"Setting sort method to: {method}")  # Debug
        self.sort_method = method
        self.sort_papers()
        self.current_page = 1
        return True  # Assume success

    def format_paper(self, paper, rank):
        title = paper.get('title', 'No title')
        paper_id = paper.get('paper', {}).get('id', '')
        url = f"https://huggingface.co/papers/{paper_id}"
        authors = ', '.join([author.get('name', '') for author in paper.get('paper', {}).get('authors', [])]) or 'Unknown'
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        comments = paper.get('numComments', 0)
        published_time_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
        try:
            published_time = datetime.fromisoformat(published_time_str.replace('Z', '+00:00'))
        except ValueError:
            published_time = datetime.now(timezone.utc)
        time_diff = datetime.now(timezone.utc) - published_time
        time_ago_days = time_diff.days
        # Use the singular form for exactly one day to avoid "1 days ago".
        if time_ago_days <= 0:
            time_ago = "today"
        elif time_ago_days == 1:
            time_ago = "1 day ago"
        else:
            time_ago = f"{time_ago_days} days ago"
        models = paper.get('models', 0)
        datasets = paper.get('datasets', 0)
        spaces = paper.get('spaces', 0)

        return f"""
Daily Papers |