cmcmaster committed · verified
Commit e4f5c0d · 1 Parent(s): e43dfcd

deploy at 2024-11-11 09:48:16.356051

Files changed (8):
  1. Dockerfile +10 -0
  2. README.md +6 -5
  3. generate_newsletter.py +382 -0
  4. hf_api.py +217 -0
  5. main.py +167 -0
  6. requirements.txt +12 -0
  7. search_terms.json +104 -0
  8. templates/newsletter_pdf.html +13 -0
Dockerfile ADDED
@@ -0,0 +1,10 @@
+ FROM python:3.10
+ WORKDIR /code
+ COPY --link --chown=1000 . .
+ RUN mkdir -p /tmp/cache/
+ RUN chmod a+rwx -R /tmp/cache/
+ ENV HF_HOME=/tmp/cache/
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ ENV PYTHONUNBUFFERED=1 PORT=7860
+ CMD ["python", "main.py"]
README.md CHANGED
@@ -1,10 +1,11 @@
+
  ---
- title: This Week In Rheumatology
- emoji: 🦀
- colorFrom: pink
+ title: cmcmaster/this_week_in_rheumatology
+ emoji: 🚀
+ colorFrom: purple
  colorTo: red
  sdk: docker
+ app_file: app.py
  pinned: false
+ termination_grace_period: 2m
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
generate_newsletter.py ADDED
@@ -0,0 +1,382 @@
+ import pandas as pd
+ import os
+ from datetime import datetime, timedelta, timezone
+ import json
+ from Bio import Entrez, Medline
+ from huggingface_hub import HfApi, hf_hub_download, DatasetCard, DatasetCardData
+ from datasets import Dataset, load_dataset
+ from hf_api import (
+     evaluate_relevance,
+     summarize_abstract,
+     compose_newsletter
+ )
+ import logging
+ import argparse
+ from huggingface_hub import HfFileSystem
+ import pdfkit
+ from jinja2 import Environment, FileSystemLoader
+ import markdown2
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     handlers=[
+         logging.FileHandler("app.log"),
+         logging.StreamHandler()
+     ]
+ )
+
+ # Retrieve environment variables
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ DATASET_NAME = os.environ.get("DATASET_NAME", "cmcmaster/this_week_in_rheumatology")
+
+ if not HF_TOKEN:
+     logging.error("Hugging Face token not found. Set the HF_TOKEN environment variable.")
+     exit(1)
+
+ # Initialize Hugging Face Hub API
+ api = HfApi(token=HF_TOKEN)
+
+ def ensure_repo_exists(api, repo_id, repo_type, token):
+     try:
+         api.repo_info(repo_id=repo_id, repo_type=repo_type)
+         logging.info(f"Repository {repo_id} already exists.")
+     except Exception as e:
+         logging.info(f"Repository {repo_id} not found. Creating a new one.")
+         try:
+             api.create_repo(
+                 repo_id=repo_id,
+                 repo_type=repo_type,
+                 token=token,
+                 private=False,
+                 exist_ok=True
+             )
+             # Create a dataset card
+             card_data = DatasetCardData(
+                 language="en",
+                 license="cc-by-sa-4.0",
+                 task_categories=["text-classification"],
+                 tags=["rheumatology", "medical-research"]
+             )
+             card = DatasetCard("---\n" + card_data.to_yaml() + "\n---\n# This Week in Rheumatology\n\nA weekly collection of relevant rheumatology papers.")
+             api.upload_file(
+                 path_or_fileobj=str(card).encode(),
+                 path_in_repo="README.md",
+                 repo_id=repo_id,
+                 repo_type=repo_type,
+                 commit_message="Add dataset card",
+                 token=token
+             )
+             logging.info(f"Repository {repo_id} created successfully with a dataset card.")
+         except Exception as create_error:
+             logging.error(f"Failed to create repository {repo_id}: {create_error}")
+             exit(1)
+
+ # Ensure the repository exists before proceeding
+ ensure_repo_exists(api, DATASET_NAME, repo_type="dataset", token=HF_TOKEN)
+
+ # Load search terms from JSON
+ with open('search_terms.json', 'r') as f:
+     search_terms = json.load(f)
+
+ def build_query():
+     # Construct MeSH terms
+     mesh_terms = ' OR '.join(f'"{term}"[MeSH Terms]' for term in search_terms['search_strategy']['mesh_terms'])
+
+     # Construct keywords
+     keywords = ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['keywords'])
+
+     # Construct specific conditions
+     specific_conditions = ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['specific_conditions'])
+
+     # Construct research-related terms
+     research_terms = ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['research_related_terms'])
+
+     # Construct journal names
+     journals = ' OR '.join(f'"{journal}"[Journal]' for journal in search_terms['journals'])
+
+     # Group exclusion terms with parentheses, combined with OR
+     exclusion_terms = 'NOT (' + ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['exclusion_terms']) + ')'
+
+     # Group all inclusion terms within parentheses, combined with OR
+     inclusion_terms = f"({mesh_terms} OR {keywords} OR {specific_conditions} OR {journals})"
+
+     # Enclose research terms within parentheses
+     research_terms_grouped = f"({research_terms})"
+
+     # Construct the final query with proper grouping and operator precedence
+     query = f"{inclusion_terms} AND {research_terms_grouped} {exclusion_terms}"
+
+     # Add filters for human studies, English language, and publication types
+     human_filter = 'AND "humans"[MeSH Terms]'
+     language_filter = 'AND "english"[Language]'
+     pub_types = ' OR '.join(f'"{pt}"[Publication Type]' for pt in search_terms['publication_types'])
+     pub_type_filter = f'AND ({pub_types})'
+
+     # Exclude case reports
+     exclude_case_reports = 'NOT "Case Reports"[Publication Type]'
+
+     query = f"{query} {human_filter} {language_filter} {pub_type_filter} {exclude_case_reports}"
+
+     logging.info(f"Built PubMed query: {query}")
+     return query
+
+ def search_pubmed(query, start_date: datetime, end_date: datetime):
+     Entrez.email = "mcmastc1@gmail.com"  # Replace with your actual email
+     try:
+         handle = Entrez.esearch(
+             db="pubmed",
+             term=query,
+             mindate=start_date.strftime('%Y/%m/%d'),
+             maxdate=end_date.strftime('%Y/%m/%d'),
+             usehistory="y",
+             retmax=1000
+         )
+         results = Entrez.read(handle)
+         logging.info(f"PubMed search completed. Found {results['Count']} papers.")
+         return results
+     except Exception as e:
+         logging.error(f"Error searching PubMed: {e}")
+         logging.error(f"Query: {query}")
+         logging.error(f"Date range: {start_date.strftime('%Y/%m/%d')} to {end_date.strftime('%Y/%m/%d')}")
+         raise
+
+ def fetch_details(id_list):
+     ids = ",".join(id_list)
+     handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="text")
+     records = list(Medline.parse(handle))
+     logging.info(f"Fetched details for {len(records)} papers.")
+     return records
+
+ def process_papers(records):
+     data = []
+     relevant_count = 0
+     for record in records:
+         article = {
+             "PMID": record.get("PMID", ""),
+             "Title": record.get("TI", ""),
+             "Authors": ", ".join(record.get("AU", [])),
+             "Journal": record.get("JT", ""),
+             "Abstract": record.get("AB", ""),
+             "Publication Type": ", ".join(record.get("PT", [])),
+         }
+         try:
+             relevance = evaluate_relevance(article["Title"], article["Abstract"])
+             # Keep only papers with a relevance score above 8
+             if relevance.get("relevance_score", 0) > 8:
+                 summary = summarize_abstract(article["Abstract"])
+                 article["Summary"] = summary.get("summary", "")
+                 article["Topic"] = summary.get("topic", "")
+                 # Drop Abstract and Publication Type from the article
+                 article.pop("Abstract", None)
+                 article.pop("Publication Type", None)
+                 data.append(article)
+                 relevant_count += 1
+                 logging.info(f"Paper PMID {article['PMID']} processed successfully. Relevance Score: {relevance.get('relevance_score', 0)}")
+         except json.JSONDecodeError as json_err:
+             logging.error(f"JSON decode error for paper PMID {article['PMID']}: {json_err}")
+         except Exception as e:
+             logging.error(f"Error processing paper PMID {article['PMID']}: {e}")
+
+     logging.info(f"Processed {len(records)} papers. {relevant_count} were deemed relevant.")
+     return pd.DataFrame(data)
+
+ def get_rheumatology_papers(start_date: datetime, end_date: datetime, test: bool = False):
+     query = build_query()
+     logging.info(f"Searching PubMed for papers between {start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}")
+     logging.debug(f"PubMed query: {query}")
+     search_results = search_pubmed(query, start_date, end_date)
+     id_list = search_results.get("IdList", [])
+     if not id_list:
+         logging.info("No new papers found.")
+         return pd.DataFrame()
+
+     logging.info(f"Fetching details for {len(id_list)} papers.")
+     records = fetch_details(id_list)
+     if test:
+         logging.info("Running in test mode. Processing only 50 papers.")
+         return process_papers(records[:50])
+     else:
+         return process_papers(records)
+
+ def cache_dataset(papers_df: pd.DataFrame, start_date: datetime, end_date: datetime):
+     try:
+         # Convert the DataFrame to records so it can be uploaded to the Hub
+         papers_dict = papers_df.to_dict(orient="records")
+         repo_path = f"{end_date.strftime('%Y%m%d')}/papers.jsonl"
+         # Upload to the Hub
+         api.upload_file(
+             path_or_fileobj=json.dumps(papers_dict).encode('utf-8'),
+             path_in_repo=repo_path,
+             repo_id=DATASET_NAME,
+             repo_type="dataset",
+             commit_message=f"Add papers from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}",
+             token=HF_TOKEN
+         )
+         logging.info(f"Papers cached successfully to repository {DATASET_NAME}.")
+     except Exception as e:
+         logging.error(f"Failed to cache papers: {e}")
+
+ def load_cached_papers(start_date: datetime, end_date: datetime, test: bool = False) -> pd.DataFrame:
+     try:
+         fs = HfFileSystem()
+         # Path to the cached papers file within the dated subdirectory
+         dataset_path = f"datasets/cmcmaster/this_week_in_rheumatology/{end_date.strftime('%Y%m%d')}/papers.jsonl"
+         if fs.exists(dataset_path):
+             dataset = load_dataset("json", data_files={"train": f"hf://{dataset_path}"}, split="train")
+             papers_df = dataset.to_pandas()
+             return papers_df
+         else:
+             logging.info(f"No cache found for {end_date.strftime('%Y-%m-%d')}. Processing new papers.")
+             return get_rheumatology_papers(start_date, end_date, test)
+     except Exception as e:
+         logging.info(f"Error loading cache: {e}. Processing new papers.")
+         return get_rheumatology_papers(start_date, end_date, test)
+
+ def generate_pdf_newsletter(content: dict, end_date: datetime):
+     """Generate a PDF version of the newsletter using pdfkit"""
+     try:
+         # Convert markdown to HTML
+         html_content = markdown2.markdown(content['content'])
+
+         # Set up the Jinja2 template environment
+         env = Environment(loader=FileSystemLoader('templates'))
+         template = env.get_template('newsletter_pdf.html')
+
+         # Render the template
+         html = template.render(
+             title=f"This Week in Rheumatology - {content['date']}",
+             content=html_content
+         )
+
+         # Configure PDF options
+         options = {
+             'page-size': 'A4',
+             'margin-top': '2cm',
+             'margin-right': '2cm',
+             'margin-bottom': '2cm',
+             'margin-left': '2cm',
+             'encoding': 'UTF-8',
+             'enable-local-file-access': None,
+             'quiet': ''
+         }
+
+         # Generate the PDF
+         pdf_path = f"{end_date.strftime('%Y%m%d')}/newsletter.pdf"
+         os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
+
+         # Add CSS to the HTML string
+         html_with_style = f"""
+         <html>
+         <head>
+             <style>
+                 body {{
+                     font-family: Arial, sans-serif;
+                     line-height: 1.6;
+                     margin: 0 auto;
+                     max-width: 21cm; /* A4 width */
+                     color: #333;
+                 }}
+                 h1, h2 {{ color: #2c3e50; }}
+                 h1 {{ font-size: 24px; margin-top: 2em; }}
+                 h2 {{ font-size: 20px; margin-top: 1.5em; }}
+                 a {{ color: #3498db; text-decoration: none; }}
+                 p {{ margin-bottom: 1em; }}
+             </style>
+         </head>
+         <body>
+             {html}
+         </body>
+         </html>
+         """
+
+         pdfkit.from_string(html_with_style, pdf_path, options=options)
+
+         # Upload the PDF to the Hub
+         with open(pdf_path, 'rb') as f:
+             api.upload_file(
+                 path_or_fileobj=f,
+                 path_in_repo=pdf_path,
+                 repo_id=DATASET_NAME,
+                 repo_type="dataset",
+                 commit_message=f"Add PDF newsletter for {end_date.strftime('%Y-%m-%d')}",
+                 token=HF_TOKEN
+             )
+         logging.info("PDF newsletter generated and uploaded successfully")
+
+     except Exception as e:
+         logging.error(f"Failed to generate PDF newsletter: {e}")
+
+ def generate_and_store_newsletter(papers_df: pd.DataFrame, end_date: datetime):
+     if papers_df.empty:
+         logging.info("No papers to include in the newsletter.")
+         return
+
+     try:
+         logging.info(f"Generating newsletter with {len(papers_df)} papers.")
+         newsletter_content = compose_newsletter(papers_df)
+         newsletter_data = {
+             "date": end_date.strftime('%Y-%m-%d'),
+             "content": newsletter_content
+         }
+
+         # Store the JSON version
+         newsletter_json = json.dumps(newsletter_data, indent=4)
+         repo_path = f'{end_date.strftime("%Y%m%d")}/newsletter.json'
+         api.upload_file(
+             path_or_fileobj=newsletter_json.encode('utf-8'),
+             path_in_repo=repo_path,
+             repo_id=DATASET_NAME,
+             repo_type="dataset",
+             commit_message=f"Add newsletter for {end_date.strftime('%Y-%m-%d')}",
+             token=HF_TOKEN
+         )
+
+         # Generate and store the PDF version
+         generate_pdf_newsletter(newsletter_data, end_date)
+
+         logging.info(f"Newsletter (JSON and PDF) successfully pushed to repository {DATASET_NAME}.")
+     except Exception as e:
+         logging.error(f"Failed to generate or store newsletter: {e}")
+
+ def process_new_papers(end_date: datetime = None, test: bool = False):
+     end_date = end_date or datetime.now(timezone.utc)
+     start_date = end_date - timedelta(days=7)
+
+     # Widen the search window to papers published in the last 30 days
+     search_start_date = end_date - timedelta(days=30)
+
+     logging.info(f"Processing papers for the week: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+     logging.info(f"Searching for papers published between: {search_start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}")
+
+     papers_df = load_cached_papers(search_start_date, end_date, test)
+
+     if papers_df.empty and not test:
+         logging.info("No relevant papers found in cache or recent search.")
+         return
+
+     logging.info(f"Found {len(papers_df)} relevant papers for the newsletter.")
+
+     # Cache the papers_df as a Hugging Face dataset
+     cache_dataset(papers_df, start_date, end_date)
+
+     # Generate and store the newsletter
+     generate_and_store_newsletter(papers_df, end_date)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Generate a weekly Rheumatology newsletter.")
+     parser.add_argument('--end_date', type=str, help='End date for the newsletter in YYYY-MM-DD format. Defaults to today.')
+     parser.add_argument('--test', action='store_true', help='Run the script in test mode.')
+     args = parser.parse_args()
+
+     end_date = None
+     if args.end_date:
+         try:
+             end_date = datetime.strptime(args.end_date, '%Y-%m-%d').replace(tzinfo=timezone.utc)
+         except ValueError:
+             logging.error("Invalid date format for --end_date. Use YYYY-MM-DD.")
+             exit(1)
+
+     process_new_papers(end_date, args.test)
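
Because `process_new_papers` keys every artifact (papers cache, JSON, PDF) to the same `YYYYMMDD` folder derived from `end_date`, past issues can be regenerated deterministically. A usage sketch, assuming `HF_TOKEN` is exported and using an illustrative date:

```python
# Usage sketch: rebuild the issue for an illustrative past date.
# Equivalent CLI: python generate_newsletter.py --end_date 2024-11-04 --test
from datetime import datetime, timezone

from generate_newsletter import process_new_papers

end = datetime(2024, 11, 4, tzinfo=timezone.utc)
process_new_papers(end_date=end, test=True)  # test mode caps the run at 50 papers
```
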
hf_api.py ADDED
@@ -0,0 +1,217 @@
+ import os
+ import json
+ import logging
+ from enum import Enum
+ from pydantic import BaseModel, Field
+ import pandas as pd
+ from huggingface_hub import InferenceClient
+ from tenacity import retry, stop_after_attempt, wait_exponential
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ # Create handlers
+ console_handler = logging.StreamHandler()
+ console_handler.setLevel(logging.INFO)
+
+ file_handler = logging.FileHandler("hf_api.log")
+ file_handler.setLevel(logging.INFO)
+
+ # Create formatters and add them to the handlers
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ console_handler.setFormatter(formatter)
+ file_handler.setFormatter(formatter)
+
+ # Add handlers to the logger
+ if not logger.handlers:
+     logger.addHandler(console_handler)
+     logger.addHandler(file_handler)
+
+ # Validate and retrieve the Hugging Face API token
+ HF_TOKEN = os.environ.get('HF_TOKEN')
+ if not HF_TOKEN:
+     logger.error("Hugging Face API token not found. Set the HF_TOKEN environment variable.")
+     raise EnvironmentError("HF_TOKEN environment variable is not set.")
+
+ # Initialize the InferenceClients
+ MODEL_NAME1 = "meta-llama/Llama-3.1-8B-Instruct"
+ MODEL_NAME2 = "Qwen/Qwen2.5-72B-Instruct"
+ try:
+     client1 = InferenceClient(model=MODEL_NAME1, token=HF_TOKEN)
+     logger.info(f"InferenceClient for model '{MODEL_NAME1}' instantiated successfully.")
+ except Exception as e:
+     logger.error(f"Failed to instantiate InferenceClient for model '{MODEL_NAME1}': {e}")
+     raise
+
+ try:
+     client2 = InferenceClient(model=MODEL_NAME2, token=HF_TOKEN)
+     logger.info(f"InferenceClient for model '{MODEL_NAME2}' instantiated successfully.")
+ except Exception as e:
+     logger.error(f"Failed to instantiate InferenceClient for model '{MODEL_NAME2}': {e}")
+     raise
+
+ # Define Pydantic schemas
+ class EvaluationSchema(BaseModel):
+     reasoning: str
+     relevance_score: int = Field(ge=0, le=10)
+
+ class TopicEnum(Enum):
+     Rheumatoid_Arthritis = "Rheumatoid Arthritis"
+     Systemic_Lupus_Erythematosus = "Systemic Lupus Erythematosus"
+     Scleroderma = "Scleroderma"
+     Sjogren_s_Disease = "Sjogren's Disease"
+     Ankylosing_Spondylitis = "Ankylosing Spondylitis"
+     Psoriatic_Arthritis = "Psoriatic Arthritis"
+     Gout = "Gout"
+     Vasculitis = "Vasculitis"
+     Osteoarthritis = "Osteoarthritis"
+     Infectious_Diseases = "Infectious Diseases"
+     Immunology = "Immunology"
+     Genetics = "Genetics"
+     Biologics = "Biologics"
+     Biosimilars = "Biosimilars"
+     Small_Molecules = "Small Molecules"
+     Clinical_Trials = "Clinical Trials"
+     Health_Policy = "Health Policy"
+     Patient_Education = "Patient Education"
+     Other_Rheumatic_Diseases = "Other Rheumatic Diseases"
+
+ class SummarySchema(BaseModel):
+     summary: str
+     # Topic is constrained to the enum above
+     topic: TopicEnum = TopicEnum.Other_Rheumatic_Diseases
+
+ class PaperSchema(BaseModel):
+     title: str
+     authors: str
+     journal: str
+     pmid: str
+
+ class TopicSummarySchema(BaseModel):
+     planning: str
+     summary: str
+
+ def evaluate_relevance(title: str, abstract: str) -> dict:
+     prompt = f"""
+     Title: {title}
+     Abstract: {abstract}
+     Instructions: Evaluate the relevance of this medical abstract for an audience of rheumatologists on a scale of 0 to 10, with 10 reserved only for large clinical trials in rheumatology.
+     Be very discerning and only give a score above 8 for papers that are highly clinically relevant to rheumatologists.
+     Respond in JSON format using the following schema:
+     {json.dumps(EvaluationSchema.model_json_schema())}
+     """
+
+     try:
+         response = client1.text_generation(
+             prompt,
+             max_new_tokens=512,
+             temperature=0.2,
+             grammar={"type": "json", "value": EvaluationSchema.model_json_schema()}
+         )
+         result = json.loads(response)
+         return result
+     except Exception as e:
+         logger.error(f"Error in evaluate_relevance: {e}")
+         raise
+
+ def summarize_abstract(abstract: str) -> dict:
+     prompt = f"""
+     Abstract: {abstract}
+     Instructions: Summarize this medical abstract in 1 sentence and select the most relevant topic from the following list:
+     {json.dumps([topic.value for topic in TopicEnum])}
+     Respond in JSON format using the following schema:
+     {json.dumps(SummarySchema.model_json_schema())}
+     """
+
+     try:
+         response = client1.text_generation(
+             prompt,
+             max_new_tokens=512,
+             temperature=0.2,
+             grammar={"type": "json", "value": SummarySchema.model_json_schema()}
+         )
+         result = json.loads(response)
+         return result
+     except Exception as e:
+         logger.error(f"Error in summarize_abstract: {e}")
+         raise
+
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+ def _make_api_call(client, prompt, max_tokens=4096, temp=0.2, schema=None):
+     try:
+         response = client.text_generation(
+             prompt,
+             max_new_tokens=max_tokens,
+             temperature=temp,
+             grammar={"type": "json", "value": schema} if schema else None
+         )
+         return json.loads(response)
+     except Exception as e:
+         logger.error(f"API call failed: {e}")
+         raise
+
+ def compose_newsletter(papers: pd.DataFrame) -> str:
+     if papers.empty:
+         logger.info("No papers provided to compose the newsletter.")
+         return ""
+
+     content = ["# This Week in Rheumatology\n"]
+     topics = papers['Topic'].unique()
+
+     for topic in topics:
+         try:
+             relevant_papers = papers[papers['Topic'] == topic]
+             # Convert to dicts with lowercase keys to match the expected schema
+             papers_dict = relevant_papers.rename(columns={
+                 'Title': 'title',
+                 'Authors': 'authors',
+                 'Journal': 'journal',
+                 'PMID': 'pmid',
+                 'Summary': 'summary'
+             }).to_dict('records')
+             result = None  # defined up front so the except block can log it safely
+             prompt = f"""
+             Instructions: Generate a brief summary of the latest research on {topic} using the following papers.
+             Papers: {json.dumps(papers_dict)}
+             Respond in JSON format using the following schema:
+             {json.dumps(TopicSummarySchema.model_json_schema())}
+             You have the option of using the planning field first to organize your thoughts before writing the summary.
+             The summary should be concise, but because you are summarizing several papers, it should be detailed enough to give the reader a good idea of the latest research in the field.
+             The papers may be somewhat disjointed, so you will need to think carefully about how you can transition between them with clever wording.
+             You can use anywhere from 1 to 3 paragraphs for the summary.
+             """
+
+             result = _make_api_call(
+                 client2,
+                 prompt,
+                 max_tokens=4096,
+                 temp=0.2,
+                 schema=TopicSummarySchema.model_json_schema()
+             )
+
+             # Log the raw response for debugging
+             logger.debug(f"Raw response from Hugging Face: {result}")
+
+             # Parse the JSON response
+             summary = TopicSummarySchema(**result)
+
+             # Convert the structured summary to Markdown
+             topic_content = f"## {topic}\n\n"
+             topic_content += f"{summary.summary}\n\n"
+
+             # Add a references section
+             topic_content += "### References\n\n"
+             relevant_papers = papers[papers['Topic'] == topic]
+             for _, paper in relevant_papers.iterrows():
+                 topic_content += (f"- {paper['Title']} by {paper['Authors']}. {paper['Journal']}. "
+                                   f"[PMID: {paper['PMID']}](https://pubmed.ncbi.nlm.nih.gov/{paper['PMID']}/)\n")
+
+             content.append(topic_content)
+
+         except Exception as e:
+             logger.error(f"Error processing topic {topic}: {e}")
+             logger.error(f"Raw response: {result}")
+             continue
+
+     return "\n".join(content)
main.py ADDED
@@ -0,0 +1,167 @@
+ import json
+ import os
+ from datetime import datetime, timezone
+
+ from fasthtml.common import *
+ from huggingface_hub import HfApi, hf_hub_download
+ from starlette.responses import FileResponse
+ from generate_newsletter import process_new_papers
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from apscheduler.triggers.cron import CronTrigger
+
+ from fasthtml_hf import setup_hf_backup
+
+ # Initialize Hugging Face API
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ DATASET_NAME = "cmcmaster/this_week_in_rheumatology"
+ api = HfApi(token=HF_TOKEN)
+
+ # Initialize scheduler
+ scheduler = BackgroundScheduler()
+
+ # Schedule newsletter generation to run every Monday at 1 AM UTC
+ scheduler.add_job(process_new_papers,
+                   CronTrigger(day_of_week='mon', hour=1),
+                   kwargs={
+                       'end_date': None,
+                       'test': False
+                   },
+                   id='generate_newsletter',
+                   name='Weekly newsletter generation',
+                   replace_existing=True)
+
+ css = Style("""
+     body {
+         font-family: Georgia, Times, serif;
+         line-height: 1.6;
+         color: #333;
+         max-width: 800px;
+         margin: 0 auto;
+         padding: 20px;
+         background: #fff;
+     }
+
+     h1, h2 {
+         color: #2c3e50;
+         font-family: Georgia, Times, serif;
+     }
+
+     a {
+         color: #2c3e50;
+         text-decoration: none;
+     }
+
+     a:hover {
+         text-decoration: underline;
+     }
+
+     ul {
+         list-style-type: none;
+         padding: 0;
+     }
+
+     li {
+         margin-bottom: 10px;
+     }
+
+     .newsletter-content {
+         margin-top: 20px;
+     }
+
+     .download-link {
+         display: inline-block;
+         padding: 10px 20px;
+         background-color: #2c3e50;
+         color: white;
+         border-radius: 3px;
+         margin: 10px 0;
+         font-family: Georgia, Times, serif;
+     }
+
+     .download-link:hover {
+         background-color: #34495e;
+         text-decoration: none;
+     }
+ """)
+
+ app = FastHTML(hdrs=(css, MarkdownJS(),
+                      HighlightJS(
+                          langs=['python', 'javascript', 'html', 'css'])))
+
+
+ # Start the scheduler when the app starts
+ @app.on_event("startup")
+ async def start_scheduler():
+     scheduler.start()
+
+
+ # Shut down the scheduler when the app stops
+ @app.on_event("shutdown")
+ async def shutdown_scheduler():
+     scheduler.shutdown()
+
+
+ def get_newsletter_list():
+     # Fetch the list of newsletters from the Hugging Face repository
+     files = api.list_repo_files(repo_id=DATASET_NAME, repo_type="dataset")
+     newsletters = [f for f in files if f.endswith('newsletter.json')]
+     return sorted(newsletters, reverse=True)
+
+
+ def get_newsletter_content(path):
+     # Download and parse the newsletter content
+     content = api.hf_hub_download(repo_id=DATASET_NAME,
+                                   filename=path,
+                                   repo_type="dataset")
+     with open(content, 'r') as f:
+         return json.load(f)
+
+
+ @app.get("/")
+ def index():
+     newsletters = get_newsletter_list()
+     links = [
+         Li(
+             A(datetime.strptime(n.split('/')[0], '%Y%m%d').strftime('%B %d, %Y'),
+               href=f"/newsletter/{n.split('/')[0]}")) for n in newsletters
+     ]
+     return Titled("This Week in Rheumatology", H2("Available Newsletters"),
+                   Ul(*links))
+
+
+ @app.get("/newsletter/{date}")
+ def newsletter(date: str):
+     path = f"{date}/newsletter.json"
+     pdf_path = f"{date}/newsletter.pdf"
+     try:
+         content = get_newsletter_content(path)
+         return Titled(
+             f"This Week in Rheumatology - {content['date']}",
+             A("Back to Index", href="/"),
+             Div(
+                 A("Download PDF", href=f"/download/{date}", cls="download-link")
+             ),
+             Div(content['content'], cls="marked"))
+     except Exception as e:
+         return Titled("Error", H2("Newsletter not found"),
+                       P(f"Unable to load newsletter for date: {date}"),
+                       A("Back to Index", href="/"))
+
+
+ @app.get("/download/{date}")
+ def download_pdf(date: str):
+     try:
+         pdf_path = f"{date}/newsletter.pdf"
+         content = api.hf_hub_download(repo_id=DATASET_NAME,
+                                       filename=pdf_path,
+                                       repo_type="dataset")
+         return FileResponse(content,
+                             media_type="application/pdf",
+                             filename=f"newsletter_{date}.pdf")
+     except Exception as e:
+         return Titled("Error", H2("PDF not found"),
+                       P(f"Unable to load PDF for date: {date}"),
+                       A("Back to Index", href="/"))
+
+ setup_hf_backup(app)
+ serve()
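
The publication cadence lives entirely in the `CronTrigger` above. One way to sanity-check a cron expression before deploying is to ask the trigger for its next fire time; a small sketch (pinning the timezone to UTC, which is an assumption about the scheduler's default):

```python
# Sketch: preview when the Monday 01:00 job would next fire.
from datetime import datetime, timezone

from apscheduler.triggers.cron import CronTrigger

trigger = CronTrigger(day_of_week='mon', hour=1, timezone=timezone.utc)
print(trigger.get_next_fire_time(None, datetime.now(timezone.utc)))
```
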
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fasthtml-hf
+ huggingface-hub
+ starlette
+ apscheduler
+ bio
+ datasets
+ pdfkit
+ jinja2
+ markdown2
+ pandas
+ pydantic
+ tenacity
search_terms.json ADDED
@@ -0,0 +1,104 @@
+ {
+     "search_strategy": {
+         "mesh_terms": [
+             "Rheumatic Diseases",
+             "Rheumatology",
+             "Arthritis, Rheumatoid",
+             "Lupus Erythematosus, Systemic",
+             "Osteoarthritis",
+             "Fibromyalgia",
+             "Sjogren's Syndrome",
+             "Scleroderma, Systemic",
+             "Polymyositis",
+             "Dermatomyositis",
+             "Vasculitis",
+             "Gout",
+             "Spondylarthropathies",
+             "Polymyalgia Rheumatica",
+             "Arthritis, Psoriatic",
+             "Arthritis, Juvenile"
+         ],
+         "keywords": [
+             "rheumat*",
+             "autoimmune",
+             "connective tissue disease",
+             "inflammatory arthritis",
+             "systemic inflammatory disease",
+             "musculoskeletal disorder",
+             "autoinflammatory syndrome",
+             "immunologic disease",
+             "crystal arthropathy"
+         ],
+         "specific_conditions": [
+             "ankylosing spondylitis",
+             "reactive arthritis",
+             "enteropathic arthritis",
+             "systemic sclerosis",
+             "mixed connective tissue disease",
+             "antiphospholipid syndrome",
+             "Behcet's disease",
+             "giant cell arteritis",
+             "Takayasu arteritis",
+             "ANCA-associated vasculitis",
+             "polymyositis",
+             "dermatomyositis",
+             "inclusion body myositis"
+         ],
+         "research_related_terms": [
+             "epidemiology",
+             "etiology",
+             "pathogenesis",
+             "diagnosis",
+             "treatment",
+             "therapy",
+             "prognosis",
+             "outcome",
+             "clinical trial",
+             "cohort study",
+             "case-control study",
+             "systematic review",
+             "meta-analysis",
+             "biomarker",
+             "genetic",
+             "immunology",
+             "imaging"
+         ],
+         "exclusion_terms": [
+             "veterinary",
+             "animal model"
+         ]
+     },
+     "search_fields": [
+         "Title/Abstract",
+         "MeSH Terms",
+         "Publication Type",
+         "Journal"
+     ],
+     "publication_types": [
+         "Journal Article",
+         "Review",
+         "Clinical Trial",
+         "Meta-Analysis",
+         "Randomized Controlled Trial",
+         "Practice Guideline"
+     ],
+     "languages": [
+         "English"
+     ],
+     "species": [
+         "Humans"
+     ],
+     "journals": [
+         "Annals of the Rheumatic Diseases",
+         "Arthritis & Rheumatology",
+         "Rheumatology",
+         "Journal of Rheumatology",
+         "Arthritis Research & Therapy",
+         "Seminars in Arthritis and Rheumatism",
+         "RMD Open",
+         "Clinical Rheumatology",
+         "Arthritis Care & Research",
+         "International Journal of Rheumatic Diseases"
+     ]
+ }
+
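
Each list here maps onto one fielded OR-group in the query assembled by `build_query` in generate_newsletter.py; for example, the `journals` array becomes a `[Journal]` clause. A reduced illustration of that interpolation:

```python
# Illustration: how a list from search_terms.json becomes a PubMed clause.
journals = ["Annals of the Rheumatic Diseases", "RMD Open"]
clause = ' OR '.join(f'"{j}"[Journal]' for j in journals)
print(f"({clause})")
# -> ("Annals of the Rheumatic Diseases"[Journal] OR "RMD Open"[Journal])
```
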
templates/newsletter_pdf.html ADDED
@@ -0,0 +1,13 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset="UTF-8">
+     <title>{{ title }}</title>
+ </head>
+ <body>
+     <h1>{{ title }}</h1>
+     <div class="content">
+         {{ content|safe }}
+     </div>
+ </body>
+ </html>