cmcmaster committed · verified
Commit e4f5c0d · 1 Parent(s): e43dfcd

deploy at 2024-11-11 09:48:16.356051

Files changed (8):
  1. Dockerfile +10 -0
  2. README.md +6 -5
  3. generate_newsletter.py +382 -0
  4. hf_api.py +217 -0
  5. main.py +167 -0
  6. requirements.txt +12 -0
  7. search_terms.json +104 -0
  8. templates/newsletter_pdf.html +13 -0
Dockerfile ADDED
@@ -0,0 +1,10 @@
+ FROM python:3.10
+ WORKDIR /code
+ COPY --link --chown=1000 . .
+ RUN mkdir -p /tmp/cache/
+ RUN chmod a+rwx -R /tmp/cache/
+ ENV HF_HOME=/tmp/cache/
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ ENV PYTHONUNBUFFERED=1 PORT=7860
+ CMD ["python", "main.py"]
README.md CHANGED
@@ -1,10 +1,11 @@
+
  ---
- title: This Week In Rheumatology
- emoji: 🦀
- colorFrom: pink
+ title: cmcmaster/this_week_in_rheumatology
+ emoji: 🚀
+ colorFrom: purple
  colorTo: red
  sdk: docker
+ app_file: app.py
  pinned: false
+ termination_grace_period: 2m
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
generate_newsletter.py ADDED
@@ -0,0 +1,382 @@
+ import pandas as pd
+ import os
+ from datetime import datetime, timedelta, timezone
+ import json
+ from Bio import Entrez, Medline
+ from huggingface_hub import HfApi, hf_hub_download, DatasetCard, DatasetCardData
+ from datasets import Dataset, load_dataset
+ from hf_api import (
+     evaluate_relevance,
+     summarize_abstract,
+     compose_newsletter
+ )
+ import logging
+ import argparse
+ from huggingface_hub import HfFileSystem
+ import pdfkit
+ from jinja2 import Environment, FileSystemLoader
+ import markdown2
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     handlers=[
+         logging.FileHandler("app.log"),
+         logging.StreamHandler()
+     ]
+ )
+
+ # Retrieve environment variables
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ DATASET_NAME = os.environ.get("DATASET_NAME", "cmcmaster/this_week_in_rheumatology")
+
+ if not HF_TOKEN:
+     logging.error("Hugging Face token not found. Set the HF_TOKEN environment variable.")
+     exit(1)
+
+ # Initialize Hugging Face Hub API
+ api = HfApi(token=HF_TOKEN)
+
+ def ensure_repo_exists(api, repo_id, repo_type, token):
+     try:
+         api.repo_info(repo_id=repo_id, repo_type=repo_type)
+         logging.info(f"Repository {repo_id} already exists.")
+     except Exception as e:
+         logging.info(f"Repository {repo_id} not found. Creating a new one.")
+         try:
+             api.create_repo(
+                 repo_id=repo_id,
+                 repo_type=repo_type,
+                 token=token,
+                 private=False,
+                 exist_ok=True
+             )
+             # Create a dataset card
+             card_data = DatasetCardData(
+                 language="en",
+                 license="cc-by-sa-4.0",
+                 task_categories=["text-classification"],
+                 tags=["rheumatology", "medical-research"]
+             )
+             card = DatasetCard("---\n" + card_data.to_yaml() + "\n---\n# This Week in Rheumatology\n\nA weekly collection of relevant rheumatology papers.")
+             api.upload_file(
+                 path_or_fileobj=str(card).encode(),
+                 path_in_repo="README.md",
+                 repo_id=repo_id,
+                 repo_type=repo_type,
+                 commit_message="Add dataset card",
+                 token=token
+             )
+             logging.info(f"Repository {repo_id} created successfully with a dataset card.")
+         except Exception as create_error:
+             logging.error(f"Failed to create repository {repo_id}: {create_error}")
+             exit(1)
+
+ # Ensure the repository exists before proceeding
+ ensure_repo_exists(api, DATASET_NAME, repo_type="dataset", token=HF_TOKEN)
+
+ # Load search terms from JSON
+ with open('search_terms.json', 'r') as f:
+     search_terms = json.load(f)
+
+ def build_query():
+     # Construct MeSH terms
+     mesh_terms = ' OR '.join(f'"{term}"[MeSH Terms]' for term in search_terms['search_strategy']['mesh_terms'])
+
+     # Construct keywords
+     keywords = ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['keywords'])
+
+     # Construct specific conditions
+     specific_conditions = ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['specific_conditions'])
+
+     # Construct research-related terms
+     research_terms = ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['research_related_terms'])
+
+     # Construct journal names
+     journals = ' OR '.join(f'"{journal}"[Journal]' for journal in search_terms['journals'])
+
+     # Group exclusion terms with parentheses, combined with OR
+     exclusion_terms = 'NOT (' + ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['exclusion_terms']) + ')'
+
+     # Group all inclusion terms within parentheses, combined with OR
+     inclusion_terms = f"({mesh_terms} OR {keywords} OR {specific_conditions} OR {journals})"
+
+     # Enclose research terms within parentheses
+     research_terms_grouped = f"({research_terms})"
+
+     # Construct the final query with proper grouping and operator precedence
+     query = f"{inclusion_terms} AND {research_terms_grouped} {exclusion_terms}"
+
+     # Add filters for human studies, English language, and publication types
+     human_filter = 'AND "humans"[MeSH Terms]'
+     language_filter = 'AND "english"[Language]'
+     pub_types = ' OR '.join(f'"{pt}"[Publication Type]' for pt in search_terms['publication_types'])
+     pub_type_filter = f'AND ({pub_types})'
+
+     # Exclude case reports
+     exclude_case_reports = 'NOT "Case Reports"[Publication Type]'
+
+     query = f"{query} {human_filter} {language_filter} {pub_type_filter} {exclude_case_reports}"
+
+     logging.info(f"Built PubMed query: {query}")
+     return query
+
+ def search_pubmed(query, start_date: datetime, end_date: datetime):
+     Entrez.email = "mcmastc1@gmail.com"  # Replace with your actual email
+     try:
+         handle = Entrez.esearch(
+             db="pubmed",
+             term=query,
+             mindate=start_date.strftime('%Y/%m/%d'),
+             maxdate=end_date.strftime('%Y/%m/%d'),
+             usehistory="y",
+             retmax=1000
+         )
+         results = Entrez.read(handle)
+         logging.info(f"PubMed search completed. Found {results['Count']} papers.")
+         return results
+     except Exception as e:
+         logging.error(f"Error searching PubMed: {e}")
+         logging.error(f"Query: {query}")
+         logging.error(f"Date range: {start_date.strftime('%Y/%m/%d')} to {end_date.strftime('%Y/%m/%d')}")
+         raise
+
+ def fetch_details(id_list):
+     ids = ",".join(id_list)
+     handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="text")
+     records = list(Medline.parse(handle))
+     logging.info(f"Fetched details for {len(records)} papers.")
+     return records
+
+ def process_papers(records):
+     data = []
+     relevant_count = 0
+     for record in records:
+         article = {
+             "PMID": record.get("PMID", ""),
+             "Title": record.get("TI", ""),
+             "Authors": ", ".join(record.get("AU", [])),
+             "Journal": record.get("JT", ""),
+             "Abstract": record.get("AB", ""),
+             "Publication Type": ", ".join(record.get("PT", [])),
+         }
+         try:
+             relevance = evaluate_relevance(article["Title"], article["Abstract"])
+             # Keep only papers with a relevance score above 8
+             if relevance.get("relevance_score", 0) > 8:
+                 summary = summarize_abstract(article["Abstract"])
+                 article["Summary"] = summary.get("summary", "")
+                 article["Topic"] = summary.get("topic", "")
+                 # Drop Abstract and Publication Type from the article
+                 article.pop("Abstract", None)
+                 article.pop("Publication Type", None)
+                 data.append(article)
+                 relevant_count += 1
+                 logging.info(f"Paper PMID {article['PMID']} processed successfully. Relevance Score: {relevance.get('relevance_score', 0)}")
+         except json.JSONDecodeError as json_err:
+             logging.error(f"JSON decode error for paper PMID {article['PMID']}: {json_err}")
+         except Exception as e:
+             logging.error(f"Error processing paper PMID {article['PMID']}: {e}")
+
+     logging.info(f"Processed {len(records)} papers. {relevant_count} were deemed relevant.")
+     return pd.DataFrame(data)
+
+ def get_rheumatology_papers(start_date: datetime, end_date: datetime, test: bool = False):
+     query = build_query()
+     logging.info(f"Searching PubMed for papers between {start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}")
+     logging.debug(f"PubMed query: {query}")
+     search_results = search_pubmed(query, start_date, end_date)
+     id_list = search_results.get("IdList", [])
+     if not id_list:
+         logging.info("No new papers found.")
+         return pd.DataFrame()
+
+     logging.info(f"Fetching details for {len(id_list)} papers.")
+     records = fetch_details(id_list)
+     if test:
+         logging.info("Running in test mode. Processing only 50 papers.")
+         return process_papers(records[:50])
+     else:
+         return process_papers(records)
+
+ def cache_dataset(papers_df: pd.DataFrame, start_date: datetime, end_date: datetime):
+     try:
+         # Convert the DataFrame to records so it can be uploaded to the Hub
+         papers_dict = papers_df.to_dict(orient="records")
+         repo_path = f"{end_date.strftime('%Y%m%d')}/papers.jsonl"
+         # Upload to the Hub
+         api.upload_file(
+             path_or_fileobj=json.dumps(papers_dict).encode('utf-8'),
+             path_in_repo=repo_path,
+             repo_id=DATASET_NAME,
+             repo_type="dataset",
+             commit_message=f"Add papers from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}",
+             token=HF_TOKEN
+         )
+         logging.info(f"Papers cached successfully to repository {DATASET_NAME}.")
+     except Exception as e:
+         logging.error(f"Failed to cache papers: {e}")
+
+ def load_cached_papers(start_date: datetime, end_date: datetime, test: bool = False) -> pd.DataFrame:
+     try:
+         fs = HfFileSystem()
+         # Path to the cached papers file within the dated subdirectory
+         dataset_path = f"datasets/cmcmaster/this_week_in_rheumatology/{end_date.strftime('%Y%m%d')}/papers.jsonl"
+         if fs.exists(dataset_path):
+             dataset = load_dataset("json", data_files={"train": f"hf://{dataset_path}"}, split="train")
+             papers_df = dataset.to_pandas()
+             return papers_df
+         else:
+             logging.info(f"No cache found for {end_date.strftime('%Y-%m-%d')}. Processing new papers.")
+             return get_rheumatology_papers(start_date, end_date, test)
+     except Exception as e:
+         logging.info(f"Error loading cache: {e}. Processing new papers.")
+         return get_rheumatology_papers(start_date, end_date, test)
+
+ def generate_pdf_newsletter(content: dict, end_date: datetime):
+     """Generate a PDF version of the newsletter using pdfkit"""
+     try:
+         # Convert markdown to HTML
+         html_content = markdown2.markdown(content['content'])
+
+         # Set up the Jinja2 template environment
+         env = Environment(loader=FileSystemLoader('templates'))
+         template = env.get_template('newsletter_pdf.html')
+
+         # Render the template
+         html = template.render(
+             title=f"This Week in Rheumatology - {content['date']}",
+             content=html_content
+         )
+
+         # Configure PDF options
+         options = {
+             'page-size': 'A4',
+             'margin-top': '2cm',
+             'margin-right': '2cm',
+             'margin-bottom': '2cm',
+             'margin-left': '2cm',
+             'encoding': 'UTF-8',
+             'enable-local-file-access': None,
+             'quiet': ''
+         }
+
+         # Generate the PDF
+         pdf_path = f"{end_date.strftime('%Y%m%d')}/newsletter.pdf"
+         os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
+
+         # Add CSS to the HTML string
+         html_with_style = f"""
+         <html>
+         <head>
+             <style>
+                 body {{
+                     font-family: Arial, sans-serif;
+                     line-height: 1.6;
+                     margin: 0 auto;
+                     max-width: 21cm; /* A4 width */
+                     color: #333;
+                 }}
+                 h1, h2 {{ color: #2c3e50; }}
+                 h1 {{ font-size: 24px; margin-top: 2em; }}
+                 h2 {{ font-size: 20px; margin-top: 1.5em; }}
+                 a {{ color: #3498db; text-decoration: none; }}
+                 p {{ margin-bottom: 1em; }}
+             </style>
+         </head>
+         <body>
+             {html}
+         </body>
+         </html>
+         """
+
+         pdfkit.from_string(html_with_style, pdf_path, options=options)
+
+         # Upload the PDF to the Hub
+         with open(pdf_path, 'rb') as f:
+             api.upload_file(
+                 path_or_fileobj=f,
+                 path_in_repo=pdf_path,
+                 repo_id=DATASET_NAME,
+                 repo_type="dataset",
+                 commit_message=f"Add PDF newsletter for {end_date.strftime('%Y-%m-%d')}",
+                 token=HF_TOKEN
+             )
+         logging.info("PDF newsletter generated and uploaded successfully")
+
+     except Exception as e:
+         logging.error(f"Failed to generate PDF newsletter: {e}")
+
+ def generate_and_store_newsletter(papers_df: pd.DataFrame, end_date: datetime):
+     if papers_df.empty:
+         logging.info("No papers to include in the newsletter.")
+         return
+
+     try:
+         logging.info(f"Generating newsletter with {len(papers_df)} papers.")
+         newsletter_content = compose_newsletter(papers_df)
+         newsletter_data = {
+             "date": end_date.strftime('%Y-%m-%d'),
+             "content": newsletter_content
+         }
+
+         # Store the JSON version
+         newsletter_json = json.dumps(newsletter_data, indent=4)
+         repo_path = f'{end_date.strftime("%Y%m%d")}/newsletter.json'
+         api.upload_file(
+             path_or_fileobj=newsletter_json.encode('utf-8'),
+             path_in_repo=repo_path,
+             repo_id=DATASET_NAME,
+             repo_type="dataset",
+             commit_message=f"Add newsletter for {end_date.strftime('%Y-%m-%d')}",
+             token=HF_TOKEN
+         )
+
+         # Generate and store the PDF version
+         generate_pdf_newsletter(newsletter_data, end_date)
+
+         logging.info(f"Newsletter (JSON and PDF) successfully pushed to repository {DATASET_NAME}.")
+     except Exception as e:
+         logging.error(f"Failed to generate or store newsletter: {e}")
+
+ def process_new_papers(end_date: datetime = None, test: bool = False):
+     end_date = end_date or datetime.now(timezone.utc)
+     start_date = end_date - timedelta(days=7)
+
+     # Widen the search window to papers published in the last 30 days
+     search_start_date = end_date - timedelta(days=30)
+
+     logging.info(f"Processing papers for the week: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+     logging.info(f"Searching for papers published between: {search_start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}")
+
+     papers_df = load_cached_papers(search_start_date, end_date, test)
+
+     if papers_df.empty and not test:
+         logging.info("No relevant papers found in cache or recent search.")
+         return
+
+     logging.info(f"Found {len(papers_df)} relevant papers for the newsletter.")
+
+     # Cache the papers_df as a Hugging Face dataset
+     cache_dataset(papers_df, start_date, end_date)
+
+     # Generate and store the newsletter
+     generate_and_store_newsletter(papers_df, end_date)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Generate a weekly Rheumatology newsletter.")
+     parser.add_argument('--end_date', type=str, help='End date for the newsletter in YYYY-MM-DD format. Defaults to today.')
+     parser.add_argument('--test', action='store_true', help='Run the script in test mode.')
+     args = parser.parse_args()
+
+     end_date = None
+     if args.end_date:
+         try:
+             end_date = datetime.strptime(args.end_date, '%Y-%m-%d').replace(tzinfo=timezone.utc)
+         except ValueError:
+             logging.error("Invalid date format for --end_date. Use YYYY-MM-DD.")
+             exit(1)
+
+     process_new_papers(end_date, args.test)
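
Because `process_new_papers` keys every artifact (papers cache, JSON, PDF) to the same `YYYYMMDD` folder derived from `end_date`, past issues can be regenerated deterministically. A usage sketch, assuming `HF_TOKEN` is exported and using an illustrative date:

```python
# Usage sketch: rebuild the issue for an illustrative past date.
# Equivalent CLI: python generate_newsletter.py --end_date 2024-11-04 --test
from datetime import datetime, timezone

from generate_newsletter import process_new_papers

end = datetime(2024, 11, 4, tzinfo=timezone.utc)
process_new_papers(end_date=end, test=True)  # test mode caps the run at 50 papers
```
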
hf_api.py ADDED
@@ -0,0 +1,217 @@
+ import os
+ import json
+ import logging
+ from enum import Enum
+ from pydantic import BaseModel, Field
+ import pandas as pd
+ from huggingface_hub import InferenceClient
+ from tenacity import retry, stop_after_attempt, wait_exponential
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ # Create handlers
+ console_handler = logging.StreamHandler()
+ console_handler.setLevel(logging.INFO)
+
+ file_handler = logging.FileHandler("hf_api.log")
+ file_handler.setLevel(logging.INFO)
+
+ # Create formatters and add them to the handlers
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ console_handler.setFormatter(formatter)
+ file_handler.setFormatter(formatter)
+
+ # Add handlers to the logger
+ if not logger.handlers:
+     logger.addHandler(console_handler)
+     logger.addHandler(file_handler)
+
+ # Validate and retrieve the Hugging Face API token
+ HF_TOKEN = os.environ.get('HF_TOKEN')
+ if not HF_TOKEN:
+     logger.error("Hugging Face API token not found. Set the HF_TOKEN environment variable.")
+     raise EnvironmentError("HF_TOKEN environment variable is not set.")
+
+ # Initialize the InferenceClients
+ MODEL_NAME1 = "meta-llama/Llama-3.1-8B-Instruct"
+ MODEL_NAME2 = "Qwen/Qwen2.5-72B-Instruct"
+ try:
+     client1 = InferenceClient(model=MODEL_NAME1, token=HF_TOKEN)
+     logger.info(f"InferenceClient for model '{MODEL_NAME1}' instantiated successfully.")
+ except Exception as e:
+     logger.error(f"Failed to instantiate InferenceClient for model '{MODEL_NAME1}': {e}")
+     raise
+
+ try:
+     client2 = InferenceClient(model=MODEL_NAME2, token=HF_TOKEN)
+     logger.info(f"InferenceClient for model '{MODEL_NAME2}' instantiated successfully.")
+ except Exception as e:
+     logger.error(f"Failed to instantiate InferenceClient for model '{MODEL_NAME2}': {e}")
+     raise
+
+ # Define Pydantic schemas
+ class EvaluationSchema(BaseModel):
+     reasoning: str
+     relevance_score: int = Field(ge=0, le=10)
+
+ class TopicEnum(Enum):
+     Rheumatoid_Arthritis = "Rheumatoid Arthritis"
+     Systemic_Lupus_Erythematosus = "Systemic Lupus Erythematosus"
+     Scleroderma = "Scleroderma"
+     Sjogren_s_Disease = "Sjogren's Disease"
+     Ankylosing_Spondylitis = "Ankylosing Spondylitis"
+     Psoriatic_Arthritis = "Psoriatic Arthritis"
+     Gout = "Gout"
+     Vasculitis = "Vasculitis"
+     Osteoarthritis = "Osteoarthritis"
+     Infectious_Diseases = "Infectious Diseases"
+     Immunology = "Immunology"
+     Genetics = "Genetics"
+     Biologics = "Biologics"
+     Biosimilars = "Biosimilars"
+     Small_Molecules = "Small Molecules"
+     Clinical_Trials = "Clinical Trials"
+     Health_Policy = "Health Policy"
+     Patient_Education = "Patient Education"
+     Other_Rheumatic_Diseases = "Other Rheumatic Diseases"
+
+ class SummarySchema(BaseModel):
+     summary: str
+     # Topic is constrained to the enum above
+     topic: TopicEnum = TopicEnum.Other_Rheumatic_Diseases
+
+ class PaperSchema(BaseModel):
+     title: str
+     authors: str
+     journal: str
+     pmid: str
+
+ class TopicSummarySchema(BaseModel):
+     planning: str
+     summary: str
+
+ def evaluate_relevance(title: str, abstract: str) -> dict:
+     prompt = f"""
+     Title: {title}
+     Abstract: {abstract}
+     Instructions: Evaluate the relevance of this medical abstract for an audience of rheumatologists on a scale of 0 to 10, with 10 reserved only for large clinical trials in rheumatology.
+     Be very discerning and only give a score above 8 for papers that are highly clinically relevant to rheumatologists.
+     Respond in JSON format using the following schema:
+     {json.dumps(EvaluationSchema.model_json_schema())}
+     """
+
+     try:
+         response = client1.text_generation(
+             prompt,
+             max_new_tokens=512,
+             temperature=0.2,
+             grammar={"type": "json", "value": EvaluationSchema.model_json_schema()}
+         )
+         result = json.loads(response)
+         return result
+     except Exception as e:
+         logger.error(f"Error in evaluate_relevance: {e}")
+         raise
+
+ def summarize_abstract(abstract: str) -> dict:
+     prompt = f"""
+     Abstract: {abstract}
+     Instructions: Summarize this medical abstract in 1 sentence and select the most relevant topic from the following list:
+     {json.dumps([topic.value for topic in TopicEnum])}
+     Respond in JSON format using the following schema:
+     {json.dumps(SummarySchema.model_json_schema())}
+     """
+
+     try:
+         response = client1.text_generation(
+             prompt,
+             max_new_tokens=512,
+             temperature=0.2,
+             grammar={"type": "json", "value": SummarySchema.model_json_schema()}
+         )
+         result = json.loads(response)
+         return result
+     except Exception as e:
+         logger.error(f"Error in summarize_abstract: {e}")
+         raise
+
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+ def _make_api_call(client, prompt, max_tokens=4096, temp=0.2, schema=None):
+     try:
+         response = client.text_generation(
+             prompt,
+             max_new_tokens=max_tokens,
+             temperature=temp,
+             grammar={"type": "json", "value": schema} if schema else None
+         )
+         return json.loads(response)
+     except Exception as e:
+         logger.error(f"API call failed: {e}")
+         raise
+
+ def compose_newsletter(papers: pd.DataFrame) -> str:
+     if papers.empty:
+         logger.info("No papers provided to compose the newsletter.")
+         return ""
+
+     content = ["# This Week in Rheumatology\n"]
+     topics = papers['Topic'].unique()
+
+     for topic in topics:
+         try:
+             relevant_papers = papers[papers['Topic'] == topic]
+             # Convert to dicts with lowercase keys to match the expected schema
+             papers_dict = relevant_papers.rename(columns={
+                 'Title': 'title',
+                 'Authors': 'authors',
+                 'Journal': 'journal',
+                 'PMID': 'pmid',
+                 'Summary': 'summary'
+             }).to_dict('records')
+             result = None  # defined up front so the except block can log it safely
+             prompt = f"""
+             Instructions: Generate a brief summary of the latest research on {topic} using the following papers.
+             Papers: {json.dumps(papers_dict)}
+             Respond in JSON format using the following schema:
+             {json.dumps(TopicSummarySchema.model_json_schema())}
+             You have the option of using the planning field first to organize your thoughts before writing the summary.
+             The summary should be concise, but because you are summarizing several papers, it should be detailed enough to give the reader a good idea of the latest research in the field.
+             The papers may be somewhat disjointed, so you will need to think carefully about how you can transition between them with clever wording.
+             You can use anywhere from 1 to 3 paragraphs for the summary.
+             """
+
+             result = _make_api_call(
+                 client2,
+                 prompt,
+                 max_tokens=4096,
+                 temp=0.2,
+                 schema=TopicSummarySchema.model_json_schema()
+             )
+
+             # Log the raw response for debugging
+             logger.debug(f"Raw response from Hugging Face: {result}")
+
+             # Parse the JSON response
+             summary = TopicSummarySchema(**result)
+
+             # Convert the structured summary to Markdown
+             topic_content = f"## {topic}\n\n"
+             topic_content += f"{summary.summary}\n\n"
+
+             # Add a references section
+             topic_content += "### References\n\n"
+             relevant_papers = papers[papers['Topic'] == topic]
+             for _, paper in relevant_papers.iterrows():
+                 topic_content += (f"- {paper['Title']} by {paper['Authors']}. {paper['Journal']}. "
+                                   f"[PMID: {paper['PMID']}](https://pubmed.ncbi.nlm.nih.gov/{paper['PMID']}/)\n")
+
+             content.append(topic_content)
+
+         except Exception as e:
+             logger.error(f"Error processing topic {topic}: {e}")
+             logger.error(f"Raw response: {result}")
+             continue
+
+     return "\n".join(content)
main.py ADDED
@@ -0,0 +1,167 @@
+ import json
+ import os
+ from datetime import datetime, timezone
+
+ from fasthtml.common import *
+ from huggingface_hub import HfApi, hf_hub_download
+ from starlette.responses import FileResponse
+ from generate_newsletter import process_new_papers
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from apscheduler.triggers.cron import CronTrigger
+
+ from fasthtml_hf import setup_hf_backup
+
+ # Initialize Hugging Face API
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ DATASET_NAME = "cmcmaster/this_week_in_rheumatology"
+ api = HfApi(token=HF_TOKEN)
+
+ # Initialize scheduler
+ scheduler = BackgroundScheduler()
+
+ # Schedule newsletter generation to run every Monday at 1 AM UTC
+ scheduler.add_job(process_new_papers,
+                   CronTrigger(day_of_week='mon', hour=1),
+                   kwargs={
+                       'end_date': None,
+                       'test': False
+                   },
+                   id='generate_newsletter',
+                   name='Weekly newsletter generation',
+                   replace_existing=True)
+
+ css = Style("""
+     body {
+         font-family: Georgia, Times, serif;
+         line-height: 1.6;
+         color: #333;
+         max-width: 800px;
+         margin: 0 auto;
+         padding: 20px;
+         background: #fff;
+     }
+
+     h1, h2 {
+         color: #2c3e50;
+         font-family: Georgia, Times, serif;
+     }
+
+     a {
+         color: #2c3e50;
+         text-decoration: none;
+     }
+
+     a:hover {
+         text-decoration: underline;
+     }
+
+     ul {
+         list-style-type: none;
+         padding: 0;
+     }
+
+     li {
+         margin-bottom: 10px;
+     }
+
+     .newsletter-content {
+         margin-top: 20px;
+     }
+
+     .download-link {
+         display: inline-block;
+         padding: 10px 20px;
+         background-color: #2c3e50;
+         color: white;
+         border-radius: 3px;
+         margin: 10px 0;
+         font-family: Georgia, Times, serif;
+     }
+
+     .download-link:hover {
+         background-color: #34495e;
+         text-decoration: none;
+     }
+ """)
+
+ app = FastHTML(hdrs=(css, MarkdownJS(),
+                      HighlightJS(
+                          langs=['python', 'javascript', 'html', 'css'])))
+
+
+ # Start the scheduler when the app starts
+ @app.on_event("startup")
+ async def start_scheduler():
+     scheduler.start()
+
+
+ # Shut down the scheduler when the app stops
+ @app.on_event("shutdown")
+ async def shutdown_scheduler():
+     scheduler.shutdown()
+
+
+ def get_newsletter_list():
+     # Fetch the list of newsletters from the Hugging Face repository
+     files = api.list_repo_files(repo_id=DATASET_NAME, repo_type="dataset")
+     newsletters = [f for f in files if f.endswith('newsletter.json')]
+     return sorted(newsletters, reverse=True)
+
+
+ def get_newsletter_content(path):
+     # Download and parse the newsletter content
+     content = api.hf_hub_download(repo_id=DATASET_NAME,
+                                   filename=path,
+                                   repo_type="dataset")
+     with open(content, 'r') as f:
+         return json.load(f)
+
+
+ @app.get("/")
+ def index():
+     newsletters = get_newsletter_list()
+     links = [
+         Li(
+             A(datetime.strptime(n.split('/')[0], '%Y%m%d').strftime('%B %d, %Y'),
+               href=f"/newsletter/{n.split('/')[0]}")) for n in newsletters
+     ]
+     return Titled("This Week in Rheumatology", H2("Available Newsletters"),
+                   Ul(*links))
+
+
+ @app.get("/newsletter/{date}")
+ def newsletter(date: str):
+     path = f"{date}/newsletter.json"
+     pdf_path = f"{date}/newsletter.pdf"
+     try:
+         content = get_newsletter_content(path)
+         return Titled(
+             f"This Week in Rheumatology - {content['date']}",
+             A("Back to Index", href="/"),
+             Div(
+                 A("Download PDF", href=f"/download/{date}", cls="download-link")
+             ),
+             Div(content['content'], cls="marked"))
+     except Exception as e:
+         return Titled("Error", H2("Newsletter not found"),
+                       P(f"Unable to load newsletter for date: {date}"),
+                       A("Back to Index", href="/"))
+
+
+ @app.get("/download/{date}")
+ def download_pdf(date: str):
+     try:
+         pdf_path = f"{date}/newsletter.pdf"
+         content = api.hf_hub_download(repo_id=DATASET_NAME,
+                                       filename=pdf_path,
+                                       repo_type="dataset")
+         return FileResponse(content,
+                             media_type="application/pdf",
+                             filename=f"newsletter_{date}.pdf")
+     except Exception as e:
+         return Titled("Error", H2("PDF not found"),
+                       P(f"Unable to load PDF for date: {date}"),
+                       A("Back to Index", href="/"))
+
+ setup_hf_backup(app)
+ serve()
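
The publication cadence lives entirely in the `CronTrigger` above. One way to sanity-check a cron expression before deploying is to ask the trigger for its next fire time; a small sketch (pinning the timezone to UTC, which is an assumption about the scheduler's default):

```python
# Sketch: preview when the Monday 01:00 job would next fire.
from datetime import datetime, timezone

from apscheduler.triggers.cron import CronTrigger

trigger = CronTrigger(day_of_week='mon', hour=1, timezone=timezone.utc)
print(trigger.get_next_fire_time(None, datetime.now(timezone.utc)))
```
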
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fasthtml-hf
+ huggingface-hub
+ starlette
+ apscheduler
+ bio
+ datasets
+ pdfkit
+ jinja2
+ markdown2
+ pandas
+ pydantic
+ tenacity
search_terms.json ADDED
@@ -0,0 +1,104 @@
+ {
+     "search_strategy": {
+         "mesh_terms": [
+             "Rheumatic Diseases",
+             "Rheumatology",
+             "Arthritis, Rheumatoid",
+             "Lupus Erythematosus, Systemic",
+             "Osteoarthritis",
+             "Fibromyalgia",
+             "Sjogren's Syndrome",
+             "Scleroderma, Systemic",
+             "Polymyositis",
+             "Dermatomyositis",
+             "Vasculitis",
+             "Gout",
+             "Spondylarthropathies",
+             "Polymyalgia Rheumatica",
+             "Arthritis, Psoriatic",
+             "Arthritis, Juvenile"
+         ],
+         "keywords": [
+             "rheumat*",
+             "autoimmune",
+             "connective tissue disease",
+             "inflammatory arthritis",
+             "systemic inflammatory disease",
+             "musculoskeletal disorder",
+             "autoinflammatory syndrome",
+             "immunologic disease",
+             "crystal arthropathy"
+         ],
+         "specific_conditions": [
+             "ankylosing spondylitis",
+             "reactive arthritis",
+             "enteropathic arthritis",
+             "systemic sclerosis",
+             "mixed connective tissue disease",
+             "antiphospholipid syndrome",
+             "Behcet's disease",
+             "giant cell arteritis",
+             "Takayasu arteritis",
+             "ANCA-associated vasculitis",
+             "polymyositis",
+             "dermatomyositis",
+             "inclusion body myositis"
+         ],
+         "research_related_terms": [
+             "epidemiology",
+             "etiology",
+             "pathogenesis",
+             "diagnosis",
+             "treatment",
+             "therapy",
+             "prognosis",
+             "outcome",
+             "clinical trial",
+             "cohort study",
+             "case-control study",
+             "systematic review",
+             "meta-analysis",
+             "biomarker",
+             "genetic",
+             "immunology",
+             "imaging"
+         ],
+         "exclusion_terms": [
+             "veterinary",
+             "animal model"
+         ]
+     },
+     "search_fields": [
+         "Title/Abstract",
+         "MeSH Terms",
+         "Publication Type",
+         "Journal"
+     ],
+     "publication_types": [
+         "Journal Article",
+         "Review",
+         "Clinical Trial",
+         "Meta-Analysis",
+         "Randomized Controlled Trial",
+         "Practice Guideline"
+     ],
+     "languages": [
+         "English"
+     ],
+     "species": [
+         "Humans"
+     ],
+     "journals": [
+         "Annals of the Rheumatic Diseases",
+         "Arthritis & Rheumatology",
+         "Rheumatology",
+         "Journal of Rheumatology",
+         "Arthritis Research & Therapy",
+         "Seminars in Arthritis and Rheumatism",
+         "RMD Open",
+         "Clinical Rheumatology",
+         "Arthritis Care & Research",
+         "International Journal of Rheumatic Diseases"
+     ]
+ }
+
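
Each list here maps onto one fielded OR-group in the query assembled by `build_query` in generate_newsletter.py; for example, the `journals` array becomes a `[Journal]` clause. A reduced illustration of that interpolation:

```python
# Illustration: how a list from search_terms.json becomes a PubMed clause.
journals = ["Annals of the Rheumatic Diseases", "RMD Open"]
clause = ' OR '.join(f'"{j}"[Journal]' for j in journals)
print(f"({clause})")
# -> ("Annals of the Rheumatic Diseases"[Journal] OR "RMD Open"[Journal])
```
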
templates/newsletter_pdf.html ADDED
@@ -0,0 +1,13 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset="UTF-8">
+     <title>{{ title }}</title>
+ </head>
+ <body>
+     <h1>{{ title }}</h1>
+     <div class="content">
+         {{ content|safe }}
+     </div>
+ </body>
+ </html>