deploy at 2024-11-11 09:48:16.356051

Files changed:
- Dockerfile +10 -0
- README.md +6 -5
- generate_newsletter.py +382 -0
- hf_api.py +217 -0
- main.py +167 -0
- requirements.txt +12 -0
- search_terms.json +104 -0
- templates/newsletter_pdf.html +13 -0
Dockerfile
ADDED
@@ -0,0 +1,10 @@
FROM python:3.10
WORKDIR /code
COPY --link --chown=1000 . .
RUN mkdir -p /tmp/cache/
RUN chmod a+rwx -R /tmp/cache/
ENV HF_HUB_CACHE=HF_HOME
RUN pip install --no-cache-dir -r requirements.txt

ENV PYTHONUNBUFFERED=1 PORT=7860
CMD ["python", "main.py"]
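As written, ENV HF_HUB_CACHE=HF_HOME sets the huggingface_hub cache directory to the literal relative path "HF_HOME" rather than to the writable /tmp/cache/ created just above, which appears to be the intent. A quick way to see what the library will actually use at runtime, assuming a recent huggingface_hub release that exposes this constant:

from huggingface_hub.constants import HF_HUB_CACHE

# Resolved from the HF_HUB_CACHE / HF_HOME environment variables at import time.
print(HF_HUB_CACHE)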
README.md
CHANGED
@@ -1,10 +1,11 @@
+
 ---
-title:
-emoji:
-colorFrom:
+title: cmcmaster/this_week_in_rheumatology
+emoji: 🚀
+colorFrom: purple
 colorTo: red
 sdk: docker
+app_file: app.py
 pinned: false
+termination_grace_period: 2m
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
generate_newsletter.py
ADDED
@@ -0,0 +1,382 @@
import pandas as pd
import os
from datetime import datetime, timedelta, timezone
import json
from Bio import Entrez, Medline
from huggingface_hub import HfApi, hf_hub_download, DatasetCard, DatasetCardData
from datasets import Dataset, load_dataset
from hf_api import (
    evaluate_relevance,
    summarize_abstract,
    compose_newsletter
)
import logging
import argparse
from huggingface_hub import HfFileSystem
import pdfkit
from jinja2 import Environment, FileSystemLoader
import markdown2

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler()
    ]
)

# Retrieve environment variables
HF_TOKEN = os.environ.get("HF_TOKEN")
DATASET_NAME = os.environ.get("DATASET_NAME", "cmcmaster/this_week_in_rheumatology")

if not HF_TOKEN:
    logging.error("Hugging Face token not found. Set the HF_TOKEN environment variable.")
    exit(1)

# Initialize Hugging Face Hub API
api = HfApi(token=HF_TOKEN)

def ensure_repo_exists(api, repo_id, repo_type, token):
    try:
        api.repo_info(repo_id=repo_id, repo_type=repo_type)
        logging.info(f"Repository {repo_id} already exists.")
    except Exception as e:
        logging.info(f"Repository {repo_id} not found. Creating a new one.")
        try:
            api.create_repo(
                repo_id=repo_id,
                repo_type=repo_type,
                token=token,
                private=False,
                exist_ok=True
            )
            # Create a dataset card
            card_data = DatasetCardData(
                language="en",
                license="cc-by-sa-4.0",
                task_categories=["text-classification"],
                tags=["rheumatology", "medical-research"]
            )
            card = DatasetCard("---\n" + card_data.to_yaml() + "\n---\n# This Week in Rheumatology\n\nA weekly collection of relevant rheumatology papers.")
            api.upload_file(
                path_or_fileobj=str(card).encode(),
                path_in_repo="README.md",
                repo_id=repo_id,
                repo_type=repo_type,
                commit_message="Add dataset card",
                token=token
            )
            logging.info(f"Repository {repo_id} created successfully with a dataset card.")
        except Exception as create_error:
            logging.error(f"Failed to create repository {repo_id}: {create_error}")
            exit(1)

# Ensure the repository exists before proceeding
ensure_repo_exists(api, DATASET_NAME, repo_type="dataset", token=HF_TOKEN)

# Load search terms from JSON
with open('search_terms.json', 'r') as f:
    search_terms = json.load(f)

def build_query():
    # Constructing MeSH terms
    mesh_terms = ' OR '.join(f'"{term}"[MeSH Terms]' for term in search_terms['search_strategy']['mesh_terms'])

    # Constructing keywords
    keywords = ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['keywords'])

    # Constructing specific conditions
    specific_conditions = ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['specific_conditions'])

    # Constructing research-related terms
    research_terms = ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['research_related_terms'])

    # Constructing journal names
    journals = ' OR '.join(f'"{journal}"[Journal]' for journal in search_terms['journals'])

    # Correctly grouping exclusion terms with parentheses and using OR
    exclusion_terms = 'NOT (' + ' OR '.join(f'"{term}"[Title/Abstract]' for term in search_terms['search_strategy']['exclusion_terms']) + ')'

    # Grouping all inclusion terms within parentheses and combining with OR
    inclusion_terms = f"({mesh_terms} OR {keywords} OR {specific_conditions} OR {journals})"

    # Enclosing research terms within parentheses
    research_terms_grouped = f"({research_terms})"

    # Constructing the final query with proper grouping and operator precedence
    query = f"{inclusion_terms} AND {research_terms_grouped} {exclusion_terms}"

    # Adding filters for human studies, English language, and publication types
    human_filter = 'AND "humans"[MeSH Terms]'
    language_filter = 'AND "english"[Language]'
    pub_types = ' OR '.join(f'"{pt}"[Publication Type]' for pt in search_terms['publication_types'])
    pub_type_filter = f'AND ({pub_types})'

    # Exclude case reports
    exclude_case_reports = 'NOT "Case Reports"[Publication Type]'

    query = f"{query} {human_filter} {language_filter} {pub_type_filter} {exclude_case_reports}"

    logging.info(f"Built PubMed query: {query}")
    return query
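For illustration, a sketch of the kind of query string build_query assembles, using a hypothetical trimmed-down set of terms rather than the full strategy in search_terms.json:

# Hypothetical subset of the real term lists, purely to show the query shape.
mesh = ["Rheumatic Diseases", "Gout"]
keywords = ["rheumat*"]
research = ["clinical trial", "meta-analysis"]
exclusions = ["veterinary"]

inclusion = "(" + " OR ".join(f'"{t}"[MeSH Terms]' for t in mesh) \
            + " OR " + " OR ".join(f'"{t}"[Title/Abstract]' for t in keywords) + ")"
research_grouped = "(" + " OR ".join(f'"{t}"[Title/Abstract]' for t in research) + ")"
excluded = "NOT (" + " OR ".join(f'"{t}"[Title/Abstract]' for t in exclusions) + ")"

print(f"{inclusion} AND {research_grouped} {excluded}")
# ("Rheumatic Diseases"[MeSH Terms] OR "Gout"[MeSH Terms] OR "rheumat*"[Title/Abstract])
# AND ("clinical trial"[Title/Abstract] OR "meta-analysis"[Title/Abstract])
# NOT ("veterinary"[Title/Abstract])
# (printed as a single line in reality; wrapped here for readability)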
def search_pubmed(query, start_date: datetime, end_date: datetime):
    Entrez.email = "mcmastc1@gmail.com"  # Replace with your actual email
    try:
        handle = Entrez.esearch(
            db="pubmed",
            term=query,
            mindate=start_date.strftime('%Y/%m/%d'),
            maxdate=end_date.strftime('%Y/%m/%d'),
            usehistory="y",
            retmax=1000
        )
        results = Entrez.read(handle)
        logging.info(f"PubMed search completed. Found {results['Count']} papers.")
        return results
    except Exception as e:
        logging.error(f"Error searching PubMed: {e}")
        logging.error(f"Query: {query}")
        logging.error(f"Date range: {start_date.strftime('%Y/%m/%d')} to {end_date.strftime('%Y/%m/%d')}")
        raise

def fetch_details(id_list):
    ids = ",".join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="text")
    records = list(Medline.parse(handle))
    logging.info(f"Fetched details for {len(records)} papers.")
    return records
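A rough usage sketch of the two functions together, outside the weekly pipeline; the date range is an arbitrary example:

from datetime import datetime, timedelta, timezone

# One-off query: search the last 30 days and fetch Medline records for the hits.
end = datetime.now(timezone.utc)
start = end - timedelta(days=30)
results = search_pubmed(build_query(), start, end)
pmids = results.get("IdList", [])
if pmids:
    records = fetch_details(pmids)
    print(records[0].get("TI", ""))  # title of the first record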
def process_papers(records):
    data = []
    relevant_count = 0
    for record in records:
        article = {
            "PMID": record.get("PMID", ""),
            "Title": record.get("TI", ""),
            "Authors": ", ".join(record.get("AU", [])),
            "Journal": record.get("JT", ""),
            "Abstract": record.get("AB", ""),
            "Publication Type": ", ".join(record.get("PT", [])),
        }
        try:
            relevance = evaluate_relevance(article["Title"], article["Abstract"])
            # Only keep papers with a relevance score above 8
            if relevance.get("relevance_score", 0) > 8:
                summary = summarize_abstract(article["Abstract"])
                article["Summary"] = summary.get("summary", "")
                article["Topic"] = summary.get("topic", "")
                # Drop Abstract and Publication Type from article
                article.pop("Abstract", None)
                article.pop("Publication Type", None)
                data.append(article)
                relevant_count += 1
                logging.info(f"Paper PMID {article['PMID']} processed successfully. Relevance Score: {relevance.get('relevance_score', 0)}")
        except json.JSONDecodeError as json_err:
            logging.error(f"JSON decode error for paper PMID {article['PMID']}: {json_err}")
        except Exception as e:
            logging.error(f"Error processing paper PMID {article['PMID']}: {e}")

    logging.info(f"Processed {len(records)} papers. {relevant_count} were deemed relevant.")
    return pd.DataFrame(data)

def get_rheumatology_papers(start_date: datetime, end_date: datetime, test: bool = False):
    query = build_query()
    logging.info(f"Searching PubMed for papers between {start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}")
    logging.debug(f"PubMed query: {query}")  # Log the full query for debugging
    search_results = search_pubmed(query, start_date, end_date)
    id_list = search_results.get("IdList", [])
    if not id_list:
        logging.info("No new papers found.")
        return pd.DataFrame()

    logging.info(f"Fetching details for {len(id_list)} papers.")
    records = fetch_details(id_list)
    if test:
        logging.info("Running in test mode. Processing only 50 papers.")
        return process_papers(records[:50])
    else:
        return process_papers(records)

def cache_dataset(papers_df: pd.DataFrame, start_date: datetime, end_date: datetime):
    try:
        # Convert the DataFrame to a dict so it can be uploaded to the Hub
        papers_dict = papers_df.to_dict(orient="records")
        repo_path = f"{end_date.strftime('%Y%m%d')}/papers.jsonl"
        # Upload to the Hub
        api.upload_file(
            path_or_fileobj=json.dumps(papers_dict).encode('utf-8'),
            path_in_repo=repo_path,
            repo_id=DATASET_NAME,
            repo_type="dataset",
            commit_message=f"Add papers from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}",
            token=HF_TOKEN
        )
        logging.info(f"Papers cached successfully to repository {DATASET_NAME}.")
    except Exception as e:
        logging.error(f"Failed to cache papers: {e}")

def load_cached_papers(start_date: datetime, end_date: datetime, test: bool = False) -> pd.DataFrame:
    try:
        fs = HfFileSystem()
        # dataset_path points to the cached jsonl file within the dated subdirectory
        dataset_path = f"datasets/cmcmaster/this_week_in_rheumatology/{end_date.strftime('%Y%m%d')}/papers.jsonl"
        if fs.exists(dataset_path):
            dataset = load_dataset("jsonl", data_files={"train": dataset_path}, split="train")
            papers_df = dataset.to_pandas()
            return papers_df
        else:
            logging.info(f"No cache found for {end_date.strftime('%Y-%m-%d')}. Processing new papers.")
            return get_rheumatology_papers(start_date, end_date, test)
    except Exception as e:
        logging.info(f"Error loading cache: {e}. Processing new papers.")
        return get_rheumatology_papers(start_date, end_date, test)
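One caveat on the cache-hit branch: the datasets library registers its JSON/JSON Lines reader under the builder name "json"; a builder named "jsonl" does not appear to exist, so loading the cached file would likely need something along these lines (a sketch; the hf:// form of the path is an assumption about how the Hub file is addressed):

# Assumption: the generic "json" builder reads the cached papers.jsonl from the Hub.
dataset = load_dataset(
    "json",
    data_files={"train": f"hf://datasets/{DATASET_NAME}/{end_date.strftime('%Y%m%d')}/papers.jsonl"},
    split="train",
)
papers_df = dataset.to_pandas()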
def generate_pdf_newsletter(content: dict, end_date: datetime):
    """Generate a PDF version of the newsletter using pdfkit"""
    try:
        # Convert markdown to HTML
        html_content = markdown2.markdown(content['content'])

        # Setup Jinja2 template environment
        env = Environment(loader=FileSystemLoader('templates'))
        template = env.get_template('newsletter_pdf.html')

        # Render the template
        html = template.render(
            title=f"This Week in Rheumatology - {content['date']}",
            content=html_content
        )

        # Configure PDF options
        options = {
            'page-size': 'A4',
            'margin-top': '2cm',
            'margin-right': '2cm',
            'margin-bottom': '2cm',
            'margin-left': '2cm',
            'encoding': 'UTF-8',
            'enable-local-file-access': None,
            'quiet': ''
        }

        # Generate PDF
        pdf_path = f"{end_date.strftime('%Y%m%d')}/newsletter.pdf"
        os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

        # Add CSS to HTML string
        html_with_style = f"""
        <html>
        <head>
            <style>
                body {{
                    font-family: Arial, sans-serif;
                    line-height: 1.6;
                    margin: 0 auto;
                    max-width: 21cm; /* A4 width */
                    color: #333;
                }}
                h1, h2 {{ color: #2c3e50; }}
                h1 {{ font-size: 24px; margin-top: 2em; }}
                h2 {{ font-size: 20px; margin-top: 1.5em; }}
                a {{ color: #3498db; text-decoration: none; }}
                p {{ margin-bottom: 1em; }}
            </style>
        </head>
        <body>
            {html}
        </body>
        </html>
        """

        pdfkit.from_string(html_with_style, pdf_path, options=options)

        # Upload PDF to Hub
        with open(pdf_path, 'rb') as f:
            api.upload_file(
                path_or_fileobj=f,
                path_in_repo=pdf_path,
                repo_id=DATASET_NAME,
                repo_type="dataset",
                commit_message=f"Add PDF newsletter for {end_date.strftime('%Y-%m-%d')}",
                token=HF_TOKEN
            )
        logging.info("PDF newsletter generated and uploaded successfully")

    except Exception as e:
        logging.error(f"Failed to generate PDF newsletter: {e}")
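pdfkit is a wrapper around the external wkhtmltopdf binary, which the Dockerfile above does not appear to install, so the from_string call would fail until the binary is present. If it lives outside PATH it can also be pointed to explicitly (a sketch; the path assumes a Debian-style install such as apt-get install -y wkhtmltopdf):

# Assumed install location of the wkhtmltopdf binary inside the image.
config = pdfkit.configuration(wkhtmltopdf="/usr/bin/wkhtmltopdf")
pdfkit.from_string(html_with_style, pdf_path, options=options, configuration=config)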
def generate_and_store_newsletter(papers_df: pd.DataFrame, end_date: datetime):
    if papers_df.empty:
        logging.info("No papers to include in the newsletter.")
        return

    try:
        logging.info(f"Generating newsletter with {len(papers_df)} papers.")
        newsletter_content = compose_newsletter(papers_df)
        newsletter_data = {
            "date": end_date.strftime('%Y-%m-%d'),
            "content": newsletter_content
        }

        # Store JSON version
        newsletter_json = json.dumps(newsletter_data, indent=4)
        repo_path = f'{end_date.strftime("%Y%m%d")}/newsletter.json'
        api.upload_file(
            path_or_fileobj=newsletter_json.encode('utf-8'),
            path_in_repo=repo_path,
            repo_id=DATASET_NAME,
            repo_type="dataset",
            commit_message=f"Add newsletter for {end_date.strftime('%Y-%m-%d')}",
            token=HF_TOKEN
        )

        # Generate and store PDF version
        generate_pdf_newsletter(newsletter_data, end_date)

        logging.info(f"Newsletter (JSON and PDF) successfully pushed to repository {DATASET_NAME}.")
    except Exception as e:
        logging.error(f"Failed to generate or store newsletter: {e}")

def process_new_papers(end_date: datetime = None, test: bool = False):
    end_date = end_date or datetime.now(timezone.utc)
    start_date = end_date - timedelta(days=7)

    # Adjust the date range to search for papers published in the last 30 days
    search_start_date = end_date - timedelta(days=30)

    logging.info(f"Processing papers for the week: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    logging.info(f"Searching for papers published between: {search_start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}")

    papers_df = load_cached_papers(search_start_date, end_date, test)

    if papers_df.empty and not test:
        logging.info("No relevant papers found in cache or recent search.")
        return

    logging.info(f"Found {len(papers_df)} relevant papers for the newsletter.")

    # Cache the papers_df as a Hugging Face dataset
    cache_dataset(papers_df, start_date, end_date)

    # Generate and store the newsletter
    generate_and_store_newsletter(papers_df, end_date)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate a weekly Rheumatology newsletter.")
    parser.add_argument('--end_date', type=str, help='End date for the newsletter in YYYY-MM-DD format. Defaults to today.')
    parser.add_argument('--test', action='store_true', help='Run the script in test mode.')
    args = parser.parse_args()

    end_date = None
    if args.end_date:
        try:
            end_date = datetime.strptime(args.end_date, '%Y-%m-%d').replace(tzinfo=timezone.utc)
        except ValueError:
            logging.error("Invalid date format for --end_date. Use YYYY-MM-DD.")
            exit(1)

    process_new_papers(end_date, args.test)
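Besides the command-line entry point, the pipeline can be driven directly from Python; a minimal sketch (the date is an arbitrary example, and test=True caps processing at the first 50 fetched records):

from datetime import datetime, timezone
from generate_newsletter import process_new_papers

# Generate the newsletter for a fixed week in test mode.
process_new_papers(end_date=datetime(2024, 11, 11, tzinfo=timezone.utc), test=True)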
hf_api.py
ADDED
@@ -0,0 +1,217 @@
import os
import json
import logging
from enum import Enum
from pydantic import BaseModel, Field
import pandas as pd
from huggingface_hub import InferenceClient
from tenacity import retry, stop_after_attempt, wait_exponential

# Configure logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create handlers
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

file_handler = logging.FileHandler("hf_api.log")
file_handler.setLevel(logging.INFO)

# Create formatters and add to handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

# Add handlers to the logger
if not logger.handlers:
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

# Validate and retrieve the Hugging Face API token
HF_TOKEN = os.environ.get('HF_TOKEN')
if not HF_TOKEN:
    logger.error("Hugging Face API token not found. Set the HF_TOKEN environment variable.")
    raise EnvironmentError("HF_TOKEN environment variable is not set.")

# Initialize the InferenceClient
MODEL_NAME1 = "meta-llama/Llama-3.1-8B-Instruct"
MODEL_NAME2 = "Qwen/Qwen2.5-72B-Instruct"
try:
    client1 = InferenceClient(model=MODEL_NAME1, token=HF_TOKEN)
    logger.info(f"InferenceClient for model '{MODEL_NAME1}' instantiated successfully.")
except Exception as e:
    logger.error(f"Failed to instantiate InferenceClient for model '{MODEL_NAME1}': {e}")
    raise

try:
    client2 = InferenceClient(model=MODEL_NAME2, token=HF_TOKEN)
    logger.info(f"InferenceClient for model '{MODEL_NAME2}' instantiated successfully.")
except Exception as e:
    logger.error(f"Failed to instantiate InferenceClient for model '{MODEL_NAME2}': {e}")
    raise

# Define Pydantic schemas
class EvaluationSchema(BaseModel):
    reasoning: str
    relevance_score: int = Field(ge=0, le=10)

class TopicEnum(Enum):
    Rheumatoid_Arthritis = "Rheumatoid Arthritis"
    Systemic_Lupus_Erythematosus = "Systemic Lupus Erythematosus"
    Scleroderma = "Scleroderma"
    Sjogren_s_Disease = "Sjogren's Disease"
    Ankylosing_Spondylitis = "Ankylosing Spondylitis"
    Psoriatic_Arthritis = "Psoriatic Arthritis"
    Gout = "Gout"
    Vasculitis = "Vasculitis"
    Osteoarthritis = "Osteoarthritis"
    Infectious_Diseases = "Infectious Diseases"
    Immunology = "Immunology"
    Genetics = "Genetics"
    Biologics = "Biologics"
    Biosimilars = "Biosimilars"
    Small_Molecules = "Small Molecules"
    Clinical_Trials = "Clinical Trials"
    Health_Policy = "Health Policy"
    Patient_Education = "Patient Education"
    Other_Rheumatic_Diseases = "Other Rheumatic Diseases"

class SummarySchema(BaseModel):
    summary: str
    # Enum for topic
    topic: TopicEnum = TopicEnum.Other_Rheumatic_Diseases

class PaperSchema(BaseModel):
    title: str
    authors: str
    journal: str
    pmid: str

class TopicSummarySchema(BaseModel):
    planning: str
    summary: str
def evaluate_relevance(title: str, abstract: str) -> EvaluationSchema:
    prompt = f"""
    Title: {title}
    Abstract: {abstract}
    Instructions: Evaluate the relevance of this medical abstract for an audience of rheumatologists on a scale of 0 to 10 with 10 being reserved only for large clinical trials in rheumatology.
    Be very discerning and only give a score above 8 for papers that are highly clinically relevant to rheumatologists.
    Respond in JSON format using the following schema:
    {json.dumps(EvaluationSchema.model_json_schema())}
    """

    try:
        response = client1.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.2,
            grammar={"type": "json", "value": EvaluationSchema.model_json_schema()}
        )
        result = json.loads(response)
        return result
    except Exception as e:
        logger.error(f"Error in evaluate_relevance: {e}")
        raise
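For reference, the constrained-generation pattern this module relies on, shown in isolation: text_generation is asked for JSON that conforms to a Pydantic model's schema via the grammar argument (a text-generation-inference feature), and the reply is validated back into the model. The prompt and token below are placeholders:

from huggingface_hub import InferenceClient
from pydantic import BaseModel, Field

class Verdict(BaseModel):
    reasoning: str
    relevance_score: int = Field(ge=0, le=10)

client = InferenceClient(model="meta-llama/Llama-3.1-8B-Instruct", token="hf_xxx")  # placeholder token
raw = client.text_generation(
    "Rate this abstract for rheumatologists. Respond in JSON.",
    max_new_tokens=256,
    grammar={"type": "json", "value": Verdict.model_json_schema()},
)
verdict = Verdict.model_validate_json(raw)  # raises if the reply drifts from the schema
print(verdict.relevance_score)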
def summarize_abstract(abstract: str) -> SummarySchema:
    prompt = f"""
    Abstract: {abstract}
    Instructions: Summarize this medical abstract in 1 sentence and select the most relevant topic from the following enum:
    {TopicEnum.__doc__}
    Respond in JSON format using the following schema:
    {json.dumps(SummarySchema.model_json_schema())}
    """

    try:
        response = client1.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.2,
            grammar={"type": "json", "value": SummarySchema.model_json_schema()}
        )
        result = json.loads(response)
        return result
    except Exception as e:
        logger.error(f"Error in summarize_abstract: {e}")
        raise

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def _make_api_call(client, prompt, max_tokens=4096, temp=0.2, schema=None):
    try:
        response = client.text_generation(
            prompt,
            max_new_tokens=max_tokens,
            temperature=temp,
            grammar={"type": "json", "value": schema} if schema else None
        )
        return json.loads(response)
    except Exception as e:
        logger.error(f"API call failed: {e}")
        raise

def compose_newsletter(papers: pd.DataFrame) -> str:
    if papers.empty:
        logger.info("No papers provided to compose the newsletter.")
        return ""

    content = ["# This Week in Rheumatology\n"]
    topics = papers['Topic'].unique()

    for topic in topics:
        try:
            relevant_papers = papers[papers['Topic'] == topic]
            # Convert to dict with lowercase keys to match the expected schema
            papers_dict = relevant_papers.rename(columns={
                'Title': 'title',
                'Authors': 'authors',
                'Journal': 'journal',
                'PMID': 'pmid',
                'Summary': 'summary'
            }).to_dict('records')

            prompt = f"""
            Instructions: Generate a brief summary of the latest research on {topic} using the following papers.
            Papers: {json.dumps(papers_dict)}
            Respond in JSON format using the following schema:
            {json.dumps(TopicSummarySchema.model_json_schema())}
            You have the option of using the planning field first to organize your thoughts before writing the summary.
            The summary should be concise, but because you are summarizing several papers, it should be detailed enough to give the reader a good idea of the latest research in the field.
            The papers may be somewhat disjointed, so you will need to think carefully about how you can transition between them with clever wording.
            You can use anywhere from 1 to 3 paragraphs for the summary.
            """

            result = _make_api_call(
                client2,
                prompt,
                max_tokens=4096,
                temp=0.2,
                schema=TopicSummarySchema.model_json_schema()
            )

            # Log the raw response for debugging
            logger.debug(f"Raw response from Hugging Face: {result}")

            # Parse the JSON response
            summary = TopicSummarySchema(**result)

            # Convert the structured summary to Markdown
            topic_content = f"## {topic}\n\n"
            topic_content += f"{summary.summary}\n\n"

            # Add a references section
            topic_content += "### References\n\n"
            relevant_papers = papers[papers['Topic'] == topic]
            for _, paper in relevant_papers.iterrows():
                topic_content += (f"- {paper['Title']} by {paper['Authors']}. {paper['Journal']}. "
                                  f"[PMID: {paper['PMID']}](https://pubmed.ncbi.nlm.nih.gov/{paper['PMID']}/)\n")

            content.append(topic_content)

        except Exception as e:
            logger.error(f"Error processing topic {topic}: {e}")
            logger.error(f"Raw response: {result}")
            continue

    return "\n".join(content)
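A toy invocation showing the column names compose_newsletter expects from process_papers; every value below is made up:

import pandas as pd

papers = pd.DataFrame([{
    "PMID": "00000000",  # made-up identifier
    "Title": "Example trial of drug X in rheumatoid arthritis",
    "Authors": "Doe J, Roe R",
    "Journal": "Example Journal",
    "Summary": "One-sentence summary produced by summarize_abstract.",
    "Topic": "Rheumatoid Arthritis",
}])
markdown = compose_newsletter(papers)
print(markdown.splitlines()[0])  # "# This Week in Rheumatology"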
main.py
ADDED
@@ -0,0 +1,167 @@
import json
import os
from datetime import datetime, timezone

from fasthtml.common import *
from huggingface_hub import HfApi, hf_hub_download
from starlette.responses import FileResponse
from generate_newsletter import process_new_papers
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger

from fasthtml_hf import setup_hf_backup

# Initialize Hugging Face API
HF_TOKEN = os.environ.get("HF_TOKEN")
DATASET_NAME = "cmcmaster/this_week_in_rheumatology"
api = HfApi(token=HF_TOKEN)

# Initialize scheduler
scheduler = BackgroundScheduler()

# Schedule newsletter generation to run every Monday at 1 AM UTC
scheduler.add_job(process_new_papers,
                  CronTrigger(day_of_week='mon', hour=1),
                  kwargs={
                      'end_date': None,
                      'test': False
                  },
                  id='generate_newsletter',
                  name='Weekly newsletter generation',
                  replace_existing=True)
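The scheduling comment assumes the container clock is UTC (typically the case for the stock python:3.10 image); to make that explicit, the trigger can carry its own timezone, for example (a sketch using the standard-library zoneinfo module):

from zoneinfo import ZoneInfo
from apscheduler.triggers.cron import CronTrigger

# Pin the weekly run to 01:00 UTC regardless of the host timezone.
trigger = CronTrigger(day_of_week="mon", hour=1, timezone=ZoneInfo("UTC"))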
css = Style("""
    body {
        font-family: Georgia, Times, serif;
        line-height: 1.6;
        color: #333;
        max-width: 800px;
        margin: 0 auto;
        padding: 20px;
        background: #fff;
    }

    h1, h2 {
        color: #2c3e50;
        font-family: Georgia, Times, serif;
    }

    a {
        color: #2c3e50;
        text-decoration: none;
    }

    a:hover {
        text-decoration: underline;
    }

    ul {
        list-style-type: none;
        padding: 0;
    }

    li {
        margin-bottom: 10px;
    }

    .newsletter-content {
        margin-top: 20px;
    }

    .download-link {
        display: inline-block;
        padding: 10px 20px;
        background-color: #2c3e50;
        color: white;
        border-radius: 3px;
        margin: 10px 0;
        font-family: Georgia, Times, serif;
    }

    .download-link:hover {
        background-color: #34495e;
        text-decoration: none;
    }
""")

app = FastHTML(hdrs=(css, MarkdownJS(),
                     HighlightJS(
                         langs=['python', 'javascript', 'html', 'css'])))
# Start the scheduler when the app starts
@app.on_event("startup")
async def start_scheduler():
    scheduler.start()


# Shut down the scheduler when the app stops
@app.on_event("shutdown")
async def shutdown_scheduler():
    scheduler.shutdown()


def get_newsletter_list():
    # Fetch the list of newsletters from the Hugging Face repository
    files = api.list_repo_files(repo_id=DATASET_NAME, repo_type="dataset")
    newsletters = [f for f in files if f.endswith('newsletter.json')]
    return sorted(newsletters, reverse=True)


def get_newsletter_content(path):
    # Download and parse the newsletter content
    content = api.hf_hub_download(repo_id=DATASET_NAME,
                                  filename=path,
                                  repo_type="dataset")
    with open(content, 'r') as f:
        return json.load(f)


@app.get("/")
def index():
    newsletters = get_newsletter_list()
    links = [
        Li(
            A(datetime.strptime(n.split('/')[0], '%Y%m%d').strftime('%B %d, %Y'),
              href=f"/newsletter/{n.split('/')[0]}")) for n in newsletters
    ]
    return Titled("This Week in Rheumatology", H2("Available Newsletters"),
                  Ul(*links))


@app.get("/newsletter/{date}")
def newsletter(date: str):
    path = f"{date}/newsletter.json"
    pdf_path = f"{date}/newsletter.pdf"
    try:
        content = get_newsletter_content(path)
        return Titled(
            f"This Week in Rheumatology - {content['date']}",
            A("Back to Index", href="/"),
            Div(
                A("Download PDF", href=f"/download/{date}", cls="download-link")
            ),
            Div(content['content'], cls="marked"))
    except Exception as e:
        return Titled("Error", H2("Newsletter not found"),
                      P(f"Unable to load newsletter for date: {date}"),
                      A("Back to Index", href="/"))


@app.get("/download/{date}")
def download_pdf(date: str):
    try:
        pdf_path = f"{date}/newsletter.pdf"
        content = api.hf_hub_download(repo_id=DATASET_NAME,
                                      filename=pdf_path,
                                      repo_type="dataset")
        return FileResponse(content,
                            media_type="application/pdf",
                            filename=f"newsletter_{date}.pdf")
    except Exception as e:
        return Titled("Error", H2("PDF not found"),
                      P(f"Unable to load PDF for date: {date}"),
                      A("Back to Index", href="/"))

setup_hf_backup(app)
serve()
requirements.txt
ADDED
@@ -0,0 +1,12 @@
fasthtml-hf
huggingface-hub
starlette
apscheduler
bio
datasets
pdfkit
jinja2
markdown2
pandas
pydantic
tenacity
search_terms.json
ADDED
@@ -0,0 +1,104 @@
{
    "search_strategy": {
        "mesh_terms": [
            "Rheumatic Diseases",
            "Rheumatology",
            "Arthritis, Rheumatoid",
            "Lupus Erythematosus, Systemic",
            "Osteoarthritis",
            "Fibromyalgia",
            "Sjogren's Syndrome",
            "Scleroderma, Systemic",
            "Polymyositis",
            "Dermatomyositis",
            "Vasculitis",
            "Gout",
            "Spondylarthropathies",
            "Polymyalgia Rheumatica",
            "Arthritis, Psoriatic",
            "Arthritis, Juvenile"
        ],
        "keywords": [
            "rheumat*",
            "autoimmune",
            "connective tissue disease",
            "inflammatory arthritis",
            "systemic inflammatory disease",
            "musculoskeletal disorder",
            "autoinflammatory syndrome",
            "immunologic disease",
            "crystal arthropathy"
        ],
        "specific_conditions": [
            "ankylosing spondylitis",
            "reactive arthritis",
            "enteropathic arthritis",
            "systemic sclerosis",
            "mixed connective tissue disease",
            "antiphospholipid syndrome",
            "Behcet's disease",
            "giant cell arteritis",
            "Takayasu arteritis",
            "ANCA-associated vasculitis",
            "polymyositis",
            "dermatomyositis",
            "inclusion body myositis"
        ],
        "research_related_terms": [
            "epidemiology",
            "etiology",
            "pathogenesis",
            "diagnosis",
            "treatment",
            "therapy",
            "prognosis",
            "outcome",
            "clinical trial",
            "cohort study",
            "case-control study",
            "systematic review",
            "meta-analysis",
            "biomarker",
            "genetic",
            "immunology",
            "imaging"
        ],
        "exclusion_terms": [
            "veterinary",
            "animal model"
        ]
    },
    "search_fields": [
        "Title/Abstract",
        "MeSH Terms",
        "Publication Type",
        "Journal"
    ],
    "publication_types": [
        "Journal Article",
        "Review",
        "Clinical Trial",
        "Meta-Analysis",
        "Randomized Controlled Trial",
        "Practice Guideline"
    ],
    "languages": [
        "English"
    ],
    "species": [
        "Humans"
    ],
    "journals": [
        "Annals of the Rheumatic Diseases",
        "Arthritis & Rheumatology",
        "Rheumatology",
        "Journal of Rheumatology",
        "Arthritis Research & Therapy",
        "Seminars in Arthritis and Rheumatism",
        "RMD Open",
        "Clinical Rheumatology",
        "Arthritis Care & Research",
        "International Journal of Rheumatic Diseases"
    ]
}
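A quick sanity check that this file exposes every key build_query reads (a sketch, nothing more):

import json

with open("search_terms.json") as f:
    terms = json.load(f)

strategy_keys = {"mesh_terms", "keywords", "specific_conditions",
                 "research_related_terms", "exclusion_terms"}
assert strategy_keys.issubset(terms["search_strategy"])
assert "journals" in terms and "publication_types" in terms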
templates/newsletter_pdf.html
ADDED
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>{{ title }}</title>
</head>
<body>
    <h1>{{ title }}</h1>
    <div class="content">
        {{ content|safe }}
    </div>
</body>
</html>