rbiswasfc committed
Commit d2071ed
1 Parent(s): 75d46f7
Files changed (6):
  1. Dockerfile +4 -23
  2. app.py +6 -1
  3. main.py +0 -618
  4. profile_app.py +0 -52
  5. requirements.txt +1 -3
  6. supervisord.conf +0 -20
Dockerfile CHANGED
@@ -1,32 +1,13 @@
FROM python:3.10

- RUN useradd -m -u 1000 user
- USER user
- ENV HOME=/home/user \
-     PATH=/home/user/.local/bin:$PATH
+ WORKDIR /code

- # Set the working directory
- WORKDIR $HOME/app
-
- COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt
- RUN git config --global credential.helper store
-
- COPY . .
- COPY supervisord.conf .
-
- # Set permissions on the log file
- USER root
- RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
+ COPY --link --chown=1000 . .

RUN mkdir -p /tmp/cache/
- RUN mkdir -p /.cache
RUN chmod a+rwx -R /tmp/cache/
- RUN chmod a+rwx -R /.cache
ENV HF_HUB_CACHE=HF_HOME
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

ENV PYTHONUNBUFFERED=1 PORT=7860
- CMD ["python", "app.py"]
-
- # # Run supervisord
- # CMD ["supervisord", "-c", "supervisord.conf"]
+ CMD ["python", "app.py"]
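The rebuilt image drops the non-root user, supervisord, and log-file setup: everything lives under /code and the container simply runs app.py on the PORT baked into the environment. A quick way to sanity-check a running container is one request against that port. The script below is a hypothetical helper, not part of this commit; it assumes the image was started with something like `docker run -p 7860:7860 <image>`.

    # smoke_test.py -- hypothetical helper, not part of this commit.
    # Assumes the container publishes the PORT set in the Dockerfile (7860 by default).
    import os

    import requests

    port = int(os.environ.get("PORT", 7860))
    resp = requests.get(f"http://127.0.0.1:{port}/", timeout=10)
    print(resp.status_code)  # expect 200 if the FastHTML app is serving
    resp.raise_for_status()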
 
 
 
app.py CHANGED
@@ -159,4 +159,9 @@ def get(date: str):
        return Div(f"Error displaying articles: {str(e)}")


- serve()
+ # serve()
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
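Dropping FastHTML's serve() helper in favor of an explicit uvicorn.run call lets app.py control its own host/port binding, which is what the new Dockerfile's CMD ["python", "app.py"] relies on now that supervisord is gone. A minimal sketch of the same entrypoint pattern, assuming a FastHTML app created with fast_app(); the route here is illustrative, not taken from this repository.

    # Sketch of the new entrypoint pattern; names below are illustrative.
    import os

    from fasthtml.common import fast_app

    app, rt = fast_app()


    @rt("/")
    def get():
        return "ok"


    if __name__ == "__main__":
        import uvicorn

        # Bind on all interfaces and honor the PORT env var set in the Dockerfile.
        uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))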
main.py DELETED
@@ -1,618 +0,0 @@
- import os
- import re
- import shutil
- import time
-
- import dotenv
- import fitz  # PyMuPDF
- import pandas as pd
- import requests
- import schedule
- import srsly
- from bs4 import BeautifulSoup
- from datasets import Dataset, Image, concatenate_datasets, load_dataset
- from huggingface_hub import create_repo, login, whoami
- from PIL import Image as PILImage
- from retry import retry
- from tqdm.auto import tqdm
-
- dotenv.load_dotenv()
- login(token=os.environ.get("HF_TOKEN"))
-
- hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
- HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
- HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
-
-
- ########################################################
- ### GET ZOTERO ITEMS
- ########################################################
-
-
- @retry(tries=3, delay=8)
- def _fetch_one_zotero_batch(url, headers, params):
-     """
-     Fetch articles from Zotero API
-     """
-     response = requests.get(url, headers=headers, params=params)
-     response.raise_for_status()
-     return response.json()
-
-
- def get_zotero_items(debug=False):
-     """
-     fetch items from zotero library
-     """
-
-     GROUP_ID = os.getenv("GROUP_ID")
-     API_KEY = os.getenv("API_KEY")
-     BASE_URL = f"https://api.zotero.org/groups/{GROUP_ID}/items"
-     LIMIT = 100
-
-     headers = {"Zotero-API-Key": API_KEY, "Content-Type": "application/json"}
-
-     items = []
-     start = 0
-
-     i = 1
-     while True:
-         i += 1
-         params = {"limit": LIMIT, "start": start}
-         page_items = _fetch_one_zotero_batch(BASE_URL, headers, params)
-
-         if not page_items:
-             break
-
-         items.extend(page_items)
-         start += LIMIT
-         print(f"# items fetched {len(items)}")
-
-         if debug:
-             if len(items) > 1600:
-                 break
-
-     return items
-
-
- ########################################################
- ### EXTRACT ARXIV LINKS AND PDFs
- ########################################################
-
-
- def get_arxiv_items(items):
-     visited = set()
-
-     arxiv_items = []
-     arxiv_pattern = re.compile(r"arxiv.org/abs/(\d+\.\d+)")
-
-     for item in items:
-         data = item.get("data", {})
-         attachments = item.get("links", {}).get("attachment", {})
-
-         arxiv_url = None
-         pdf_url = None
-
-         if "url" in data and "arxiv.org" in data["url"]:
-             arxiv_match = arxiv_pattern.search(data["url"])
-             if arxiv_match:
-                 arxiv_url = data["url"]
-
-         if attachments:
-             pdf_url = attachments["href"]
-
-         if arxiv_url:
-             arxiv_id = arxiv_url.split("/")[-1]
-             if arxiv_id in visited:
-                 continue
-
-             authors = []
-             for author in data.get("creators", []):
-                 authors.append(f"{author.get('firstName', '')} {author.get('lastName', '')}")
-
-             arxiv_items.append(
-                 {
-                     "arxiv_id": arxiv_id,
-                     "arxiv_url": arxiv_url,
-                     "title": data.get("title", ""),
-                     "authors": authors,
-                     "pdf_url": pdf_url,
-                     "date_published": data.get("date", ""),
-                     "added_by": item["meta"]["createdByUser"]["username"],
-                     "date_added": data.get("dateAdded", ""),
-                 }
-             )
-
-             visited.add(arxiv_id)
-
-     return arxiv_items
-
-
- @retry(tries=3, delay=15, backoff=2)
- def fetch_arxiv_html(arxiv_id):
-     url = f"https://ar5iv.labs.arxiv.org/html/{arxiv_id.split('v')[0]}"
-     response = requests.get(url)
-     return response.text if response.status_code == 200 else None
-
-
- def fetch_arxiv_htmls(arxiv_items):
-     for item in tqdm(arxiv_items):
-         html = fetch_arxiv_html(item["arxiv_id"])
-         if html:
-             item["raw_content"] = html
-         else:
-             print(f"failed to fetch html for {item['arxiv_id']}")
-             item["raw_content"] = "Error"
-
-     return arxiv_items
-
-
- ########################################################
- ### PARSE CONTENT FROM ARXIV HTML #
- ########################################################
-
-
- def parse_html_content(html):
-     """
-     Parse content from arxiv html
-     """
-     arxiv_id_match = re.search(r"\[(\d+\.\d+(v\d+)?)\]", html)
-     arxiv_id = arxiv_id_match.group(1) if arxiv_id_match else None
-     soup = BeautifulSoup(html, "html.parser")
-     result = []
-
-     # Extract paper title
-     try:
-         paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(strip=True)
-     except Exception:
-         paper_title = soup.find("title").get_text(strip=True)
-         paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)
-
-     for math in soup.find_all("math"):
-         math.decompose()
-     for cite in soup.find_all("cite"):
-         cite.decompose()
-
-     # Extract abstract
-     abstract = soup.find("div", class_="ltx_abstract")
-     if abstract:
-         result.append(
-             {
-                 "content": " ".join(p.get_text(strip=True) for p in abstract.find_all("p")).replace(")", ") "),
-                 "title": "Abstract",
-                 "paper_title": paper_title,
-                 "content_type": "abstract",
-             }
-         )
-     # Extract sections
-     sections = soup.find_all("section", class_="ltx_section")
-     for index, section in enumerate(sections):
-         section_title = section.find("h2", class_="ltx_title ltx_title_section")
-         section_title = section_title.get_text(strip=True) if section_title else f"Section {index + 1}"
-         section_content = section.get_text(strip=True).replace(")", ") ")
-
-         content_type = "body"
-         if index == 0:
-             content_type = "introduction"
-         elif index == len(sections) - 1:
-             content_type = "conclusion"
-
-         result.append(
-             {
-                 "content": section_content,
-                 "title": section_title,
-                 "paper_title": paper_title,
-                 "content_type": content_type,
-             }
-         )
-
-     for c in result:
-         c["arxiv_id"] = arxiv_id
-
-     return result
-
-
- ########################################################
- ### GET TEXTS FROM PDF & PARSE
- ########################################################
-
-
- def get_pdf_text(arxiv_id):
-     url = "http://147.189.194.113:80/extract"  # fix: currently down
-
-     try:
-         response = requests.get(url, params={"arxiv_id": arxiv_id})
-         response = response.json()
-         if "text" in response:
-             return response["text"]
-         return None
-     except Exception as e:
-         print(e)
-         return None
-
-
- def get_content_type(section_type, section_count):
-     """Determine the content type based on the section type and count"""
-     if section_type == "abstract":
-         return "abstract"
-     elif section_type == "introduction" or section_count == 1:
-         return "introduction"
-     elif section_type == "conclusion" or section_type == "references":
-         return section_type
-     else:
-         return "body"
-
-
- def get_section_type(title):
-     """Determine the section type based on the title"""
-     title_lower = title.lower()
-     if "abstract" in title_lower:
-         return "abstract"
-     elif "introduction" in title_lower:
-         return "introduction"
-     elif "conclusion" in title_lower:
-         return "conclusion"
-     elif "reference" in title_lower:
-         return "references"
-     else:
-         return "body"
-
-
- def parse_markdown_content(md_content, arxiv_id):
-     """
-     Parses markdown content to identify and extract sections based on headers.
-     """
-
-     lines = md_content.split("\n")
-     parsed = []
-     current_section = None
-     content = []
-     paper_title = None
-     current_title = None
-
-     # identify sections based on headers
-     for line in lines:
-         if line.startswith("#"):
-             if paper_title is None:
-                 paper_title = line.lstrip("#").strip()
-                 continue
-             if content:
-                 if current_title:
-                     parsed.append(
-                         {
-                             "content": " ".join(content),
-                             "title": current_title,
-                             "paper_title": paper_title,
-                             "content_type": get_content_type(current_section, len(parsed)),
-                             "arxiv_id": arxiv_id,
-                         }
-                     )
-                 content = []
-
-             current_title = line.lstrip("#").lstrip("#").lstrip()
-             if "bit" not in current_title:
-                 current_title = (
-                     current_title.lstrip("123456789")
-                     .lstrip()
-                     .lstrip(".")
-                     .lstrip()
-                     .lstrip("123456789")
-                     .lstrip()
-                     .lstrip(".")
-                     .lstrip()
-                 )
-             current_section = get_section_type(current_title)
-
-         else:
-             content.append(line)
-
-     # Add the last section
-     if content and current_title:
-         parsed.append(
-             {
-                 "content": " ".join(content).replace(")", ") "),
-                 "title": current_title,
-                 "paper_title": paper_title,
-                 "content_type": get_content_type(current_section, len(parsed)),
-                 "arxiv_id": arxiv_id,
-             }
-         )
-
-     return parsed
-
-
- ########################################################
- ### Image Dataset
- ########################################################
-
-
- def download_arxiv_pdf(arxiv_id):
-     arxiv_id = arxiv_id.split("v")[0]
-     url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
-     response = requests.get(url)
-     if response.status_code == 200:
-         return response.content
-     else:
-         raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
-
-
- def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
-     # Create output folder if it doesn't exist
-     os.makedirs(output_folder, exist_ok=True)
-
-     # Open the PDF
-     doc = fitz.open(stream=pdf_content, filetype="pdf")
-
-     # Iterate through pages
-     for page_num in range(len(doc)):
-         page = doc.load_page(page_num)
-
-         # Convert page to image
-         pix = page.get_pixmap()
-
-         # Save image as JPEG
-         image_path = os.path.join(output_folder, f"page_{page_num + 1}.jpg")
-         pix.save(image_path)
-         # print(f"Saved {image_path}")
-
-         if page_num >= max_pages:
-             break
-
-     doc.close()
-
-
- def save_arxiv_article_images(arxiv_id):
-     output_folder = os.path.join("data", "arxiv_images", arxiv_id)
-     try:
-         pdf_content = download_arxiv_pdf(arxiv_id)
-         pdf_to_jpegs(pdf_content, output_folder)
-     except Exception as e:
-         print(f"An error occurred: {str(e)}")
-
-
- def create_hf_image_dataset(base_dir):
-     data = []
-
-     # Walk through the directory
-     for root, dirs, files in os.walk(base_dir):
-         for file in files:
-             if file.endswith(".jpg"):
-                 # Extract arxiv_id from the path
-                 arxiv_id = os.path.basename(root)
-
-                 # Extract page number from the filename
-                 match = re.search(r"page_(\d+)", file)
-                 if match:
-                     page_number = int(match.group(1))
-                 else:
-                     continue  # Skip if page number can't be extracted
-
-                 # Full path to the image
-                 image_path = os.path.join(root, file)
-
-                 # Open the image to get its size
-                 with PILImage.open(image_path) as img:
-                     width, height = img.size
-
-                 # Add the data
-                 data.append(
-                     {"image": image_path, "arxiv_id": arxiv_id, "page_number": page_number, "width": width, "height": height}
-                 )
-
-     # Create the dataset
-     dataset = Dataset.from_dict(
-         {
-             "image": [d["image"] for d in data],
-             "arxiv_id": [d["arxiv_id"] for d in data],
-             "page_number": [d["page_number"] for d in data],
-         }
-     )
-
-     # Cast the image column to Image
-     dataset = dataset.cast_column("image", Image())
-
-     return dataset
-
-
- ########################################################
- ### HF UPLOAD
- ########################################################
-
-
- def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
-     # repo_id = HF_REPO_ID
-     create_repo(
-         repo_id=HF_REPO_ID_TXT,
-         token=os.environ.get("HF_TOKEN"),
-         private=True,
-         repo_type="dataset",
-         exist_ok=True,
-     )
-
-     create_repo(
-         repo_id=HF_REPO_ID_IMG,
-         token=os.environ.get("HF_TOKEN"),
-         private=True,
-         repo_type="dataset",
-         exist_ok=True,
-     )
-
-     # upload image dataset
-     try:
-         img_ds = create_hf_image_dataset("data/arxiv_images")
-         try:
-             old_img_ds = load_dataset(HF_REPO_ID_IMG, "images")["train"]
-             img_ds = concatenate_datasets([old_img_ds, img_ds])
-         except Exception as e:
-             print(e)
-         img_ds.push_to_hub(HF_REPO_ID_IMG, "images", token=os.environ.get("HF_TOKEN"))
-     except Exception as e:
-         print(e)
-
-     # upload first pages only
-     try:
-         img_ds = img_ds.filter(lambda x: x["page_number"] == 1)
-         img_ds.push_to_hub(HF_REPO_ID_IMG, "images_first_page", token=os.environ.get("HF_TOKEN"))
-     except Exception as e:
-         print(e)
-
-     try:
-         # push id_to_abstract
-         abstract_ds = Dataset.from_pandas(abstract_df)
-         abstract_ds.push_to_hub(HF_REPO_ID_TXT, "abstracts", token=os.environ.get("HF_TOKEN"))
-
-         # push arxiv_items
-         arxiv_ds = Dataset.from_pandas(contents_df)
-         arxiv_ds.push_to_hub(HF_REPO_ID_TXT, "articles", token=os.environ.get("HF_TOKEN"))
-
-         # push processed_arxiv_ids
-         processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
-         processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
-         processed_arxiv_ids_ds.push_to_hub(HF_REPO_ID_TXT, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
-     except Exception as e:
-         print(e)
-
-
- ########################################################
- ### MAIN
- ########################################################
-
-
- def main():
-     # items = get_zotero_items(debug=True)
-     items = get_zotero_items(debug=False)
-
-     print(f"# of items fetched from zotero: {len(items)}")
-     arxiv_items = get_arxiv_items(items)
-     print(f"# of arxiv papers: {len(arxiv_items)}")
-
-     # get already processed arxiv ids from HF
-     try:
-         existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
-     except Exception as e:
-         print(e)
-         existing_arxiv_ids = []
-
-     existing_arxiv_ids = set(existing_arxiv_ids)
-     print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
-
-     # new arxiv items
-     arxiv_items = [item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids]
-     arxiv_items = fetch_arxiv_htmls(arxiv_items)
-     print(f"# of new arxiv items: {len(arxiv_items)}")
-
-     if len(arxiv_items) == 0:
-         print("No new arxiv items to process")
-         return
-
-     processed_arxiv_ids = set()
-     pbar = tqdm(range(len(arxiv_items)))
-
-     # remove "data" directory if it exists
-     if os.path.exists("data"):
-         try:
-             shutil.rmtree("data")
-         except Exception as e:
-             print(e)
-
-     for item in arxiv_items:
-         # download images --
-         save_arxiv_article_images(item["arxiv_id"])
-
-         # parse html
-         try:
-             item["contents"] = parse_html_content(item["raw_content"])
-         except Exception as e:
-             print(f"Failed to parse html for {item['arxiv_id']}: {e}")
-             item["contents"] = []
-
-         if len(item["contents"]) == 0:
-             print("Extracting from pdf...")
-             md_content = get_pdf_text(item["arxiv_id"])  # fix this
-             item["raw_content"] = md_content
-
-             if md_content:
-                 item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
-             else:
-                 item["contents"] = []
-
-         if len(item["contents"]) > 0:
-             processed_arxiv_ids.add(item["arxiv_id"])
-             if len(item["authors"]) == 0:
-                 item["authors"] = []  # ["unknown"]
-             item["title"] = item["contents"][0]["paper_title"]
-         pbar.update(1)
-     pbar.close()
-
-     # save contents ---
-     processed_arxiv_ids = list(processed_arxiv_ids)
-     print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
-
-     # save abstracts ---
-     id_to_abstract = {}
-     for item in arxiv_items:
-         for entry in item["contents"]:
-             if entry["content_type"] == "abstract":
-                 id_to_abstract[item["arxiv_id"]] = entry["content"]
-                 break
-     print(f"# of abstracts: {len(id_to_abstract)}")
-     abstract_df = pd.Series(id_to_abstract).reset_index().rename(columns={"index": "arxiv_id", 0: "abstract"})
-     print(abstract_df.head())
-
-     # add to existing dataset
-     try:
-         old_abstract_df = load_dataset(HF_REPO_ID_TXT, "abstracts")["train"].to_pandas()
-     except Exception as e:
-         print(e)
-         old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
-     print(old_abstract_df.head())
-
-     abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
-     abstract_df = abstract_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
-
-     # contents
-     contents_df = pd.DataFrame(arxiv_items)
-     print(contents_df.head())
-     try:
-         old_contents_df = load_dataset(HF_REPO_ID_TXT, "articles")["train"].to_pandas()
-     except Exception as e:
-         print(e)
-         old_contents_df = pd.DataFrame(columns=contents_df.columns)
-     if len(old_contents_df) > 0:
-         print(old_contents_df.sample().T)
-
-     contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
-     contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
-
-     # upload to hf
-     processed_arxiv_ids = list(set(processed_arxiv_ids + list(existing_arxiv_ids)))
-     upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)
-
-     # save as local copy
-     os.makedirs("data", exist_ok=True)
-     abstract_df.to_parquet("data/abstracts.parquet")
-     contents_df.to_parquet("data/contents.parquet")
-     srsly.write_json("data/processed_arxiv_ids.json", processed_arxiv_ids)
-
-
- def schedule_periodic_task():
-     """
-     Schedule the main task to run at the user-defined frequency
-     """
-     # main()  # run once initially
-
-     frequency = "daily"  # TODO: env
-     if frequency == "hourly":
-         print("Scheduling tasks to run every hour at the top of the hour")
-         schedule.every().hour.at(":00").do(main)
-     elif frequency == "daily":
-         start_time = "10:00"
-         print("Scheduling tasks to run every day at: {start_time} UTC+00")
-         schedule.every().day.at(start_time).do(main)
-
-     while True:
-         schedule.run_pending()
-         time.sleep(1)
-
-
- if __name__ == "__main__":
-     schedule_periodic_task()
profile_app.py DELETED
@@ -1,52 +0,0 @@
- # import cProfile
- # import multiprocessing
- # import time
-
- # import requests
- # from fasthtml.common import *
-
- # from app import serve, weeks
-
- # PORT = 7860  # Update this to match the port your app is using
-
-
- # def run_server():
- #     serve()  # This should start your FastHTML app
-
-
- # def make_requests():
- #     base_url = f"http://127.0.0.1:{PORT}"
-
- #     # Test home page
- #     try:
- #         requests.get(f"{base_url}/")
- #     except requests.exceptions.RequestException as e:
- #         print(f"Error accessing home page: {e}")
-
- #     n_weeks = 10
- #     for week in weeks[: min(n_weeks, len(weeks))]:
- #         try:
- #             requests.get(f"{base_url}/week/{week}")
- #         except requests.exceptions.RequestException as e:
- #             print(f"Error accessing week {week}: {e}")
-
- #     # Add more requests here to cover other parts of your application
-
-
- # def profile_app():
- #     server_process = multiprocessing.Process(target=run_server)
- #     server_process.start()
-
- #     # Wait for the server to start
- #     time.sleep(60)
-
- #     try:
- #         make_requests()
- #     finally:
- #         server_process.terminate()
- #         server_process.join()
-
-
- # if __name__ == "__main__":
- #     cProfile.run("profile_app()", "profile_output.prof")
- #     print("Profiling complete. Run 'snakeviz profile_output.prof' to view results.")
requirements.txt CHANGED
@@ -1,9 +1,7 @@
fasthtml-hf>=0.1.1
- python-fasthtml>=0.0.8
+ python-fasthtml>=0.5.2
huggingface-hub>=0.20.0
uvicorn>=0.29
- schedule==1.2.0
- supervisor==4.2.5
requests
srsly
python-dotenv
supervisord.conf DELETED
@@ -1,20 +0,0 @@
- [supervisord]
- nodaemon=true
-
- # [program:main]
- # command=python main.py
- # stdout_logfile=/dev/stdout
- # stdout_logfile_maxbytes=0
- # stderr_logfile=/dev/stderr
- # stderr_logfile_maxbytes=0
- # autostart=true
- # autorestart=true
-
- [program:app]
- command=python app.py
- stdout_logfile=/dev/null
- stdout_logfile_maxbytes=0
- stderr_logfile=/dev/stderr
- stderr_logfile_maxbytes=0
- autostart=true
- autorestart=true