rbiswasfc committed
Commit 67be06c · 1 Parent(s): ec9cba2

data ingest

Files changed (7)
  1. .gitignore +4 -0
  2. Dockerfile +0 -0
  3. app.py +0 -0
  4. main.py +435 -0
  5. requirements.txt +0 -0
  6. ruff.toml +3 -0
  7. zotero_refactor.ipynb +746 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ .env
+ *.json
+ data
+ .ipynb_checkpoints
Dockerfile ADDED
File without changes
app.py ADDED
File without changes
main.py ADDED
@@ -0,0 +1,435 @@
+ import os
+ import re
+
+ import dotenv
+ import pandas as pd
+ import requests
+ import srsly
+ from bs4 import BeautifulSoup
+ from datasets import Dataset, load_dataset
+ from huggingface_hub import create_repo, login, whoami
+ from retry import retry
+ from tqdm.auto import tqdm
+
+ dotenv.load_dotenv()
+ login(token=os.environ.get("HF_TOKEN"))
+
+ hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
+ HF_REPO_ID = f"{hf_user}/zotero-answer-ai-articles"
+
+
+ ########################################################
+ ### GET ZOTERO ITEMS
+ ########################################################
+
+
+ @retry(tries=3, delay=8)
+ def _fetch_one_zotero_batch(url, headers, params):
+     """
+     Fetch articles from Zotero API
+     """
+     response = requests.get(url, headers=headers, params=params)
+     response.raise_for_status()
+     return response.json()
+
+
+ def get_zotero_items(debug=False):
+     """
+     fetch items from zotero library
+     """
+
+     GROUP_ID = os.getenv("GROUP_ID")
+     API_KEY = os.getenv("API_KEY")
+     BASE_URL = f"https://api.zotero.org/groups/{GROUP_ID}/items"
+     LIMIT = 100
+
+     headers = {"Zotero-API-Key": API_KEY, "Content-Type": "application/json"}
+
+     items = []
+     start = 0
+
+     i = 1
+     while True:
+         i += 1
+         params = {"limit": LIMIT, "start": start}
+         page_items = _fetch_one_zotero_batch(BASE_URL, headers, params)
+
+         if not page_items:
+             break
+
+         items.extend(page_items)
+         start += LIMIT
+         print(f"# items fetched {len(items)}")
+
+         if debug:
+             if len(items) > 300:
+                 break
+
+     return items
+
+
+ ########################################################
+ ### EXTRACT ARXIV LINKS AND PDFs
+ ########################################################
+
+
+ def get_arxiv_items(items):
+     visited = set()
+
+     arxiv_items = []
+     arxiv_pattern = re.compile(r"arxiv.org/abs/(\d+\.\d+)")
+
+     for item in items:
+         data = item.get("data", {})
+         attachments = item.get("links", {}).get("attachment", {})
+
+         arxiv_url = None
+         pdf_url = None
+
+         if "url" in data and "arxiv.org" in data["url"]:
+             arxiv_match = arxiv_pattern.search(data["url"])
+             if arxiv_match:
+                 arxiv_url = data["url"]
+
+         if attachments:
+             pdf_url = attachments["href"]
+
+         if arxiv_url:
+             arxiv_id = arxiv_url.split("/")[-1]
+             if arxiv_id in visited:
+                 continue
+
+             arxiv_items.append(
+                 {
+                     "arxiv_id": arxiv_id,
+                     "arxiv_url": arxiv_url,
+                     "pdf_url": pdf_url,
+                     "added_by": item["meta"]["createdByUser"]["username"],
+                     "date_added": data.get("dateAdded", ""),
+                 }
+             )
+
+             visited.add(arxiv_id)
+
+     return arxiv_items
+
+
+ @retry(tries=3, delay=15, backoff=2)
+ def fetch_arxiv_html(arxiv_id):
+     url = f"https://ar5iv.labs.arxiv.org/html/{arxiv_id.split('v')[0]}"
+     response = requests.get(url)
+     return response.text if response.status_code == 200 else None
+
+
+ def fetch_arxiv_htmls(arxiv_items):
+     for item in tqdm(arxiv_items):
+         html = fetch_arxiv_html(item["arxiv_id"])
+         if html:
+             item["raw_html"] = html
+         else:
+             print(f"failed to fetch html for {item['arxiv_id']}")
+             item["raw_html"] = "Error"
+
+     return arxiv_items
+
+
+ ########################################################
+ ### PARSE CONTENT FROM ARXIV HTML
+ ########################################################
+
+
+ def parse_html_content(html):
+     """
+     Parse content from arxiv html
+     """
+     arxiv_id_match = re.search(r"\[(\d+\.\d+(v\d+)?)\]", html)
+     arxiv_id = arxiv_id_match.group(1) if arxiv_id_match else None
+     soup = BeautifulSoup(html, "html.parser")
+     result = []
+
+     # Extract paper title
+     try:
+         paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(strip=True)
+     except Exception:
+         paper_title = soup.find("title").get_text(strip=True)
+         paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)
+
+     for math in soup.find_all("math"):
+         math.decompose()
+     for cite in soup.find_all("cite"):
+         cite.decompose()
+
+     # Extract abstract
+     abstract = soup.find("div", class_="ltx_abstract")
+     if abstract:
+         result.append(
+             {
+                 "content": " ".join(p.get_text(strip=True) for p in abstract.find_all("p")).replace(")", ") "),
+                 "title": "Abstract",
+                 "paper_title": paper_title,
+                 "content_type": "abstract",
+             }
+         )
+     # Extract sections
+     sections = soup.find_all("section", class_="ltx_section")
+     for index, section in enumerate(sections):
+         section_title = section.find("h2", class_="ltx_title ltx_title_section")
+         section_title = section_title.get_text(strip=True) if section_title else f"Section {index + 1}"
+         section_content = section.get_text(strip=True).replace(")", ") ")
+
+         content_type = "body"
+         if index == 0:
+             content_type = "introduction"
+         elif index == len(sections) - 1:
+             content_type = "conclusion"
+
+         result.append(
+             {
+                 "content": section_content,
+                 "title": section_title,
+                 "paper_title": paper_title,
+                 "content_type": content_type,
+             }
+         )
+
+     for c in result:
+         c["arxiv_id"] = arxiv_id
+
+     return result
+
+
+ ########################################################
+ ### GET TEXTS FROM PDF & PARSE
+ ########################################################
+
+
+ def get_pdf_text(arxiv_id):
+     url = "http://147.189.194.113:80/extract"  # fix: currently down
+
+     try:
+         response = requests.get(url, params={"arxiv_id": arxiv_id})
+         response = response.json()
+         if "text" in response:
+             return response["text"]
+         return None
+     except Exception as e:
+         print(e)
+         return None
+
+
+ def get_content_type(section_type, section_count):
+     """Determine the content type based on the section type and count"""
+     if section_type == "abstract":
+         return "abstract"
+     elif section_type == "introduction" or section_count == 1:
+         return "introduction"
+     elif section_type == "conclusion" or section_type == "references":
+         return section_type
+     else:
+         return "body"
+
+
+ def get_section_type(title):
+     """Determine the section type based on the title"""
+     title_lower = title.lower()
+     if "abstract" in title_lower:
+         return "abstract"
+     elif "introduction" in title_lower:
+         return "introduction"
+     elif "conclusion" in title_lower:
+         return "conclusion"
+     elif "reference" in title_lower:
+         return "references"
+     else:
+         return "body"
+
+
+ def parse_markdown_content(md_content, arxiv_id):
+     """
+     Parses markdown content to identify and extract sections based on headers.
+     """
+
+     lines = md_content.split("\n")
+     parsed = []
+     current_section = None
+     content = []
+     paper_title = None
+     current_title = None
+
+     # identify sections based on headers
+     for line in lines:
+         if line.startswith("#"):
+             if paper_title is None:
+                 paper_title = line.lstrip("#").strip()
+                 continue
+             if content:
+                 if current_title:
+                     parsed.append(
+                         {
+                             "content": " ".join(content),
+                             "title": current_title,
+                             "paper_title": paper_title,
+                             "content_type": get_content_type(current_section, len(parsed)),
+                             "arxiv_id": arxiv_id,
+                         }
+                     )
+                 content = []
+
+             current_title = line.lstrip("#").lstrip("#").lstrip()
+             if "bit" not in current_title:
+                 current_title = (
+                     current_title.lstrip("123456789")
+                     .lstrip()
+                     .lstrip(".")
+                     .lstrip()
+                     .lstrip("123456789")
+                     .lstrip()
+                     .lstrip(".")
+                     .lstrip()
+                 )
+             current_section = get_section_type(current_title)
+
+         else:
+             content.append(line)
+
+     # Add the last section
+     if content and current_title:
+         parsed.append(
+             {
+                 "content": " ".join(content).replace(")", ") "),
+                 "title": current_title,
+                 "paper_title": paper_title,
+                 "content_type": get_content_type(current_section, len(parsed)),
+                 "arxiv_id": arxiv_id,
+             }
+         )
+
+     return parsed
+
+
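# A minimal usage sketch of parse_markdown_content above (illustrative only; the
# sample markdown string and the arxiv id "2401.00001" are made-up placeholders):
# sample_md = "# A Paper\n## Abstract\nShort summary.\n## 1. Introduction\nSetup and goals."
# sections = parse_markdown_content(sample_md, "2401.00001")
# -> yields two entries, one with content_type "abstract" and one with "introduction"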
+ ########################################################
+ ### HF UPLOAD
+ ########################################################
+
+
+ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
+     repo_id = HF_REPO_ID
+     create_repo(
+         repo_id=repo_id,
+         token=os.environ.get("HF_TOKEN"),
+         private=True,
+         repo_type="dataset",
+         exist_ok=True,
+     )
+
+     # push id_to_abstract
+     abstract_ds = Dataset.from_pandas(abstract_df)
+     abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
+
+     # push arxiv_items
+     arxiv_ds = Dataset.from_pandas(contents_df)
+     arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
+
+     # push processed_arxiv_ids
+     processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
+     processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
+     processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
+
+
+ ########################################################
+ ### MAIN
+ ########################################################
+
+
+ def main():
+     items = get_zotero_items(debug=True)
+     print(f"# of items fetched from zotero: {len(items)}")
+     arxiv_items = get_arxiv_items(items)
+     print(f"# of arxiv papers: {len(arxiv_items)}")
+
+     # get already processed arxiv ids from HF
+     try:
+         existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"]["arxiv_id"]
+     except Exception as e:
+         print(e)
+         try:
+             existing_arxiv_ids = srsly.read_json("data/processed_arxiv_ids.json")
+         except Exception as e:
+             print(e)
+             existing_arxiv_ids = []
+     existing_arxiv_ids = set(existing_arxiv_ids)
+     print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
+
+     # new arxiv items
+     arxiv_items = [item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids]
+     arxiv_items = fetch_arxiv_htmls(arxiv_items)
+     print(f"# of new arxiv items: {len(arxiv_items)}")
+
+     processed_arxiv_ids = set()
+     for item in arxiv_items:
+         try:
+             item["contents"] = parse_html_content(item["raw_html"])
+             processed_arxiv_ids.add(item["arxiv_id"])
+         except Exception as e:
+             print(f"Failed to parse html for {item['arxiv_id']}: {e}")
+             print("Extracting from pdf...")
+             md_content = get_pdf_text(item["arxiv_id"])
+             if md_content:
+                 item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
+                 processed_arxiv_ids.add(item["arxiv_id"])
+             else:
+                 item["contents"] = []
+
+     # processed ids ---
+     processed_arxiv_ids = list(processed_arxiv_ids)
+     print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
+
+     # save abstracts ---
+     id_to_abstract = {}
+     for item in arxiv_items:
+         for entry in item["contents"]:
+             if entry["content_type"] == "abstract":
+                 id_to_abstract[item["arxiv_id"]] = entry["content"]
+                 break
+     print(f"# of abstracts: {len(id_to_abstract)}")
+     abstract_df = pd.Series(id_to_abstract).reset_index().rename(columns={"index": "arxiv_id", 0: "abstract"})
+     print(abstract_df.head())
+
+     # add to existing dataset
+     try:
+         old_abstract_df = load_dataset(HF_REPO_ID, "abstracts")["train"].to_pandas()
+     except Exception as e:
+         print(e)
+         old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
+     print(old_abstract_df.head())
+
+     abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
+     abstract_df = abstract_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
+
+     # contents
+     contents_df = pd.DataFrame(arxiv_items)
+     print(contents_df.head())
+     try:
+         old_contents_df = load_dataset(HF_REPO_ID, "articles")["train"].to_pandas()
+     except Exception as e:
+         print(e)
+         old_contents_df = pd.DataFrame(columns=contents_df.columns)
+     if len(old_contents_df) > 0:
+         print(old_contents_df.sample().T)
+
+     contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
+     contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
+
+     # upload to hf
+     processed_arxiv_ids = list(set(processed_arxiv_ids + list(existing_arxiv_ids)))  # keep previously processed ids so the pushed split stays complete
+     upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)
+
+     # save as local copy
+     os.makedirs("data", exist_ok=True)
+     abstract_df.to_parquet("data/abstracts.parquet")
+     contents_df.to_parquet("data/contents.parquet")
+     srsly.write_json("data/processed_arxiv_ids.json", processed_arxiv_ids)
+
+
+ if __name__ == "__main__":
+     main()
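
Once main.py has run, the three dataset configs it pushes ("abstracts", "articles" and "processed_arxiv_ids") can be read back for a quick sanity check. A minimal sketch, assuming the session is already authenticated against the private repo (e.g. via the same login(token=...) call used in main.py); the user name below is a placeholder for the hf_user derived in the script:

from datasets import load_dataset

repo_id = "<hf-user>/zotero-answer-ai-articles"  # mirrors HF_REPO_ID in main.py

abstracts = load_dataset(repo_id, "abstracts")["train"]
articles = load_dataset(repo_id, "articles")["train"]
processed_ids = load_dataset(repo_id, "processed_arxiv_ids")["train"]["arxiv_id"]

print(f"abstracts: {len(abstracts)}, articles: {len(articles)}, processed ids: {len(processed_ids)}")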
requirements.txt ADDED
File without changes
ruff.toml ADDED
@@ -0,0 +1,3 @@
+ line-length = 128
+ target-version = "py311"
+ ignore = ["E402"]
zotero_refactor.ipynb ADDED
@@ -0,0 +1,746 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "bfd057e4",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "%ai reset"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "1c3aa2dc",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "%%ai -c\n",
21
+ "I'm refactoring the following code to ingest and process items from a zotero lib.\n",
22
+ "\n",
23
+ "```python\n",
24
+ "import re\n",
25
+ "\n",
26
+ "import requests\n",
27
+ "import srsly\n",
28
+ "from bs4 import BeautifulSoup\n",
29
+ "from retry import retry\n",
30
+ "from tqdm.auto import tqdm\n",
31
+ "\n",
32
+ "\n",
33
+ "\n",
34
+ "def get_pdf_text(arxiv_id):\n",
35
+ " url = \"http://147.189.194.113:80/extract\"\n",
36
+ "\n",
37
+ " try:\n",
38
+ " response = requests.get(url, params={\"arxiv_id\": arxiv_id})\n",
39
+ " response = response.json()\n",
40
+ " if \"text\" in response:\n",
41
+ " return response['text']\n",
42
+ " return None\n",
43
+ " except Exception as e:\n",
44
+ " print(e)\n",
45
+ " return None\n",
46
+ " \n",
47
+ "\n",
48
+ "\n",
49
+ "def get_zotero_items():\n",
50
+ " \"\"\"fetch all items from zotero library\"\"\"\n",
51
+ "\n",
52
+ " GROUP_ID = \"5004697\"\n",
53
+ " API_KEY = \"HFfTT8uuL7z9tN0L5VCHe7Ow\"\n",
54
+ " BASE_URL = f\"https://api.zotero.org/groups/{GROUP_ID}/items\"\n",
55
+ " LIMIT = 100\n",
56
+ "\n",
57
+ " headers = {\"Zotero-API-Key\": API_KEY, \"Content-Type\": \"application/json\"}\n",
58
+ "\n",
59
+ " items = []\n",
60
+ " start = 0\n",
61
+ "\n",
62
+ " ITEM_TYPES = [\n",
63
+ " \"journalArticle\",\n",
64
+ " \"report\",\n",
65
+ " \"conferencePaper\",\n",
66
+ " \"manuscript\",\n",
67
+ " \"book\",\n",
68
+ " \"thesis\",\n",
69
+ " ] # Replace or extend with desired item types\n",
70
+ "\n",
71
+ " i = 1\n",
72
+ " while True:\n",
73
+ " i += 1\n",
74
+ " params = {\"limit\": LIMIT, \"start\": start}\n",
75
+ " response = requests.get(BASE_URL, headers=headers, params=params)\n",
76
+ " page_items = response.json()\n",
77
+ " if ITEM_TYPES:\n",
78
+ " params[\"itemType\"] = \",\".join(ITEM_TYPES)\n",
79
+ "\n",
80
+ " if not page_items:\n",
81
+ " break\n",
82
+ "\n",
83
+ " items.extend(page_items)\n",
84
+ " start += LIMIT\n",
85
+ " print(f\"# items fetched {len(items)}\")\n",
86
+ " # break # debug\n",
87
+ "\n",
88
+ " return items\n",
89
+ "\n",
90
+ "\n",
91
+ "def extract_arxiv_links_and_pdfs(items):\n",
92
+ " \"\"\"Extract mapping between arxiv & pdf urls for a list of zotero items\n",
93
+ "\n",
94
+ " Args:\n",
95
+ " items (list): list of zotero items\n",
96
+ "\n",
97
+ " Returns:\n",
98
+ " arxiv_info (dict): dictionary with arxiv links as keys and pdf links as values\n",
99
+ " \"\"\"\n",
100
+ " arxiv_info = {}\n",
101
+ " arxiv_pattern = re.compile(r\"arxiv.org/abs/(\\d+\\.\\d+)\")\n",
102
+ "\n",
103
+ " for item in items:\n",
104
+ " data = item.get(\"data\", {})\n",
105
+ " attachments = item.get(\"links\", {}).get(\"attachment\", {})\n",
106
+ " arxiv_url = None\n",
107
+ " pdf_url = None\n",
108
+ "\n",
109
+ " if \"url\" in data and \"arxiv.org\" in data[\"url\"]:\n",
110
+ " arxiv_match = arxiv_pattern.search(data[\"url\"])\n",
111
+ " if arxiv_match:\n",
112
+ " arxiv_url = data[\"url\"]\n",
113
+ "\n",
114
+ " if attachments:\n",
115
+ " pdf_url = attachments[\"href\"]\n",
116
+ "\n",
117
+ " if arxiv_url:\n",
118
+ " arxiv_info[arxiv_url] = pdf_url\n",
119
+ "\n",
120
+ " return arxiv_info\n",
121
+ "\n",
122
+ "\n",
123
+ "@retry(tries=3, delay=15, backoff=2)\n",
124
+ "def get_arxiv_with_retry(arxiv_id):\n",
125
+ " return requests.get(f\"https://ar5iv.labs.arxiv.org/html/{arxiv_id}\")\n",
126
+ "\n",
127
+ "\n",
128
+ "def get_arxiv(arxiv_ids):\n",
129
+ " \"\"\"\n",
130
+ " Fetch html version of arxiv papers\n",
131
+ " \"\"\"\n",
132
+ " html5ivs = {}\n",
133
+ " for i, arxiv_id in tqdm(enumerate(arxiv_ids)):\n",
134
+ " if \"v\" in arxiv_id:\n",
135
+ " arxiv_id = arxiv_id.split(\"v\")[0]\n",
136
+ " response = get_arxiv_with_retry(arxiv_id)\n",
137
+ " html5ivs[arxiv_id] = response.text\n",
138
+ " if response.status_code != 200:\n",
139
+ " html5ivs[arxiv_id] = \"ERROR\"\n",
140
+ " return html5ivs\n",
141
+ "\n",
142
+ "\n",
143
+ "def extract_html_content(html):\n",
144
+ " \"\"\"\n",
145
+ " Extract content from arxiv html\n",
146
+ " \"\"\"\n",
147
+ " arxiv_id_match = re.search(r\"\\[(\\d+\\.\\d+(v\\d+)?)\\]\", html)\n",
148
+ " arxiv_id = arxiv_id_match.group(1) if arxiv_id_match else None\n",
149
+ " soup = BeautifulSoup(html, \"html.parser\")\n",
150
+ " result = []\n",
151
+ "\n",
152
+ " # Extract paper title\n",
153
+ " paper_title = soup.find(\"h1\", class_=\"ltx_title ltx_title_document\").get_text(strip=True)\n",
154
+ "\n",
155
+ " for math in soup.find_all(\"math\"):\n",
156
+ " math.decompose()\n",
157
+ " for cite in soup.find_all(\"cite\"):\n",
158
+ " cite.decompose()\n",
159
+ "\n",
160
+ " # Extract abstract\n",
161
+ " abstract = soup.find(\"div\", class_=\"ltx_abstract\")\n",
162
+ " if abstract:\n",
163
+ " result.append(\n",
164
+ " {\n",
165
+ " \"content\": \" \".join(p.get_text(strip=True) for p in abstract.find_all(\"p\")).replace(\")\", \") \"),\n",
166
+ " \"title\": \"Abstract\",\n",
167
+ " \"paper_title\": paper_title,\n",
168
+ " \"content_type\": \"abstract\",\n",
169
+ " }\n",
170
+ " )\n",
171
+ "\n",
172
+ " # Extract introduction\n",
173
+ " intro = soup.find(\"section\", id=\"S1\")\n",
174
+ " if intro:\n",
175
+ " section_title = \"Introduction\"\n",
176
+ " paragraphs = intro.find_all(\"div\", class_=\"ltx_para\")\n",
177
+ " for para in paragraphs:\n",
178
+ " result.append(\n",
179
+ " {\n",
180
+ " \"content\": para.get_text(strip=True).replace(\")\", \") \"),\n",
181
+ " \"title\": section_title,\n",
182
+ " \"paper_title\": paper_title,\n",
183
+ " \"content_type\": \"introduction\",\n",
184
+ " }\n",
185
+ " )\n",
186
+ "\n",
187
+ " # Extract other sections (excluding introduction and conclusion)\n",
188
+ " sections = soup.find_all(\"section\", class_=\"ltx_section\")\n",
189
+ " for section in sections[1:-1]:\n",
190
+ " section_title = section.find(\"h2\", class_=\"ltx_title ltx_title_section\").get_text(strip=True)\n",
191
+ " paragraphs = section.find_all(\"div\", class_=\"ltx_para\")\n",
192
+ " for para in paragraphs:\n",
193
+ " result.append(\n",
194
+ " {\n",
195
+ " \"content\": para.get_text(strip=True).replace(\")\", \") \"),\n",
196
+ " \"title\": section_title,\n",
197
+ " \"paper_title\": paper_title,\n",
198
+ " \"content_type\": \"body\",\n",
199
+ " }\n",
200
+ " )\n",
201
+ "\n",
202
+ " # Extract conclusion\n",
203
+ " conclusion = soup.find(\"section\", id=\"S4\")\n",
204
+ " if conclusion:\n",
205
+ " result.append(\n",
206
+ " {\n",
207
+ " \"content\": \" \".join(\n",
208
+ " p.get_text(strip=True) for p in conclusion.find_all(\"div\", class_=\"ltx_para\")\n",
209
+ " ).replace(\")\", \") \"),\n",
210
+ " \"title\": conclusion.find(\"h2\", class_=\"ltx_title ltx_title_section\").get_text(strip=True),\n",
211
+ " \"paper_title\": paper_title,\n",
212
+ " \"content_type\": \"conclusion\",\n",
213
+ " }\n",
214
+ " )\n",
215
+ "\n",
216
+ " for c in result:\n",
217
+ " c[\"arxiv_id\"] = arxiv_id\n",
218
+ "\n",
219
+ " return result\n",
220
+ "\n",
221
+ "\n",
222
+ "def pool_html_content(parsed, pool_size=3, step_size=2):\n",
223
+ " \"\"\"\n",
224
+ " Pool together individual bits of contents from \"body\" and \"introduction\" sections where the \"title\" is the same.\n",
225
+ " It pools them 'pool_size' by 'pool_size' as a sliding window with a step size of 'step_size'.\n",
226
+ " Keeps all other attributes for each element and includes abstract and conclusion without pooling.\n",
227
+ "\n",
228
+ " Args:\n",
229
+ " parsed (list): List of dictionaries containing parsed content.\n",
230
+ " pool_size (int): Size of the pool for the sliding window.\n",
231
+ " step_size (int): Step size for the sliding window.\n",
232
+ "\n",
233
+ " Returns:\n",
234
+ " list: A list of pooled contents including non-pooled abstract and conclusion.\n",
235
+ " \"\"\"\n",
236
+ " pooled_contents = []\n",
237
+ " i = 0\n",
238
+ " while i < len(parsed) - pool_size + 1:\n",
239
+ " current_window = parsed[i : i + pool_size]\n",
240
+ " # Check if all entries in the current window have the same title and are from 'body' or 'introduction'\n",
241
+ " if all(\n",
242
+ " entry[\"content_type\"] in [\"body\", \"introduction\"] and entry[\"title\"] == current_window[0][\"title\"]\n",
243
+ " for entry in current_window\n",
244
+ " ):\n",
245
+ " pooled_content = \"\\n\".join(entry[\"content\"] for entry in current_window)\n",
246
+ " if \"bit\" not in current_window[0][\"title\"][:5]:\n",
247
+ " title = current_window[0][\"title\"].lstrip(\"0123456789\")\n",
248
+ " pooled_content = f\"{current_window[0]['paper_title']}\\n{title}\\n\\n\" + pooled_content\n",
249
+ " pooled_entry = {\n",
250
+ " \"title\": title,\n",
251
+ " \"text\": pooled_content,\n",
252
+ " \"paper_title\": current_window[0][\"paper_title\"],\n",
253
+ " \"content_type\": current_window[0][\"content_type\"],\n",
254
+ " \"arxiv_id\": current_window[0][\"arxiv_id\"],\n",
255
+ " }\n",
256
+ " pooled_contents.append(pooled_entry)\n",
257
+ " i += step_size\n",
258
+ "\n",
259
+ " # Include abstract and conclusion without pooling\n",
260
+ " for entry in parsed:\n",
261
+ " if entry[\"content_type\"] in [\"abstract\", \"conclusion\"]:\n",
262
+ " if \"content\" in entry:\n",
263
+ " entry[\"text\"] = entry[\"content\"]\n",
264
+ " del entry[\"content\"]\n",
265
+ " if \"bit\" not in entry[\"title\"][:5]:\n",
266
+ " entry[\"title\"] = entry[\"title\"].lstrip(\"0123456789\")\n",
267
+ " entry[\"text\"] = f\"{entry['paper_title']}\\n{entry['title']}\\n\\n\" + entry[\"text\"]\n",
268
+ " pooled_contents.append(entry)\n",
269
+ "\n",
270
+ " return pooled_contents\n",
271
+ "\n",
272
+ "\n",
273
+ "def parse_markdown_content(md_content, arxiv_id):\n",
274
+ " \"\"\"\n",
275
+ " Parses markdown content to identify and extract sections based on headers.\n",
276
+ " \"\"\"\n",
277
+ "\n",
278
+ " lines = md_content.split(\"\\n\")\n",
279
+ " parsed = []\n",
280
+ " current_section = None\n",
281
+ " content = []\n",
282
+ " paper_title = None\n",
283
+ " current_title = None\n",
284
+ "\n",
285
+ " # Identify sections based on headers\n",
286
+ " for line in lines:\n",
287
+ " if line.startswith(\"#\"):\n",
288
+ " if paper_title is None:\n",
289
+ " paper_title = line.lstrip(\"#\").lstrip(\"#\").strip()\n",
290
+ " continue\n",
291
+ " if content:\n",
292
+ " if not current_title:\n",
293
+ " print(\"skipping pre-abstract!\")\n",
294
+ " parsed.append(\n",
295
+ " {\n",
296
+ " \"title\": current_title,\n",
297
+ " \"text\": content,\n",
298
+ " \"paper_title\": paper_title,\n",
299
+ " \"content_type\": \"body\"\n",
300
+ " if current_section not in [\"Abstract\", \"Introduction\", \"Conclusion\"]\n",
301
+ " else current_section.lower(),\n",
302
+ " \"arxiv_id\": arxiv_id,\n",
303
+ " }\n",
304
+ " )\n",
305
+ " content = []\n",
306
+ "\n",
307
+ " if \"Introduction\" in line:\n",
308
+ " current_section = \"Introduction\"\n",
309
+ " elif \"Abstract\" in line:\n",
310
+ " current_section = \"Abstract\"\n",
311
+ " elif \"Conclusion\" in line:\n",
312
+ " current_section = \"Conclusion\"\n",
313
+ " elif \"Reference\" in line:\n",
314
+ " break\n",
315
+ " else:\n",
316
+ " current_section = \"body\"\n",
317
+ "\n",
318
+ " current_title = line.lstrip(\"#\").lstrip(\"#\").lstrip()\n",
319
+ "\n",
320
+ " if \"bit\" not in current_title:\n",
321
+ " current_title = (\n",
322
+ " current_title.lstrip(\"123456789\")\n",
323
+ " .lstrip()\n",
324
+ " .lstrip(\".\")\n",
325
+ " .lstrip()\n",
326
+ " .lstrip(\"123456789\")\n",
327
+ " .lstrip()\n",
328
+ " .lstrip(\".\")\n",
329
+ " .lstrip()\n",
330
+ " )\n",
331
+ " else:\n",
332
+ " content.append(line)\n",
333
+ "\n",
334
+ " if content:\n",
335
+ " parsed.append(\n",
336
+ " {\n",
337
+ " \"title\": current_section,\n",
338
+ " \"text\": content,\n",
339
+ " \"paper_title\": paper_title,\n",
340
+ " \"content_type\": \"body\"\n",
341
+ " if current_section not in [\"Abstract\", \"Introduction\", \"Conclusion\"]\n",
342
+ " else current_section.lower(),\n",
343
+ " \"arxiv_id\": arxiv_id,\n",
344
+ " }\n",
345
+ " )\n",
346
+ "\n",
347
+ " return parsed\n",
348
+ "\n",
349
+ "\n",
350
+ "def pool_markdown(parsed, pool_size=3, step_size=2):\n",
351
+ " \"\"\"\n",
352
+ " pool together markdown contents with sliding window of size 'pool_size' and step size of 'step_size'.\n",
353
+ "\n",
354
+ " Args:\n",
355
+ " parsed (list): List of dictionaries containing parsed content.\n",
356
+ " pool_size (int): Size of the pool for the sliding window.\n",
357
+ " step_size (int): Step size for the sliding window.\n",
358
+ "\n",
359
+ " Returns:\n",
360
+ " list: A list of pooled contents including non-pooled abstract and conclusion.\n",
361
+ " \"\"\"\n",
362
+ " cleaned_contents = []\n",
363
+ " for section in parsed:\n",
364
+ " cleaned_lines = [line for line in section[\"text\"] if line and \"|\" not in line and len(line) >= 40]\n",
365
+ " section[\"text\"] = cleaned_lines\n",
366
+ " cleaned_contents.append(section)\n",
367
+ "\n",
368
+ " pooled_contents = []\n",
369
+ " for section in cleaned_contents:\n",
370
+ " if section[\"content_type\"] in [\"body\", \"introduction\"]:\n",
371
+ " i = 0\n",
372
+ " while i <= len(section[\"text\"]) - pool_size:\n",
373
+ " pooled_content = \"\\n\".join(section[\"text\"][i : i + pool_size])\n",
374
+ " pooled_content = f\"{section['paper_title']}\\n{section['title']}\\n\\n\" + pooled_content\n",
375
+ " pooled_entry = {\n",
376
+ " \"title\": section[\"title\"],\n",
377
+ " \"text\": pooled_content,\n",
378
+ " \"paper_title\": section[\"paper_title\"],\n",
379
+ " \"content_type\": section[\"content_type\"],\n",
380
+ " \"arxiv_id\": section[\"arxiv_id\"],\n",
381
+ " }\n",
382
+ " pooled_contents.append(pooled_entry)\n",
383
+ " i += step_size\n",
384
+ " else:\n",
385
+ " section[\"text\"] = f\"{section['paper_title']}\\n\" + \"\\n\".join(section[\"text\"])\n",
386
+ " pooled_contents.append(section)\n",
387
+ " return pooled_contents\n",
388
+ "\n",
389
+ "\n",
390
+ "def clean_and_pool_markdown_content(md, arxiv_id):\n",
391
+ " md = parse_markdown_content(md, arxiv_id)\n",
392
+ " return pool_markdown(md)\n",
393
+ "\n",
394
+ "\n",
395
+ "if __name__ == \"__main__\":\n",
396
+ " items = get_zotero_items()\n",
397
+ " print(f\"# of items fetched from zotero: {len(items)}\")\n",
398
+ " arxiv_info = extract_arxiv_links_and_pdfs(items)\n",
399
+ "\n",
400
+ " # extract arxiv information ---\n",
401
+ " arxiv_ids = set()\n",
402
+ " for arxiv_url, pdf_url in arxiv_info.items():\n",
403
+ " arxiv_ids.add(arxiv_url.split(\"/\")[-1])\n",
404
+ " print(f\"# of arxiv papers: {len(arxiv_ids)}\")\n",
405
+ "\n",
406
+ " html5ivs = get_arxiv(arxiv_ids)\n",
407
+ "\n",
408
+ " broken = []\n",
409
+ " extracted = {}\n",
410
+ " extracted_from_pdf = {}\n",
411
+ " broken_tex = {}\n",
412
+ "\n",
413
+ " pbar = tqdm(range(len(html5ivs)))\n",
414
+ " for arxiv_id, x in html5ivs.items():\n",
415
+ " try:\n",
416
+ " extracted[arxiv_id] = extract_html_content(x)\n",
417
+ " except Exception as e:\n",
418
+ " # print(e)\n",
419
+ " print(f'getting pdf text for {arxiv_id}...')\n",
420
+ " pdf_text = get_pdf_text(arxiv_id) \n",
421
+ " if pdf_text is not None:\n",
422
+ " extracted_from_pdf[arxiv_id] = pdf_text # get_pdf_text(arxiv_id) # <- original\n",
423
+ " else:\n",
424
+ " broken.append(arxiv_id)\n",
425
+ " pbar.update(1)\n",
426
+ " pbar.close()\n",
427
+ "\n",
428
+ " print(f\"# of arxiv papers extracted: {len(extracted)}\")\n",
429
+ " print(f\"# of arxiv papers extracted from pdf: {len(extracted_from_pdf)}\")\n",
430
+ " print(f\"# of broken links: {len(broken)}\")\n",
431
+ "\n",
432
+ " pooled = [pool_html_content(x) for _, x in extracted.items()]\n",
433
+ " pooled += [\n",
434
+ " clean_and_pool_markdown_content(x, arxiv_id) for arxiv_id, x in extracted_from_pdf.items()\n",
435
+ " ] # No pdf content yet, no op for now\n",
436
+ " flat_pool = [item for sublist in pooled for item in sublist]\n",
437
+ "\n",
438
+ " id_to_abstract = {}\n",
439
+ " for doc in pooled:\n",
440
+ " for entry in doc:\n",
441
+ " if entry[\"content_type\"] == \"abstract\":\n",
442
+ " id_to_abstract[entry[\"arxiv_id\"]] = entry[\"text\"]\n",
443
+ " break\n",
444
+ "\n",
445
+ " srsly.write_json(\"id_to_abstract.json\", id_to_abstract)\n",
446
+ " srsly.write_json(\"flatpool.json\", flat_pool)\n",
447
+ "```\n",
448
+ "go through it and reply OK."
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "markdown",
453
+ "id": "24e7ef23",
454
+ "metadata": {},
455
+ "source": [
456
+ "OK. I've reviewed the code you provided for ingesting and processing items from a Zotero library."
457
+ ]
458
+ },
459
+ {
460
+ "cell_type": "markdown",
461
+ "id": "84628bb2",
462
+ "metadata": {},
463
+ "source": [
464
+ "OK. I've reviewed the code you provided for ingesting and processing items from a Zotero library."
465
+ ]
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "execution_count": 3,
470
+ "id": "bada6931",
471
+ "metadata": {},
472
+ "outputs": [],
473
+ "source": [
474
+ "%%ai\n",
475
+ "Let's work step by step. I currently have\n",
476
+ "\n",
477
+ "```\n",
478
+ "import os\n",
479
+ "import re\n",
480
+ "\n",
481
+ "import dotenv\n",
482
+ "import requests\n",
483
+ "from bs4 import BeautifulSoup\n",
484
+ "from retry import retry\n",
485
+ "from tqdm.auto import tqdm\n",
486
+ "\n",
487
+ "dotenv.load_dotenv()\n",
488
+ "\n",
489
+ "\n",
490
+ "########################################################\n",
491
+ "### GET ZOTERO ITEMS\n",
492
+ "########################################################\n",
493
+ "@retry(tries=3, delay=8)\n",
494
+ "def _fetch_one_zotero_batch(url, headers, params):\n",
495
+ " \"\"\"\n",
496
+ " Fetch articles from Zotero API\n",
497
+ " \"\"\"\n",
498
+ " response = requests.get(url, headers=headers, params=params)\n",
499
+ " response.raise_for_status()\n",
500
+ " return response.json()\n",
501
+ "\n",
502
+ "\n",
503
+ "def get_zotero_items(debug=False):\n",
504
+ " \"\"\"\n",
505
+ " fetch items from zotero library\n",
506
+ " \"\"\"\n",
507
+ "\n",
508
+ " GROUP_ID = os.getenv(\"GROUP_ID\")\n",
509
+ " API_KEY = os.getenv(\"API_KEY\")\n",
510
+ " BASE_URL = f\"https://api.zotero.org/groups/{GROUP_ID}/items\"\n",
511
+ " LIMIT = 100\n",
512
+ "\n",
513
+ " headers = {\"Zotero-API-Key\": API_KEY, \"Content-Type\": \"application/json\"}\n",
514
+ "\n",
515
+ " items = []\n",
516
+ " start = 0\n",
517
+ "\n",
518
+ " i = 1\n",
519
+ " while True:\n",
520
+ " i += 1\n",
521
+ " params = {\"limit\": LIMIT, \"start\": start}\n",
522
+ " page_items = _fetch_one_zotero_batch(BASE_URL, headers, params)\n",
523
+ "\n",
524
+ " if not page_items:\n",
525
+ " break\n",
526
+ "\n",
527
+ " items.extend(page_items)\n",
528
+ " start += LIMIT\n",
529
+ " print(f\"# items fetched {len(items)}\")\n",
530
+ "\n",
531
+ " if debug:\n",
532
+ " break\n",
533
+ "\n",
534
+ " return items\n",
535
+ "\n",
536
+ "\n",
537
+ "########################################################\n",
538
+ "### EXTRACT ARXIV LINKS AND PDFs\n",
539
+ "########################################################\n",
540
+ "def get_arxiv_items(items):\n",
541
+ " arxiv_items = []\n",
542
+ " arxiv_pattern = re.compile(r\"arxiv.org/abs/(\\d+\\.\\d+)\")\n",
543
+ "\n",
544
+ " for item in items:\n",
545
+ " data = item.get(\"data\", {})\n",
546
+ " attachments = item.get(\"links\", {}).get(\"attachment\", {})\n",
547
+ "\n",
548
+ " arxiv_url = None\n",
549
+ " pdf_url = None\n",
550
+ "\n",
551
+ " if \"url\" in data and \"arxiv.org\" in data[\"url\"]:\n",
552
+ " arxiv_match = arxiv_pattern.search(data[\"url\"])\n",
553
+ " if arxiv_match:\n",
554
+ " arxiv_url = data[\"url\"]\n",
555
+ "\n",
556
+ " if attachments:\n",
557
+ " pdf_url = attachments[\"href\"]\n",
558
+ "\n",
559
+ " if arxiv_url:\n",
560
+ " arxiv_items.append(\n",
561
+ " {\n",
562
+ " \"arxiv_id\": arxiv_url.split(\"/\")[-1],\n",
563
+ " \"arxiv_url\": arxiv_url,\n",
564
+ " \"pdf_url\": pdf_url,\n",
565
+ " \"added_by\": item[\"meta\"][\"createdByUser\"][\"username\"],\n",
566
+ " \"date_added\": data.get(\"dateAdded\", \"\"),\n",
567
+ " }\n",
568
+ " )\n",
569
+ "\n",
570
+ " return arxiv_items\n",
571
+ "\n",
572
+ "\n",
573
+ "@retry(tries=3, delay=15, backoff=2)\n",
574
+ "def fetch_arxiv_html(arxiv_id):\n",
575
+ " url = f\"https://ar5iv.labs.arxiv.org/html/{arxiv_id.split('v')[0]}\"\n",
576
+ " response = requests.get(url)\n",
577
+ " return response.text if response.status_code == 200 else None\n",
578
+ "\n",
579
+ "\n",
580
+ "def get_arxiv_html(arxiv_items):\n",
581
+ " for item in tqdm(arxiv_items):\n",
582
+ " html = fetch_arxiv_html(item[\"arxiv_id\"])\n",
583
+ " if html:\n",
584
+ " item[\"raw_html\"] = html\n",
585
+ " else:\n",
586
+ " print(f\"failed to fetch html for {item['arxiv_id']}\")\n",
587
+ "\n",
588
+ " return arxiv_items\n",
589
+ "\n",
590
+ "\n",
591
+ "if __name__ == \"__main__\":\n",
592
+ " items = get_zotero_items(debug=True)\n",
593
+ " print(f\"# of items fetched from zotero: {len(items)}\")\n",
594
+ " arxiv_items = get_arxiv_items(items)\n",
595
+ " print(f\"# of arxiv papers: {len(arxiv_items)}\")\n",
596
+ " arxiv_htmls = get_arxiv_html(arxiv_items) # raw html files\n",
597
+ "--\n",
598
+ "review for any mistake until now. What shall we do next."
599
+ ]
600
+ },
601
+ {
602
+ "cell_type": "markdown",
603
+ "id": "60ab1dfc",
604
+ "metadata": {},
605
+ "source": [
606
+ "The code looks good so far. You've implemented functions to fetch items from Zotero, extract ArXiv-related items, and fetch the HTML content for ArXiv papers. \n",
607
+ "\n",
608
+ "Next steps could include:\n",
609
+ "\n",
610
+ "1. Parse the HTML content to extract structured information from the papers.\n",
611
+ "2. Implement error handling for failed HTML fetches.\n",
612
+ "3. Add a function to extract text from PDFs for papers where HTML fetch fails.\n",
613
+ "4. Store the extracted data in a suitable format (e.g., JSON, database).\n",
614
+ "5. Implement content pooling as in your original code.\n",
615
+ "\n",
616
+ "Which of these would you like to focus on first?"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 5,
622
+ "id": "696318f6",
623
+ "metadata": {},
624
+ "outputs": [],
625
+ "source": [
626
+ "%%aip\n",
627
+ "Let's work on step 1. Let's follow the original code as much as possible.\n",
628
+ "Note: these are arxiv htmls. Let's also keep full body text."
629
+ ]
630
+ },
631
+ {
632
+ "cell_type": "code",
633
+ "execution_count": 6,
634
+ "id": "e7e4a5a6",
635
+ "metadata": {},
636
+ "outputs": [
637
+ {
638
+ "ename": "NameError",
639
+ "evalue": "name 'arxiv_htmls' is not defined",
640
+ "output_type": "error",
641
+ "traceback": [
642
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
643
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
644
+ "Cell \u001b[0;32mIn[6], line 18\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess_arxiv_items\u001b[39m(arxiv_items):\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\u001b[38;5;28mdict\u001b[39m(item, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mextract_html_content(item[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraw_html\u001b[39m\u001b[38;5;124m\"\u001b[39m])) \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m arxiv_items \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraw_html\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m item]\n\u001b[0;32m---> 18\u001b[0m processed_items \u001b[38;5;241m=\u001b[39m process_arxiv_items(\u001b[43marxiv_htmls\u001b[49m)\n",
645
+ "\u001b[0;31mNameError\u001b[0m: name 'arxiv_htmls' is not defined"
646
+ ]
647
+ }
648
+ ],
649
+ "source": [
650
+ "def extract_html_content(html):\n",
651
+ " soup = BeautifulSoup(html, \"html.parser\")\n",
652
+ " arxiv_id = re.search(r\"\\[(\\d+\\.\\d+(v\\d+)?)\\]\", html).group(1) if re.search(r\"\\[(\\d+\\.\\d+(v\\d+)?)\\]\", html) else None\n",
653
+ " paper_title = soup.find(\"h1\", class_=\"ltx_title ltx_title_document\").get_text(strip=True)\n",
654
+ " [m.decompose() for m in soup.find_all([\"math\", \"cite\"])]\n",
655
+ " abstract = \" \".join(p.get_text(strip=True) for p in soup.find(\"div\", class_=\"ltx_abstract\").find_all(\"p\")).replace(\")\", \") \") if soup.find(\"div\", class_=\"ltx_abstract\") else \"\"\n",
656
+ " sections = soup.find_all(\"section\", class_=\"ltx_section\")\n",
657
+ " body = []\n",
658
+ " for section in sections:\n",
659
+ " title = section.find(\"h2\", class_=\"ltx_title ltx_title_section\").get_text(strip=True)\n",
660
+ " content = \" \".join(p.get_text(strip=True).replace(\")\", \") \") for p in section.find_all(\"div\", class_=\"ltx_para\"))\n",
661
+ " body.append({\"title\": title, \"content\": content})\n",
662
+ " return {\"arxiv_id\": arxiv_id, \"paper_title\": paper_title, \"abstract\": abstract, \"body\": body, \"full_text\": soup.get_text(strip=True)}\n",
663
+ "\n",
664
+ "def process_arxiv_items(arxiv_items):\n",
665
+ " return [dict(item, **extract_html_content(item[\"raw_html\"])) for item in arxiv_items if \"raw_html\" in item]\n",
666
+ "\n",
667
+ "processed_items = process_arxiv_items(arxiv_htmls)"
668
+ ]
669
+ },
670
+ {
671
+ "cell_type": "code",
672
+ "execution_count": 8,
673
+ "id": "70a3caaa",
674
+ "metadata": {},
675
+ "outputs": [],
676
+ "source": [
677
+ "%%aip\n",
678
+ "the above doesn't work and missed may details of the original implementation. Do not focus on pooling now\n",
679
+ "just do extract_html_content better\n"
680
+ ]
681
+ },
682
+ {
683
+ "cell_type": "code",
684
+ "execution_count": null,
685
+ "id": "4040257d",
686
+ "metadata": {},
687
+ "outputs": [],
688
+ "source": [
689
+ "def extract_html_content(html):\n",
690
+ " soup, result = BeautifulSoup(html, \"html.parser\"), []\n",
691
+ " arxiv_id = re.search(r\"\\[(\\d+\\.\\d+(v\\d+)?)\\]\", html)\n",
692
+ " arxiv_id = arxiv_id.group(1) if arxiv_id else None\n",
693
+ " paper_title = soup.find(\"h1\", class_=\"ltx_title ltx_title_document\").get_text(strip=True)\n",
694
+ " [m.decompose() for m in soup.find_all([\"math\", \"cite\"])]\n",
695
+ " abstract = soup.find(\"div\", class_=\"ltx_abstract\")\n",
696
+ " if abstract:\n",
697
+ " result.append({\"content\": \" \".join(p.get_text(strip=True) for p in abstract.find_all(\"p\")).replace(\")\", \") \"), \"title\": \"Abstract\", \"paper_title\": paper_title, \"content_type\": \"abstract\"})\n",
698
+ " intro = soup.find(\"section\", id=\"S1\")\n",
699
+ " if intro:\n",
700
+ " for para in intro.find_all(\"div\", class_=\"ltx_para\"):\n",
701
+ " result.append({\"content\": para.get_text(strip=True).replace(\")\", \") \"), \"title\": \"Introduction\", \"paper_title\": paper_title, \"content_type\": \"introduction\"})\n",
702
+ " for section in soup.find_all(\"section\", class_=\"ltx_section\")[1:-1]:\n",
703
+ " section_title = section.find(\"h2\", class_=\"ltx_title ltx_title_section\").get_text(strip=True)\n",
704
+ " for para in section.find_all(\"div\", class_=\"ltx_para\"):\n",
705
+ " result.append({\"content\": para.get_text(strip=True).replace(\")\", \") \"), \"title\": section_title, \"paper_title\": paper_title, \"content_type\": \"body\"})\n",
706
+ " conclusion = soup.find(\"section\", id=\"S4\")\n",
707
+ " if conclusion:\n",
708
+ " result.append({\"content\": \" \".join(p.get_text(strip=True) for p in conclusion.find_all(\"div\", class_=\"ltx_para\")).replace(\")\", \") \"), \"title\": conclusion.find(\"h2\", class_=\"ltx_title ltx_title_section\").get_text(strip=True), \"paper_title\": paper_title, \"content_type\": \"conclusion\"})\n",
709
+ " for c in result:\n",
710
+ " c[\"arxiv_id\"] = arxiv_id\n",
711
+ " return result\n",
712
+ "\n",
713
+ "processed_items = [extract_html_content(item[\"raw_html\"]) for item in arxiv_items if \"raw_html\" in item]"
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "code",
718
+ "execution_count": null,
719
+ "id": "0242f021",
720
+ "metadata": {},
721
+ "outputs": [],
722
+ "source": []
723
+ }
724
+ ],
725
+ "metadata": {
726
+ "kernelspec": {
727
+ "display_name": "Python 3 (ipykernel)",
728
+ "language": "python",
729
+ "name": "python3"
730
+ },
731
+ "language_info": {
732
+ "codemirror_mode": {
733
+ "name": "ipython",
734
+ "version": 3
735
+ },
736
+ "file_extension": ".py",
737
+ "mimetype": "text/x-python",
738
+ "name": "python",
739
+ "nbconvert_exporter": "python",
740
+ "pygments_lexer": "ipython3",
741
+ "version": "3.12.4"
742
+ }
743
+ },
744
+ "nbformat": 4,
745
+ "nbformat_minor": 5
746
+ }