tosanoob committed on
Commit
2739bfd
1 Parent(s): 829dd16

Upload crawl functions

Files changed (1)
  1. testing_functions.ipynb +686 -0
testing_functions.ipynb ADDED
@@ -0,0 +1,686 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import numpy as np\n",
10
+ "import string\n",
11
+ "import pandas as pd\n",
12
+ "import time\n",
13
+ "import urllib\n",
14
+ "import urllib.request\n",
15
+ "import json"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 4,
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ "{'computer science': ['machine learning', 'artificial intelligence', 'hardware architecture', 'computational complexity', 'data structures', 'algorithms', 'graphics', 'databases', 'discrete mathematics', 'human-computer interaction', 'information retrieval', 'multiagent systems', 'neural network'], 'economics': ['general economics', 'theoretical economics', 'econometrics'], 'electrical engineering and system science': ['audio processing', 'speech processing', 'signal processing', 'image and video processing', 'system and controls'], 'mathematics': ['general mathematics', 'general topology', 'group theory', 'numerical analysis', 'probability', 'number theory', 'statistic theory']}\n"
28
+ ]
29
+ }
30
+ ],
31
+ "source": [
32
+ "baseurl = 'http://export.arxiv.org/api/query?search_query='\n",
33
+ "\n",
34
+ "# still ambigious, what are keywords?\n",
35
+ "\n",
36
+ "timestamp = \"2020-01-01\" \n",
37
+ "max_results = 10000\n",
38
+ "date = pd.Timestamp(str(timestamp), tz='US/Pacific')\n",
39
+ "\n",
40
+ "topics = json.load(open(\"topics.txt\",\"r\"))\n",
41
+ "print(topics)"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "for key in topics:\n",
51
+ " # print(key)\n",
52
+ " # prepare url for each topic\n",
53
+ " keyword_list = topics[key]\n",
54
+ " i = 0\n",
55
+ " for keyword in keyword_list:\n",
56
+ " if i ==0:\n",
57
+ " url = baseurl + 'all:' + keyword\n",
58
+ " i = i + 1 \n",
59
+ " else:\n",
60
+ " url = url + '+OR+' + 'all:' + keyword\n",
61
+ " url = url+ '&max_results=' + str(max_results)\n",
62
+ " url = url.replace(' ', '%20')\n",
63
+ "\n",
64
+ " arxiv_page = urllib.request.urlopen(url,timeout=100).read()\n",
65
+ " with open(key+\".xml\",\"wb\") as outfile:\n",
66
+ " outfile.write(arxiv_page)\n",
67
+ " print(url)"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": null,
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "def crawl_from_url(url):\n",
77
+ " try: \n",
78
+ " arxiv_page = urllib.request.urlopen(url,timeout=100).read()\n",
79
+ " with open(\"save.xml\",\"wb\") as outfile:\n",
80
+ " outfile.write(arxiv_page)\n",
81
+ " arxiv_page = str(arxiv_page) \n",
82
+ " # Mỗi record nằm trong một thẻ <entry> \n",
83
+ " # <id> chứa đường dẫn tới paper trên arxiv\n",
84
+ " # <updated>, <published> là thời gian gần nhất cập nhật/xuất bản\n",
85
+ " # <title> là tiêu đề paper\n",
86
+ " # <summary> là abstract paper\n",
87
+ " # có thể có nhiều thẻ <author> chứa tên các tác giả\n",
88
+ " # <link title=\"pdf\" href=\" ... chứa link tải paper\n",
89
+ "\n",
90
+ " # trích 1 record dựa vào thẻ <entry>\n",
91
+ " start = arxiv_page.find(\"<entry>\")\n",
92
+ " end = arxiv_page.find(\"</entry>\")\n",
93
+ " extract = arxiv_page[start+7:end]\n",
94
+ " # print(extract)\n",
95
+ "\n",
96
+ " except Exception as e:\n",
97
+ " print(\"Error occured: \",e)\n",
98
+ "\n",
99
+ "crawl_from_url(url)"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 2,
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "def extract_tag(txt,tagname):\n",
109
+ " return txt[txt.find(\"<\"+tagname+\">\")+len(tagname)+2:txt.find(\"</\"+tagname+\">\")]\n",
110
+ "\n",
111
+ "def get_record(extract):\n",
112
+ " # id = extract[extract.find(\"<id>\")+4:extract.find(\"</id>\")]\n",
113
+ " # updated = extract[extract.find(\"<updated>\")+9:extract.find(\"</updated>\")]\n",
114
+ " # published = extract[extract.find(\"<published>\")+11:extract.find(\"</published>\")]\n",
115
+ " # title = extract[extract.find(\"<title>\")+7:extract.find(\"</title>\")]\n",
116
+ " # summary = extract[extract.find(\"<summary>\")+9:extract.find(\"</summary>\")]\n",
117
+ " id = extract_tag(extract,\"id\")\n",
118
+ " updated = extract_tag(extract,\"updated\")\n",
119
+ " published = extract_tag(extract,\"published\")\n",
120
+ " title = extract_tag(extract,\"title\").replace(\"\\n \",\"\").strip()\n",
121
+ " summary = extract_tag(extract,\"summary\").replace(\"\\n\",\"\").strip()\n",
122
+ " authors = []\n",
123
+ " while extract.find(\"<author>\")!=-1:\n",
124
+ " # author = extract[extract.find(\"<name>\")+6:extract.find(\"</name>\")]\n",
125
+ " author = extract_tag(extract,\"name\")\n",
126
+ " extract = extract[extract.find(\"</author>\")+9:]\n",
127
+ " authors.append(author)\n",
128
+ " pattern = '<link title=\"pdf\" href=\"'\n",
129
+ " link_start = extract.find('<link title=\"pdf\" href=\"')\n",
130
+ " link = extract[link_start+len(pattern):extract.find(\"rel=\",link_start)-2]\n",
131
+ " return [id, updated, published, title, authors, link, summary]"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": 3,
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "name": "stdout",
141
+ "output_type": "stream",
142
+ "text": [
143
+ "{'computer science': ['machine learning', 'artificial intelligence', 'hardware architecture', 'computational complexity', 'data structures', 'algorithms', 'graphics', 'databases', 'discrete mathematics', 'human-computer interaction', 'information retrieval', 'multiagent systems', 'neural network'], 'economics': ['general economics', 'theoretical economics', 'econometrics'], 'electrical engineering and system science': ['audio processing', 'speech processing', 'signal processing', 'image and video processing', 'system and controls'], 'mathematics': ['general mathematics', 'general topology', 'group theory', 'numerical analysis', 'probability', 'number theory', 'statistic theory']}\n"
144
+ ]
145
+ }
146
+ ],
147
+ "source": [
148
+ "# load xml\n",
149
+ "topics = json.load(open(\"topics.txt\",\"r\"))\n",
150
+ "print(topics)\n",
151
+ "records = []\n",
152
+ "for key in topics:\n",
153
+ " with open(key+\".xml\",\"rb\") as infile:\n",
154
+ " xml = infile.read()\n",
155
+ " xml = str(xml,encoding=\"utf-8\")\n",
156
+ " while xml.find(\"<entry>\") != -1:\n",
157
+ " extract = xml[xml.find(\"<entry>\")+7:xml.find(\"</entry>\")]\n",
158
+ " xml = xml[xml.find(\"</entry>\")+8:]\n",
159
+ " records.append([key,*get_record(extract)])"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": 4,
165
+ "metadata": {},
166
+ "outputs": [
167
+ {
168
+ "name": "stdout",
169
+ "output_type": "stream",
170
+ "text": [
171
+ "3000\n",
172
+ "<class 'list'>\n"
173
+ ]
174
+ }
175
+ ],
176
+ "source": [
177
+ "print(len(records))\n",
178
+ "print(type(records[32][5]))"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "metadata": {},
185
+ "outputs": [],
186
+ "source": [
187
+ "df = pd.DataFrame(records,columns=[\"topic\",\"id\",\"updated\",\"published\",\"title\",\"author\",\"link\",\"summary\",])\n",
188
+ "df.to_csv(\"arxiv_crawl.csv\")"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "import json\n",
198
+ "topics_descriptions = json.load(open(\"topic_descriptions.txt\",\"r\"))\n",
199
+ "print(topics_descriptions)"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": [
208
+ "embed = model.encode(\"\"\"Recommendation systems for different Document Networks (DN) such as the World\n",
209
+ "Wide Web (WWW) and Digital Libraries, often use distance functions extracted\n",
210
+ "from relationships among documents and keywords. For instance, documents in the\n",
211
+ "WWW are related via a hyperlink network, while documents in bibliographic\n",
212
+ "databases are related by citation and collaboration networks. Furthermore,\n",
213
+ "documents are related to keyterms. The distance functions computed from these\n",
214
+ "relations establish associative networks among items of the DN, referred to as\n",
215
+ "Distance Graphs, which allow recommendation systems to identify relevant\n",
216
+ "associations for individual users. However, modern recommendation systems need\n",
217
+ "to integrate associative data from multiple sources such as different\n",
218
+ "databases, web sites, and even other users. Thus, we are presented with a\n",
219
+ "problem of combining evidence (about associations between items) from different\n",
220
+ "sources characterized by distance functions. In this paper we describe our work\n",
221
+ "on (1) inferring relevant associations from, as well as characterizing,\n",
222
+ "semi-metric distance graphs and (2) combining evidence from different distance\n",
223
+ "graphs in a recommendation system. Regarding (1), we present the idea of\n",
224
+ "semi-metric distance graphs, and introduce ratios to measure semi-metric\n",
225
+ "behavior. We compute these ratios for several DN such as digital libraries and\n",
226
+ "web sites and show that they are useful to identify implicit associations.\n",
227
+ "Regarding (2), we describe an algorithm to combine evidence from distance\n",
228
+ "graphs that uses Evidence Sets, a set structure based on Interval Valued Fuzzy\n",
229
+ "Sets and Dempster-Shafer Theory of Evidence. This algorithm has been developed\n",
230
+ "for a recommendation system named TalkMine.\"\"\")\n",
231
+ "for topic in topics_descriptions:\n",
232
+ " description = topics_descriptions[topic]\n",
233
+ " embed_desc = model.encode(description)\n",
234
+ " print(topic+\": \"+str(cos_sim(embed,embed_desc)))"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": 5,
240
+ "metadata": {},
241
+ "outputs": [],
242
+ "source": [
243
+ "import chromadb\n",
244
+ "from chromadb import Documents, EmbeddingFunction, Embeddings\n",
245
+ "\n",
246
+ "from transformers import AutoModel\n",
247
+ "from numpy.linalg import norm\n",
248
+ "\n",
249
+ "cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))\n",
250
+ "model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en',\n",
251
+ " trust_remote_code=True,\n",
252
+ " cache_dir='models') # trust_remote_code is needed to use the encode method\n",
253
+ "class JinaAIEmbeddingFunction(EmbeddingFunction):\n",
254
+ " def __init__(self, model):\n",
255
+ " super().__init__()\n",
256
+ " self.model = model\n",
257
+ "\n",
258
+ " def __call__(self, input: Documents) -> Embeddings:\n",
259
+ " embeddings = self.model.encode(input)\n",
260
+ " return embeddings.tolist()\n",
261
+ "\n",
262
+ "ef = JinaAIEmbeddingFunction(model)"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 8,
268
+ "metadata": {},
269
+ "outputs": [],
270
+ "source": [
271
+ "client = chromadb.PersistentClient(path=\"arxivdb/\")\n",
272
+ "# first creation, embedding function = default\n",
273
+ "# collection = client.create_collection(name=\"arxiv_records\",metadata={\"hnsw:space\": \"cosine\"})\n",
274
+ "# later call\n",
275
+ "collection = client.get_or_create_collection(name=\"arxiv_records\", embedding_function=ef, metadata={\"hnsw:space\": \"cosine\"})\n"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": 7,
281
+ "metadata": {},
282
+ "outputs": [],
283
+ "source": [
284
+ "client.delete_collection(name=\"arxiv_records\")"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 13,
290
+ "metadata": {},
291
+ "outputs": [],
292
+ "source": [
293
+ "import sqlite3\n",
294
+ "con = sqlite3.connect(\"arxiv_records_sql\")\n",
295
+ "cur = con.cursor()"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "code",
300
+ "execution_count": 14,
301
+ "metadata": {},
302
+ "outputs": [
303
+ {
304
+ "ename": "OperationalError",
305
+ "evalue": "table arxivsql already exists",
306
+ "output_type": "error",
307
+ "traceback": [
308
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
309
+ "\u001b[1;31mOperationalError\u001b[0m Traceback (most recent call last)",
310
+ "Cell \u001b[1;32mIn[14], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mcur\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\"\"\u001b[39;49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124;43m create table arxivsql(\u001b[39;49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;43m id,\u001b[39;49m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;124;43m topic,\u001b[39;49m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;124;43m title,\u001b[39;49m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;124;43m authors,\u001b[39;49m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;124;43m year_updated,\u001b[39;49m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;124;43m year_published,\u001b[39;49m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;124;43m link\u001b[39;49m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;124;43m )\u001b[39;49m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;124;43m\"\"\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 12\u001b[0m con\u001b[38;5;241m.\u001b[39mcommit()\n",
311
+ "\u001b[1;31mOperationalError\u001b[0m: table arxivsql already exists"
312
+ ]
313
+ }
314
+ ],
315
+ "source": [
316
+ "cur.execute(\"\"\"\n",
317
+ " create table arxivsql(\n",
318
+ " id,\n",
319
+ " topic,\n",
320
+ " title,\n",
321
+ " authors,\n",
322
+ " year_updated,\n",
323
+ " year_published,\n",
324
+ " link\n",
325
+ " )\n",
326
+ "\"\"\")\n",
327
+ "con.commit()"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": 42,
333
+ "metadata": {},
334
+ "outputs": [],
335
+ "source": [
336
+ "cur.execute(\"drop table arxivsql\")\n",
337
+ "con.commit()"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 8,
343
+ "metadata": {},
344
+ "outputs": [
345
+ {
346
+ "name": "stdout",
347
+ "output_type": "stream",
348
+ "text": [
349
+ "(3000, 8)\n",
350
+ "<class 'numpy.ndarray'>\n"
351
+ ]
352
+ }
353
+ ],
354
+ "source": [
355
+ "import pandas as pd\n",
356
+ "df = pd.read_csv(\"arxiv_crawl.csv\",index_col=0,header=0)\n",
357
+ "print(df.shape)\n",
358
+ "records = df.values\n",
359
+ "print(type(records))"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 9,
365
+ "metadata": {},
366
+ "outputs": [
367
+ {
368
+ "name": "stdout",
369
+ "output_type": "stream",
370
+ "text": [
371
+ "Domenico Amato, Giosue' Lo Bosco, Raffaele Giancarl\n"
372
+ ]
373
+ }
374
+ ],
375
+ "source": [
376
+ "def chunk_text(text, max_char=400):\n",
377
+ " \"\"\"\n",
378
+ " Chunk a long text into several chunks, with each chunk about 300-400 characters long,\n",
379
+ " but make sure no word is cut in half.\n",
380
+ " Args:\n",
381
+ " text: The long text to be chunked.\n",
382
+ " max_char: The maximum number of characters per chunk (default: 400).\n",
383
+ " Returns:\n",
384
+ " A list of chunks.\n",
385
+ " \"\"\"\n",
386
+ " chunks = []\n",
387
+ " current_chunk = \"\"\n",
388
+ " words = text.split()\n",
389
+ " for word in words:\n",
390
+ " # Check if adding the word would exceed the chunk limit (including overlap)\n",
391
+ " if len(current_chunk) + len(word) + 1 >= max_char:\n",
392
+ " chunks.append(current_chunk)\n",
393
+ " current_chunk = word\n",
394
+ " else:\n",
395
+ " current_chunk += \" \" + word\n",
396
+ " chunks.append(current_chunk.strip())\n",
397
+ " return chunks\n",
398
+ "\n",
399
+ "def process_authors(authors):\n",
400
+ " text = \"\"\n",
401
+ " for author in authors:\n",
402
+ " text+=author+\", \"\n",
403
+ " return text[:-3]\n",
404
+ "\n",
405
+ "print(process_authors(records[32][5]))"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 10,
411
+ "metadata": {},
412
+ "outputs": [
413
+ {
414
+ "name": "stdout",
415
+ "output_type": "stream",
416
+ "text": [
417
+ "200\n",
418
+ "400\n",
419
+ "600\n",
420
+ "800\n",
421
+ "1000\n",
422
+ "1200\n",
423
+ "1400\n",
424
+ "1600\n",
425
+ "1800\n",
426
+ "2000\n",
427
+ "2200\n",
428
+ "2400\n",
429
+ "2600\n",
430
+ "2800\n"
431
+ ]
432
+ },
433
+ {
434
+ "name": "stderr",
435
+ "output_type": "stream",
436
+ "text": [
437
+ "Insert of existing embedding ID: 2111.13171v1_0\n",
438
+ "Insert of existing embedding ID: 2111.13171v1_1\n",
439
+ "Insert of existing embedding ID: 2111.13171v1_2\n",
440
+ "Insert of existing embedding ID: 2111.13171v1_3\n",
441
+ "Insert of existing embedding ID: 2111.13171v1_4\n",
442
+ "Add of existing embedding ID: 2111.13171v1_0\n",
443
+ "Add of existing embedding ID: 2111.13171v1_1\n",
444
+ "Add of existing embedding ID: 2111.13171v1_2\n",
445
+ "Add of existing embedding ID: 2111.13171v1_3\n",
446
+ "Add of existing embedding ID: 2111.13171v1_4\n",
447
+ "Insert of existing embedding ID: 2211.03756v1_0\n",
448
+ "Insert of existing embedding ID: 2211.03756v1_1\n",
449
+ "Insert of existing embedding ID: 2211.03756v1_2\n",
450
+ "Insert of existing embedding ID: 2211.03756v1_3\n",
451
+ "Insert of existing embedding ID: 2211.03756v1_4\n",
452
+ "Insert of existing embedding ID: 2211.03756v1_5\n",
453
+ "Add of existing embedding ID: 2211.03756v1_0\n",
454
+ "Add of existing embedding ID: 2211.03756v1_1\n",
455
+ "Add of existing embedding ID: 2211.03756v1_2\n",
456
+ "Add of existing embedding ID: 2211.03756v1_3\n",
457
+ "Add of existing embedding ID: 2211.03756v1_4\n",
458
+ "Add of existing embedding ID: 2211.03756v1_5\n"
459
+ ]
460
+ },
461
+ {
462
+ "name": "stdout",
463
+ "output_type": "stream",
464
+ "text": [
465
+ "3000\n"
466
+ ]
467
+ }
468
+ ],
469
+ "source": [
470
+ "count = 0\n",
471
+ "for record in records:\n",
472
+ " # add to vector db\n",
473
+ " embed_text = \"\"\"\n",
474
+ " Topic: {},\n",
475
+ " Title: {},\n",
476
+ " Summary: {}\n",
477
+ "\"\"\".format(\n",
478
+ " record[0], record[4], record[7]\n",
479
+ " )\n",
480
+ " chunks = chunk_text(embed_text)\n",
481
+ " ids = [record[1][21:] + \"_\" + str(j) for j in range(len(chunks))]\n",
482
+ " paper_ids = [{\"paper_id\": record[1][21:]} for _ in range(len(chunks))]\n",
483
+ " collection.add(documents=chunks, metadatas=paper_ids, ids=ids)\n",
484
+ " # try:\n",
485
+ " # query = \"\"\"insert into arxivsql values(\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\")\"\"\".format(\n",
486
+ " # record[1][21:],\n",
487
+ " # record[0],\n",
488
+ " # record[4].replace('\"', \"'\"),\n",
489
+ " # process_authors(record[5]),\n",
490
+ " # record[2][:10],\n",
491
+ " # record[3][:10],\n",
492
+ " # record[6],\n",
493
+ " # )\n",
494
+ " # cur.execute(query)\n",
495
+ " # con.commit()\n",
496
+ " # except Exception as e:\n",
497
+ " # print(e)\n",
498
+ " # print(query)\n",
499
+ " count += 1\n",
500
+ " if count % 200 == 0:\n",
501
+ " print(count)"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": 29,
507
+ "metadata": {},
508
+ "outputs": [],
509
+ "source": [
510
+ "cur.execute(\"\"\"insert into arxivsql values(\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\")\"\"\".format(\n",
511
+ " \"1906.04027v2\", #editing\n",
512
+ " \"electrical engineering and system science\",\n",
513
+ " \"'Did You Hear That?'' Learning to Play Video Games from Audio Cues\",\"Raluca D. Gaina, Matthew Stephenso\",\n",
514
+ " \"Hadi Abdullah, Muhammad Sajidur Rahman, Washington Garcia, Logan Blue, Kevin Warren, Anurag Swarnim Yadav, Tom Shrimpton, Patrick Trayno\",\n",
515
+ " \"2019-06-11\",\n",
516
+ " \"2019-06-10\",\n",
517
+ " \"http://arxiv.org/pdf/1910.05262v1\"\n",
518
+ " ))\n",
519
+ "con.commit()"
520
+ ]
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": 11,
525
+ "metadata": {},
526
+ "outputs": [
527
+ {
528
+ "ename": "NameError",
529
+ "evalue": "name 'cur' is not defined",
530
+ "output_type": "error",
531
+ "traceback": [
532
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
533
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
534
+ "Cell \u001b[1;32mIn[11], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mcur\u001b[49m\u001b[38;5;241m.\u001b[39mexecute(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mselect * from arxivsql where True and True\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(res\u001b[38;5;241m.\u001b[39mfetchall())\n",
535
+ "\u001b[1;31mNameError\u001b[0m: name 'cur' is not defined"
536
+ ]
537
+ }
538
+ ],
539
+ "source": [
540
+ "res = cur.execute(\"select * from arxivsql where True and True\")\n",
541
+ "print(res.fetchall())"
542
+ ]
543
+ },
544
+ {
545
+ "cell_type": "code",
546
+ "execution_count": 12,
547
+ "metadata": {},
548
+ "outputs": [
549
+ {
550
+ "name": "stdout",
551
+ "output_type": "stream",
552
+ "text": [
553
+ "10740\n"
554
+ ]
555
+ }
556
+ ],
557
+ "source": [
558
+ "print(collection.count())"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": 43,
564
+ "metadata": {},
565
+ "outputs": [
566
+ {
567
+ "name": "stdout",
568
+ "output_type": "stream",
569
+ "text": [
570
+ "['2211.03756v1_0', '2211.03756v1_1', '2211.03756v1_2', '2211.03756v1_3', '2211.03756v1_4', '2211.03756v1_5', '2211.03756v1_6']\n"
571
+ ]
572
+ }
573
+ ],
574
+ "source": [
575
+ "id = \"2211.03756v1\"\n",
576
+ "ids = [\"{}_{}\".format(id,j) for j in range(0,10)]\n",
577
+ "results = collection.get(ids=ids)\n",
578
+ "print(results[\"ids\"])"
579
+ ]
580
+ },
581
+ {
582
+ "cell_type": "code",
583
+ "execution_count": null,
584
+ "metadata": {},
585
+ "outputs": [],
586
+ "source": [
587
+ "results = collection.query(\n",
588
+ " query_texts = \"recommend academic articles or books related to the field of artificial intelligence, machine learning and technology for the AI intern to explore further\",\n",
589
+ " where_document = {\n",
590
+ " \"$or\":[\n",
591
+ " {\"$contains\":\"AI\"},\n",
592
+ " {\"$contains\":\"machine learning\"},\n",
593
+ " {\"$contains\":\"technology\"}\n",
594
+ " ]\n",
595
+ " },\n",
596
+ " n_results=3\n",
597
+ ")"
598
+ ]
599
+ },
600
+ {
601
+ "cell_type": "code",
602
+ "execution_count": 51,
603
+ "metadata": {},
604
+ "outputs": [
605
+ {
606
+ "name": "stdout",
607
+ "output_type": "stream",
608
+ "text": [
609
+ "['title', 'author']\n"
610
+ ]
611
+ }
612
+ ],
613
+ "source": [
614
+ "args = {\"title\":\"Attention is all you need\",\n",
615
+ " \"author\": \"Vaswani, Ashish and Shazeer\"}\n",
616
+ "keys = list(dict.keys(args))\n",
617
+ "print(keys)\n"
618
+ ]
619
+ },
620
+ {
621
+ "cell_type": "code",
622
+ "execution_count": null,
623
+ "metadata": {},
624
+ "outputs": [],
625
+ "source": [
626
+ "def printline(txt, maxline = 100):\n",
627
+ " for i in range(len(txt)):\n",
628
+ " if i%maxline == maxline-1:\n",
629
+ " print(txt[i],end=\"\\n\")\n",
630
+ " else: print(txt[i],end=\"\")\n",
631
+ "\n",
632
+ "print(dict.keys(results))\n",
633
+ "# get metadatas\n",
634
+ "target = results['metadatas'][0]\n",
635
+ "for rec in target:\n",
636
+ " print(rec['author'])\n",
637
+ " print(rec['link'])\n",
638
+ " printline(rec['summary'])\n",
639
+ " print(\"\\n------------------------------------------\")"
640
+ ]
641
+ },
642
+ {
643
+ "cell_type": "code",
644
+ "execution_count": null,
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": [
648
+ "t = target[0]\n",
649
+ "print(t['link'])\n",
650
+ "print(t['title'])\n",
651
+ "print(t['summary'])"
652
+ ]
653
+ },
654
+ {
655
+ "cell_type": "code",
656
+ "execution_count": null,
657
+ "metadata": {},
658
+ "outputs": [],
659
+ "source": [
660
+ "args = '[\"AI technologies\",\"Find academic papers\"]'\n",
661
+ "print(list(args))"
662
+ ]
663
+ }
664
+ ],
665
+ "metadata": {
666
+ "kernelspec": {
667
+ "display_name": "Python 3",
668
+ "language": "python",
669
+ "name": "python3"
670
+ },
671
+ "language_info": {
672
+ "codemirror_mode": {
673
+ "name": "ipython",
674
+ "version": 3
675
+ },
676
+ "file_extension": ".py",
677
+ "mimetype": "text/x-python",
678
+ "name": "python",
679
+ "nbconvert_exporter": "python",
680
+ "pygments_lexer": "ipython3",
681
+ "version": "3.11.2"
682
+ }
683
+ },
684
+ "nbformat": 4,
685
+ "nbformat_minor": 2
686
+ }