Spaces:
Sleeping
Sleeping
Refactoring gemini functions (#9)
Browse files- Refactoring gemini functions (b483e15951ebe8673817778370aa68a016fa09fa)
Co-authored-by: Trương Tấn Cường <tosanoob@users.noreply.huggingface.co>
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +37 -35
- .gitignore +4 -6
- arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/data_level0.bin +3 -0
- arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/header.bin +3 -0
- arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/index_metadata.pickle +3 -0
- arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/length.bin +3 -0
- arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/link_lists.bin +3 -0
- arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/data_level0.bin +3 -0
- arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/header.bin +3 -0
- arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/index_metadata.pickle +3 -0
- arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/length.bin +3 -0
- arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/link_lists.bin +3 -0
- arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/data_level0.bin +3 -0
- arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/header.bin +3 -0
- arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/index_metadata.pickle +3 -0
- arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/length.bin +3 -0
- arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/link_lists.bin +3 -0
- arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/data_level0.bin +3 -0
- arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/header.bin +3 -0
- arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/index_metadata.pickle +3 -0
- arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/length.bin +3 -0
- arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/link_lists.bin +3 -0
- arxivdb/chroma.sqlite3 +3 -0
- arxivdb/chromadb.sqlite3 +0 -0
- chat/__init__.py +2 -2
- chat/__pycache__/__init__.cpython-311.pyc +0 -0
- chat/__pycache__/apps.cpython-311.pyc +0 -0
- chat/__pycache__/consumers.cpython-311.pyc +0 -0
- chat/__pycache__/model_manage.cpython-311.pyc +0 -0
- chat/__pycache__/model_manage2.cpython-311.pyc +0 -0
- chat/__pycache__/routing.cpython-311.pyc +0 -0
- chat/__pycache__/urls.cpython-311.pyc +0 -0
- chat/__pycache__/views.cpython-311.pyc +0 -0
- chat/arxiv_bot/__pycache__/arxiv_bot_utils.cpython-311.pyc +0 -0
- chat/arxiv_bot/__pycache__/arxiv_bot_utils2.cpython-311.pyc +0 -0
- chat/arxiv_bot/arxiv_bot_utils.py +248 -248
- chat/arxiv_bot/arxiv_bot_utils2.py +297 -0
- chat/arxiv_bot/prebuild.ipynb +354 -354
- chat/consumers.py +10 -6
- chat/migrations/__pycache__/0001_initial.cpython-311.pyc +0 -0
- chat/migrations/__pycache__/__init__.cpython-311.pyc +0 -0
- chat/model_manage.py +238 -238
- chat/model_manage2.py +174 -0
- chatbot_django/__pycache__/__init__.cpython-311.pyc +0 -0
- chatbot_django/__pycache__/asgi.cpython-311.pyc +0 -0
- chatbot_django/__pycache__/settings.cpython-311.pyc +0 -0
- chatbot_django/__pycache__/urls.cpython-311.pyc +0 -0
- concat.txt +0 -0
- db.sqlite3 +0 -0
- models/models--jinaai--jina-bert-implementation/blobs/64b6ce6fe4477c320b0ab303e2f26ae98beae1f7 +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,37 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
arxivdb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
models/models--jinaai--jina-embeddings-v2-base-en/blobs/6b70f1386f05b9703ea4edf7f1550a8925399f9580e4cc754cc099efc1e736d8 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -1,7 +1,5 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
apikey.txt
|
6 |
-
db.sqlite3
|
7 |
hotfix.ipynb
|
|
|
1 |
+
models/
|
2 |
+
__pycache__/
|
3 |
+
*.pyc
|
4 |
+
apikey.txt
|
|
|
|
|
5 |
hotfix.ipynb
|
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8d96c82cf4183e567eddf45be92064c7d818268621da9821caa2367bb20cba18
|
3 |
+
size 32120000
|
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a494575edafaafb2b60f5a2ad563719976abf7ae3a35ca7c9b5aaae36842006c
|
3 |
+
size 100
|
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91e4880dca7113b4c3a3644e63aa5809f4a30474d1332f66d5f0ad082fe41833
|
3 |
+
size 357939
|
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:814d4b3244fb0f86f8d5beac519239863d973c20c8fec45624d0c0ae54baf9cf
|
3 |
+
size 40000
|
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d679f3012c3a4ae23e21dbfce89bb153cab85edef4c19f5340a4464e99f4c014
|
3 |
+
size 87396
|
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31ea31ff76723407f460b7534220ef974bfb3a563732c1a85e01fd9b2610dc13
|
3 |
+
size 6424000
|
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db5064bd751b93036fa600922f99c2534c183c3335c5267c8c5413a73f450320
|
3 |
+
size 100
|
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd938c16ea62b22a52094297d5d570442daba226ad67e941b0254655e843c67a
|
3 |
+
size 65937
|
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9d9d14a589aeeaf2e86552f9c3f1bb4f556e49244f186540c71bac6c1680e834
|
3 |
+
size 8000
|
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3751bd54da338722a3b5370921bf446e34169a639a18beb7145e5d4e9e3778e3
|
3 |
+
size 18268
|
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:86e41597eb04379b7582da7eeb5fb0aaca29eb32749069e69886358370fab575
|
3 |
+
size 3212000
|
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fdb00e89b6ee7733fd37556b1da3447d9895ad7431512096c0e073ed667a25d0
|
3 |
+
size 100
|
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd5b11142a96276baf9591e9524a8d9241eb013902301021dddea3a81b61d63a
|
3 |
+
size 33934
|
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e680a6fe8d1f2bf76260963cf27e0c7bd58c39e9c82262906a285eaf89b1c27d
|
3 |
+
size 4000
|
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:666164435753a1784160baf485cc1c80e665103e6bd19a1998430f93246f1c29
|
3 |
+
size 8624
|
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ecac6a0b1c9974d085507909895bec9040788bd20bf184eae140000cef97551d
|
3 |
+
size 38544000
|
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:269f137da42a494d996ad44046f5e349b59d2d31eca4b39aa82d7ec76f62cdf9
|
3 |
+
size 100
|
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:08461957df6b58835618a34c77a9c96b6dc54f21e04c60c9d10dd36d5b864414
|
3 |
+
size 429953
|
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e648660e0a36f652356dd7e0210b243cba14b3b7c267c3c05fdc7614b1d2dd03
|
3 |
+
size 48000
|
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13b650da98a6bd2ec371437494a2cb09a2fae5b67d6eead12af43b40fb548e7c
|
3 |
+
size 104644
|
arxivdb/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0c48f817474996b45a3f4da1e127a2fde083db4bfeddb71893d598b8200fb056
|
3 |
+
size 123736064
|
arxivdb/chromadb.sqlite3
ADDED
File without changes
|
chat/__init__.py
CHANGED
@@ -3,7 +3,7 @@ import chat.arxiv_bot.arxiv_bot_utils as utils
|
|
3 |
import os
|
4 |
from getpass import getpass
|
5 |
import json
|
6 |
-
from .model_manage import get_model
|
7 |
|
8 |
-
model = get_model()
|
9 |
|
|
|
3 |
import os
|
4 |
from getpass import getpass
|
5 |
import json
|
6 |
+
# from .model_manage import get_model
|
7 |
|
8 |
+
# model = get_model()
|
9 |
|
chat/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (358 Bytes). View file
|
|
chat/__pycache__/apps.cpython-311.pyc
ADDED
Binary file (602 Bytes). View file
|
|
chat/__pycache__/consumers.cpython-311.pyc
ADDED
Binary file (1.91 kB). View file
|
|
chat/__pycache__/model_manage.cpython-311.pyc
ADDED
Binary file (164 Bytes). View file
|
|
chat/__pycache__/model_manage2.cpython-311.pyc
ADDED
Binary file (10.5 kB). View file
|
|
chat/__pycache__/routing.cpython-311.pyc
ADDED
Binary file (567 Bytes). View file
|
|
chat/__pycache__/urls.cpython-311.pyc
ADDED
Binary file (456 Bytes). View file
|
|
chat/__pycache__/views.cpython-311.pyc
ADDED
Binary file (601 Bytes). View file
|
|
chat/arxiv_bot/__pycache__/arxiv_bot_utils.cpython-311.pyc
ADDED
Binary file (177 Bytes). View file
|
|
chat/arxiv_bot/__pycache__/arxiv_bot_utils2.cpython-311.pyc
ADDED
Binary file (19.1 kB). View file
|
|
chat/arxiv_bot/arxiv_bot_utils.py
CHANGED
@@ -1,276 +1,276 @@
|
|
1 |
-
import chromadb
|
2 |
-
from chromadb import Documents, EmbeddingFunction, Embeddings
|
3 |
-
from transformers import AutoModel
|
4 |
-
import json
|
5 |
-
from numpy.linalg import norm
|
6 |
-
import sqlite3
|
7 |
-
import urllib
|
8 |
-
from django.conf import settings
|
9 |
|
10 |
|
11 |
-
# this module act as a singleton class
|
12 |
|
13 |
-
class JinaAIEmbeddingFunction(EmbeddingFunction):
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
|
22 |
-
# instance of embedding_model
|
23 |
-
embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en',
|
24 |
-
|
25 |
-
|
26 |
|
27 |
-
# instance of JinaAIEmbeddingFunction
|
28 |
-
ef = JinaAIEmbeddingFunction(embedding_model)
|
29 |
|
30 |
-
# list of topics
|
31 |
-
topic_descriptions = json.load(open("topic_descriptions.txt"))
|
32 |
-
topics = list(dict.keys(topic_descriptions))
|
33 |
-
embeddings = [embedding_model.encode(topic_descriptions[key]) for key in topic_descriptions]
|
34 |
-
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
|
35 |
|
36 |
-
def choose_topic(summary):
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
|
47 |
-
def authors_list_to_str(authors):
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
|
54 |
-
def authors_str_to_list(string):
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
|
63 |
-
def chunk_texts(text, max_char=400):
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
|
85 |
-
def trimming(txt):
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
|
90 |
-
# crawl data
|
91 |
|
92 |
-
def extract_tag(txt,tagname):
|
93 |
-
|
94 |
|
95 |
-
def get_record(extract):
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
|
111 |
-
def crawl_exact_paper(title,author,max_results=3):
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
|
129 |
-
def crawl_arxiv(keyword_list, max_results=100):
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
|
152 |
-
class ArxivSQL:
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
|
215 |
-
# instance of ArxivSQL
|
216 |
-
sqldb = ArxivSQL()
|
217 |
|
218 |
-
class ArxivChroma:
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
|
274 |
-
# instance of ArxivChroma
|
275 |
-
db = ArxivChroma()
|
276 |
|
|
|
1 |
+
# import chromadb
|
2 |
+
# from chromadb import Documents, EmbeddingFunction, Embeddings
|
3 |
+
# from transformers import AutoModel
|
4 |
+
# import json
|
5 |
+
# from numpy.linalg import norm
|
6 |
+
# import sqlite3
|
7 |
+
# import urllib
|
8 |
+
# from django.conf import settings
|
9 |
|
10 |
|
11 |
+
# # this module act as a singleton class
|
12 |
|
13 |
+
# class JinaAIEmbeddingFunction(EmbeddingFunction):
|
14 |
+
# def __init__(self, model):
|
15 |
+
# super().__init__()
|
16 |
+
# self.model = model
|
17 |
|
18 |
+
# def __call__(self, input: Documents) -> Embeddings:
|
19 |
+
# embeddings = self.model.encode(input)
|
20 |
+
# return embeddings.tolist()
|
21 |
|
22 |
+
# # instance of embedding_model
|
23 |
+
# embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en',
|
24 |
+
# trust_remote_code=True,
|
25 |
+
# cache_dir='models')
|
26 |
|
27 |
+
# # instance of JinaAIEmbeddingFunction
|
28 |
+
# ef = JinaAIEmbeddingFunction(embedding_model)
|
29 |
|
30 |
+
# # list of topics
|
31 |
+
# topic_descriptions = json.load(open("topic_descriptions.txt"))
|
32 |
+
# topics = list(dict.keys(topic_descriptions))
|
33 |
+
# embeddings = [embedding_model.encode(topic_descriptions[key]) for key in topic_descriptions]
|
34 |
+
# cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
|
35 |
|
36 |
+
# def choose_topic(summary):
|
37 |
+
# embed = embedding_model.encode(summary)
|
38 |
+
# topic = ""
|
39 |
+
# max_sim = 0.
|
40 |
+
# for i,key in enumerate(topics):
|
41 |
+
# sim = cos_sim(embed,embeddings[i])
|
42 |
+
# if sim > max_sim:
|
43 |
+
# topic = key
|
44 |
+
# max_sim = sim
|
45 |
+
# return topic
|
46 |
|
47 |
+
# def authors_list_to_str(authors):
|
48 |
+
# """input a list of authors, return a string represent authors"""
|
49 |
+
# text = ""
|
50 |
+
# for author in authors:
|
51 |
+
# text+=author+", "
|
52 |
+
# return text[:-3]
|
53 |
|
54 |
+
# def authors_str_to_list(string):
|
55 |
+
# """input a string of authors, return a list of authors"""
|
56 |
+
# authors = []
|
57 |
+
# list_auth = string.split("and")
|
58 |
+
# for author in list_auth:
|
59 |
+
# if author != "et al.":
|
60 |
+
# authors.append(author.strip())
|
61 |
+
# return authors
|
62 |
|
63 |
+
# def chunk_texts(text, max_char=400):
|
64 |
+
# """
|
65 |
+
# Chunk a long text into several chunks, with each chunk about 300-400 characters long,
|
66 |
+
# but make sure no word is cut in half.
|
67 |
+
# Args:
|
68 |
+
# text: The long text to be chunked.
|
69 |
+
# max_char: The maximum number of characters per chunk (default: 400).
|
70 |
+
# Returns:
|
71 |
+
# A list of chunks.
|
72 |
+
# """
|
73 |
+
# chunks = []
|
74 |
+
# current_chunk = ""
|
75 |
+
# words = text.split()
|
76 |
+
# for word in words:
|
77 |
+
# if len(current_chunk) + len(word) + 1 >= max_char:
|
78 |
+
# chunks.append(current_chunk)
|
79 |
+
# current_chunk = " "
|
80 |
+
# else:
|
81 |
+
# current_chunk += " " + word
|
82 |
+
# chunks.append(current_chunk.strip())
|
83 |
+
# return chunks
|
84 |
|
85 |
+
# def trimming(txt):
|
86 |
+
# start = txt.find("{")
|
87 |
+
# end = txt.rfind("}")
|
88 |
+
# return txt[start:end+1].replace("\n"," ")
|
89 |
|
90 |
+
# # crawl data
|
91 |
|
92 |
+
# def extract_tag(txt,tagname):
|
93 |
+
# return txt[txt.find("<"+tagname+">")+len(tagname)+2:txt.find("</"+tagname+">")]
|
94 |
|
95 |
+
# def get_record(extract):
|
96 |
+
# id = extract_tag(extract,"id")
|
97 |
+
# updated = extract_tag(extract,"updated")
|
98 |
+
# published = extract_tag(extract,"published")
|
99 |
+
# title = extract_tag(extract,"title").replace("\n ","").strip()
|
100 |
+
# summary = extract_tag(extract,"summary").replace("\n","").strip()
|
101 |
+
# authors = []
|
102 |
+
# while extract.find("<author>")!=-1:
|
103 |
+
# author = extract_tag(extract,"name")
|
104 |
+
# extract = extract[extract.find("</author>")+9:]
|
105 |
+
# authors.append(author)
|
106 |
+
# pattern = '<link title="pdf" href="'
|
107 |
+
# link_start = extract.find('<link title="pdf" href="')
|
108 |
+
# link = extract[link_start+len(pattern):extract.find("rel=",link_start)-2]
|
109 |
+
# return [id, updated, published, title, authors, link, summary]
|
110 |
|
111 |
+
# def crawl_exact_paper(title,author,max_results=3):
|
112 |
+
# authors = authors_list_to_str(author)
|
113 |
+
# records = []
|
114 |
+
# url = 'http://export.arxiv.org/api/query?search_query=ti:{title}+AND+au:{author}&max_results={max_results}'.format(title=title,author=authors,max_results=max_results)
|
115 |
+
# url = url.replace(" ","%20")
|
116 |
+
# try:
|
117 |
+
# arxiv_page = urllib.request.urlopen(url,timeout=100).read()
|
118 |
+
# xml = str(arxiv_page,encoding="utf-8")
|
119 |
+
# while xml.find("<entry>") != -1:
|
120 |
+
# extract = xml[xml.find("<entry>")+7:xml.find("</entry>")]
|
121 |
+
# xml = xml[xml.find("</entry>")+8:]
|
122 |
+
# extract = get_record(extract)
|
123 |
+
# topic = choose_topic(extract[6])
|
124 |
+
# records.append([topic,*extract])
|
125 |
+
# return records
|
126 |
+
# except Exception as e:
|
127 |
+
# return "Error: "+str(e)
|
128 |
|
129 |
+
# def crawl_arxiv(keyword_list, max_results=100):
|
130 |
+
# baseurl = 'http://export.arxiv.org/api/query?search_query='
|
131 |
+
# records = []
|
132 |
+
# for i,keyword in enumerate(keyword_list):
|
133 |
+
# if i ==0:
|
134 |
+
# url = baseurl + 'all:' + keyword
|
135 |
+
# else:
|
136 |
+
# url = url + '+OR+' + 'all:' + keyword
|
137 |
+
# url = url+ '&max_results=' + str(max_results)
|
138 |
+
# url = url.replace(' ', '%20')
|
139 |
+
# try:
|
140 |
+
# arxiv_page = urllib.request.urlopen(url,timeout=100).read()
|
141 |
+
# xml = str(arxiv_page,encoding="utf-8")
|
142 |
+
# while xml.find("<entry>") != -1:
|
143 |
+
# extract = xml[xml.find("<entry>")+7:xml.find("</entry>")]
|
144 |
+
# xml = xml[xml.find("</entry>")+8:]
|
145 |
+
# extract = get_record(extract)
|
146 |
+
# topic = choose_topic(extract[6])
|
147 |
+
# records.append([topic,*extract])
|
148 |
+
# return records
|
149 |
+
# except Exception as e:
|
150 |
+
# return "Error: "+str(e)
|
151 |
|
152 |
+
# class ArxivSQL:
|
153 |
+
# def __init__(self, table="arxivsql", name="db.sqlite3"):
|
154 |
+
# self.con = sqlite3.connect(name)
|
155 |
+
# self.cur = self.con.cursor()
|
156 |
+
# self.table = table
|
157 |
|
158 |
+
# def query(self, title="", author=[]):
|
159 |
+
# if len(title)>0:
|
160 |
+
# query_title = 'title like "%{}%"'.format(title)
|
161 |
+
# else:
|
162 |
+
# query_title = "True"
|
163 |
+
# if len(author)>0:
|
164 |
+
# query_author = 'authors like '
|
165 |
+
# for auth in author:
|
166 |
+
# query_author += "'%{}%' or ".format(auth)
|
167 |
+
# query_author = query_author[:-4]
|
168 |
+
# else:
|
169 |
+
# query_author = "True"
|
170 |
+
# query = "select * from {} where {} and {}".format(self.table,query_title,query_author)
|
171 |
+
# result = self.cur.execute(query)
|
172 |
+
# return result.fetchall()
|
173 |
|
174 |
+
# def query_id(self, ids=[]):
|
175 |
+
# try:
|
176 |
+
# if len(ids) == 0:
|
177 |
+
# return None
|
178 |
+
# query = "select * from {} where id in (".format(self.table)
|
179 |
+
# for id in ids:
|
180 |
+
# query+="'"+id+"',"
|
181 |
+
# query = query[:-1] + ")"
|
182 |
+
# result = self.cur.execute(query)
|
183 |
+
# return result.fetchall()
|
184 |
+
# except Exception as e:
|
185 |
+
# print(e)
|
186 |
+
# print("Error query: ",query)
|
187 |
|
188 |
+
# def add(self, crawl_records):
|
189 |
+
# """
|
190 |
+
# Add crawl_records (list) obtained from arxiv_crawlers
|
191 |
+
# A record is a list of 8 columns:
|
192 |
+
# [topic, id, updated, published, title, author, link, summary]
|
193 |
+
# Return the final length of the database table
|
194 |
+
# """
|
195 |
+
# results = ""
|
196 |
+
# for record in crawl_records:
|
197 |
+
# try:
|
198 |
+
# query = """insert into arxivsql values("{}","{}","{}","{}","{}","{}","{}")""".format(
|
199 |
+
# record[1][21:],
|
200 |
+
# record[0],
|
201 |
+
# record[4].replace('"',"'"),
|
202 |
+
# authors_list_to_str(record[5]),
|
203 |
+
# record[2][:10],
|
204 |
+
# record[3][:10],
|
205 |
+
# record[6]
|
206 |
+
# )
|
207 |
+
# self.cur.execute(query)
|
208 |
+
# self.con.commit()
|
209 |
+
# except Exception as e:
|
210 |
+
# result+=str(e)
|
211 |
+
# result+="\n" + query + "\n"
|
212 |
+
# finally:
|
213 |
+
# return results
|
214 |
|
215 |
+
# # instance of ArxivSQL
|
216 |
+
# sqldb = ArxivSQL()
|
217 |
|
218 |
+
# class ArxivChroma:
|
219 |
+
# """
|
220 |
+
# Create an interface to arxivdb, which only support query and addition.
|
221 |
+
# This interface do not support edition and deletion procedures.
|
222 |
+
# """
|
223 |
+
# def __init__(self, table="arxiv_records", name="arxivdb/"):
|
224 |
+
# self.client = chromadb.PersistentClient(name)
|
225 |
+
# self.model = embedding_model
|
226 |
+
# self.collection = self.client.get_or_create_collection(table,
|
227 |
+
# embedding_function=JinaAIEmbeddingFunction(
|
228 |
+
# model = self.model
|
229 |
+
# ))
|
230 |
|
231 |
+
# def query_relevant(self, keywords, query_texts, n_results=3):
|
232 |
+
# """
|
233 |
+
# Perform a query using a list of keywords (str),
|
234 |
+
# or using a relavant string
|
235 |
+
# """
|
236 |
+
# contains = []
|
237 |
+
# for keyword in keywords:
|
238 |
+
# contains.append({"$contains":keyword.lower()})
|
239 |
+
# return self.collection.query(
|
240 |
+
# query_texts=query_texts,
|
241 |
+
# where_document={
|
242 |
+
# "$or":contains
|
243 |
+
# },
|
244 |
+
# n_results=n_results,
|
245 |
+
# )
|
246 |
|
247 |
+
# def query_exact(self, id):
|
248 |
+
# ids = ["{}_{}".format(id,j) for j in range(0,10)]
|
249 |
+
# return self.collection.get(ids=ids)
|
250 |
|
251 |
+
# def add(self, crawl_records):
|
252 |
+
# """
|
253 |
+
# Add crawl_records (list) obtained from arxiv_crawlers
|
254 |
+
# A record is a list of 8 columns:
|
255 |
+
# [topic, id, updated, published, title, author, link, summary]
|
256 |
+
# Return the final length of the database table
|
257 |
+
# """
|
258 |
+
# for record in crawl_records:
|
259 |
+
# embed_text = """
|
260 |
+
# Topic: {},
|
261 |
+
# Title: {},
|
262 |
+
# Summary: {}
|
263 |
+
# """.format(record[0],record[4],record[7])
|
264 |
+
# chunks = chunk_texts(embed_text)
|
265 |
+
# ids = [record[1][21:]+"_"+str(j) for j in range(len(chunks))]
|
266 |
+
# paper_ids = [{"paper_id":record[1][21:]} for _ in range(len(chunks))]
|
267 |
+
# self.collection.add(
|
268 |
+
# documents = chunks,
|
269 |
+
# metadatas=paper_ids,
|
270 |
+
# ids = ids
|
271 |
+
# )
|
272 |
+
# return self.collection.count()
|
273 |
|
274 |
+
# # instance of ArxivChroma
|
275 |
+
# db = ArxivChroma()
|
276 |
|
chat/arxiv_bot/arxiv_bot_utils2.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chromadb
|
2 |
+
from chromadb import Documents, EmbeddingFunction, Embeddings
|
3 |
+
from transformers import AutoModel
|
4 |
+
import json
|
5 |
+
from numpy.linalg import norm
|
6 |
+
import sqlite3
|
7 |
+
import urllib
|
8 |
+
from django.conf import settings
|
9 |
+
import Levenshtein
|
10 |
+
|
11 |
+
# this module acts as a singleton: its model and database instances are created once at import time
|
12 |
+
|
13 |
+
class JinaAIEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to a model exposing ``encode``.

    The wrapped model is stored on ``self.model``; calling the instance
    encodes the given documents and returns the result as plain lists.
    """

    def __init__(self, model):
        """Keep a reference to ``model``, which must provide ``encode``."""
        super().__init__()
        self.model = model

    def __call__(self, input: Documents) -> Embeddings:
        """Encode ``input`` with the wrapped model and return ``.tolist()`` of the result."""
        # The parameter keeps the name ``input`` so the call signature is unchanged.
        return self.model.encode(input).tolist()
|
21 |
+
|
22 |
+
# Shared embedding model, loaded once at import time and cached under ./models.
embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en',
                                            trust_remote_code=True,
                                            cache_dir='models')

# Shared JinaAIEmbeddingFunction instance wrapping the model above.
ef = JinaAIEmbeddingFunction(embedding_model)

# Topic name -> description mapping; the file is JSON despite its .txt name.
# A context manager is used so the file handle is closed after loading.
with open("topic_descriptions.txt") as topic_file:
    topic_descriptions = json.load(topic_file)

# Topic names and their description embeddings, kept in the same order.
topics = list(topic_descriptions)
embeddings = [embedding_model.encode(description)
              for description in topic_descriptions.values()]


def cos_sim(a, b):
    """Return the cosine similarity of ``a`` and ``b``: (a @ b.T) / (|a| * |b|)."""
    return (a @ b.T) / (norm(a) * norm(b))
|
35 |
+
|
36 |
+
def lev_sim(a, b):
    """Return ``Levenshtein.distance(a, b)``.

    NOTE(review): despite the ``sim`` in the name, this returns a distance,
    so presumably a smaller value means the inputs are more alike -- confirm
    that callers treat it that way.
    """
    return Levenshtein.distance(a, b)
|
37 |
+
|
38 |
+
def choose_topic(summary):
    """Return the topic whose description embedding best matches ``summary``.

    The summary is encoded with the module-level ``embedding_model`` and
    compared against each entry of ``embeddings`` using ``cos_sim``. The
    first topic with the strictly highest similarity wins; if no similarity
    exceeds 0, an empty string is returned.
    """
    summary_vec = embedding_model.encode(summary)
    best_topic, best_score = "", 0.
    # ``topics`` and ``embeddings`` are built from the same mapping, so they pair up.
    for candidate, topic_vec in zip(topics, embeddings):
        score = cos_sim(summary_vec, topic_vec)
        if score > best_score:
            best_topic, best_score = candidate, score
    return best_topic
|
48 |
+
|
49 |
+
def authors_list_to_str(authors):
    """Join a list of author names into a single ", "-separated string.

    Args:
        authors: Iterable of author name strings.

    Returns:
        The names joined by ", "; an empty string for an empty list.
    """
    # The previous implementation appended ", " after every name and then
    # sliced off three characters, which also removed the final letter of
    # the last author's name. Joining avoids the trailing separator entirely.
    return ", ".join(authors)
|
55 |
+
|
56 |
+
def authors_str_to_list(string):
    """Split an "A and B and C" author string into a list of names.

    Args:
        string: Author names separated by the word "and".

    Returns:
        A list of stripped author names. Empty fragments and the
        placeholder "et al." are omitted.
    """
    import re  # local import: the module does not import ``re`` at file level

    authors = []
    # Split on the whole word "and" only, so names that merely contain the
    # letters (e.g. "Alexander", "Anderson") are left intact.
    for fragment in re.split(r"\band\b", string):
        name = fragment.strip()
        # Compare after stripping; otherwise " et al." would never match.
        if name and name != "et al.":
            authors.append(name)
    return authors
|
64 |
+
|
65 |
+
def chunk_texts(text, max_char=400):
    """
    Chunk a long text into several chunks of at most ``max_char`` characters,
    making sure no word is cut in half and no word is lost.

    Args:
        text: The long text to be chunked.
        max_char: The maximum number of characters per chunk (default: 400).
            A single word longer than this becomes a chunk of its own.
    Returns:
        A list of chunks; words inside a chunk are separated by one space.
        Text with no words gives an empty list.
    """
    chunks = []
    current_words = []
    current_len = 0
    for word in text.split():
        # One extra character for the joining space, except for the first word.
        added = len(word) + (1 if current_words else 0)
        if current_words and current_len + added > max_char:
            chunks.append(" ".join(current_words))
            # The overflowing word starts the next chunk instead of being dropped.
            current_words = [word]
            current_len = len(word)
        else:
            current_words.append(word)
            current_len += added
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
|
86 |
+
|
87 |
+
def trimming(txt):
    """Return the outermost ``{...}`` span of ``txt`` on a single line.

    Args:
        txt: Text that may wrap a brace-delimited block in other content.

    Returns:
        The substring from the first ``{`` to the last ``}`` with newlines
        replaced by spaces, or ``""`` when no such span exists.
    """
    start = txt.find("{")
    end = txt.rfind("}")
    # Without an opening brace, find() returns -1 and slicing from -1 could
    # return a stray closing brace; report "nothing found" instead.
    if start == -1 or end < start:
        return ""
    return txt[start:end + 1].replace("\n", " ")
|
91 |
+
|
92 |
+
# crawl data
|
93 |
+
|
94 |
+
def extract_tag(txt, tagname):
    """Return the text between the first ``<tagname>`` and its closing tag.

    Args:
        txt: Markup text to search.
        tagname: Tag name without angle brackets.

    Returns:
        The enclosed text, or ``""`` when the opening tag or a closing tag
        after it cannot be found.
    """
    open_tag = "<" + tagname + ">"
    close_tag = "</" + tagname + ">"
    start = txt.find(open_tag)
    if start == -1:
        return ""
    start += len(open_tag)
    # Search for the closing tag only after the opening one.
    end = txt.find(close_tag, start)
    if end == -1:
        return ""
    return txt[start:end]
|
96 |
+
|
97 |
+
def get_record(extract):
    """Parse one feed entry into a flat record.

    Args:
        extract: The text found between ``<entry>`` and ``</entry>``.

    Returns:
        ``[id, updated, published, title, authors, link, summary]`` where
        ``authors`` is a list of names and ``link`` is the href of the
        ``<link title="pdf" ...>`` element (``""`` if that element is absent).
    """
    record_id = extract_tag(extract, "id")
    updated = extract_tag(extract, "updated")
    published = extract_tag(extract, "published")
    # Collapse line breaks and runs of whitespace into single spaces so
    # words on consecutive lines are not glued together.
    title = " ".join(extract_tag(extract, "title").split())
    summary = " ".join(extract_tag(extract, "summary").split())

    # Walk the <author> elements one by one, consuming the text as we go.
    authors = []
    remaining = extract
    while remaining.find("<author>") != -1:
        authors.append(extract_tag(remaining, "name"))
        remaining = remaining[remaining.find("</author>") + len("</author>"):]

    # The link is read up to the closing quote of the href attribute.
    pattern = '<link title="pdf" href="'
    link = ""
    link_start = extract.find(pattern)
    if link_start != -1:
        href_start = link_start + len(pattern)
        href_end = extract.find('"', href_start)
        if href_end != -1:
            link = extract[href_start:href_end]
    return [record_id, updated, published, title, authors, link, summary]
|
112 |
+
|
113 |
+
def crawl_exact_paper(title, author, max_results=3):
    """Query the arXiv export API for papers matching a title and authors.

    Args:
        title: Title to search for (``ti:`` field).
        author: List of author names; joined into one ``au:`` clause, which
            is omitted when the list is empty.
        max_results: Maximum number of entries requested.

    Returns:
        A list of records ``[topic, id, updated, published, title, authors,
        link, summary]``, or a string starting with ``"Error: "`` if anything
        fails (callers distinguish the two by type).
    """
    from urllib.parse import quote

    records = []
    try:
        authors = authors_list_to_str(author)
        # Percent-encode the user-provided parts so characters such as
        # '&' or '#' cannot break the query string.
        search_query = "ti:" + quote(str(title), safe="")
        if authors:
            search_query += "+AND+au:" + quote(authors, safe="")
        url = 'http://export.arxiv.org/api/query?search_query={query}&max_results={max_results}'.format(
            query=search_query, max_results=max_results)
        # The context manager closes the HTTP response even on errors.
        with urllib.request.urlopen(url, timeout=100) as response:
            xml = str(response.read(), encoding="utf-8")
        while xml.find("<entry>") != -1:
            entry = xml[xml.find("<entry>") + len("<entry>"):xml.find("</entry>")]
            xml = xml[xml.find("</entry>") + len("</entry>"):]
            record = get_record(entry)
            topic = choose_topic(record[6])
            records.append([topic, *record])
        return records
    except Exception as e:
        # Best-effort contract: failures are reported as a string.
        return "Error: " + str(e)
|
130 |
+
|
131 |
+
def crawl_arxiv(keyword_list, max_results=100):
    """Query the arXiv export API for papers matching any of the keywords.

    Args:
        keyword_list: Keywords combined as ``all:<kw>+OR+all:<kw>...``.
        max_results: Maximum number of entries requested.

    Returns:
        A list of records ``[topic, id, updated, published, title, authors,
        link, summary]``; an empty list when no keywords are given; or a
        string starting with ``"Error: "`` if the request or parsing fails
        (callers distinguish the two by type).
    """
    from urllib.parse import quote

    # No keywords means there is nothing to search for; previously this
    # crashed because the URL was never assigned.
    if not keyword_list:
        return []

    baseurl = 'http://export.arxiv.org/api/query?search_query='
    records = []
    try:
        # Percent-encode each keyword so special characters cannot break
        # the query string.
        search_query = '+OR+'.join(
            'all:' + quote(str(keyword), safe="") for keyword in keyword_list
        )
        url = baseurl + search_query + '&max_results=' + str(max_results)
        # The context manager closes the HTTP response even on errors.
        with urllib.request.urlopen(url, timeout=100) as response:
            xml = str(response.read(), encoding="utf-8")
        while xml.find("<entry>") != -1:
            entry = xml[xml.find("<entry>") + len("<entry>"):xml.find("</entry>")]
            xml = xml[xml.find("</entry>") + len("</entry>"):]
            record = get_record(entry)
            topic = choose_topic(record[6])
            records.append([topic, *record])
        return records
    except Exception as e:
        # Best-effort contract: failures are reported as a string.
        return "Error: " + str(e)
|
153 |
+
|
154 |
+
# This class acts as a module
|
155 |
+
class ArxivChroma:
    """
    Create an interface to arxivdb, which only supports query and addition.
    This interface does not support edit and delete procedures.

    All state lives on the class itself; call ``connect`` once before using
    the other methods.
    """
    client = None      # chromadb persistent client, set by connect()
    model = None       # embedding model shared with the rest of the module
    collection = None  # collection holding the chunked records

    @staticmethod
    def connect(table="arxiv_records", name="arxivdb/"):
        """Open the persistent store at ``name`` and get/create ``table``."""
        ArxivChroma.client = chromadb.PersistentClient(name)
        ArxivChroma.model = embedding_model
        ArxivChroma.collection = ArxivChroma.client.get_or_create_collection(
            table,
            embedding_function=JinaAIEmbeddingFunction(model=ArxivChroma.model),
        )

    @staticmethod
    def _build_where_document(keywords):
        """Build the document filter for ``keywords`` (lower-cased).

        Returns ``None`` for no keywords, a single ``$contains`` clause for
        one keyword, and an ``$or`` of clauses for two or more, because an
        ``$or`` with fewer than two operands is rejected by the database.
        """
        contains = [{"$contains": keyword.lower()} for keyword in keywords]
        if not contains:
            return None
        if len(contains) == 1:
            return contains[0]
        return {"$or": contains}

    @staticmethod
    def query_relevant(keywords, query_texts, n_results=3):
        """
        Perform a query using a list of keywords (str),
        or using a relevant string.

        Args:
            keywords: Keywords; a document must contain at least one of them
                (when any are given).
            query_texts: Text(s) used for the similarity search.
            n_results: Number of results requested.

        Returns:
            The result of ``collection.query``.
        """
        return ArxivChroma.collection.query(
            query_texts=query_texts,
            where_document=ArxivChroma._build_where_document(keywords),
            n_results=n_results,
        )

    @staticmethod
    def query_exact(id):
        """Fetch the stored chunks ``<id>_0`` .. ``<id>_9`` of one paper."""
        ids = ["{}_{}".format(id, j) for j in range(0, 10)]
        return ArxivChroma.collection.get(ids=ids)

    @staticmethod
    def add(crawl_records):
        """
        Add crawl_records (list) obtained from arxiv_crawlers
        A record is a list of 8 columns:
        [topic, id, updated, published, title, author, link, summary]
        Return the final length of the database table
        """
        for record in crawl_records:
            # presumably drops a fixed 21-character URL prefix from the id,
            # leaving the bare paper id -- confirm against the crawler output
            paper_id = record[1][21:]
            embed_text = "Topic: {},\nTitle: {},\nSummary: {}".format(
                record[0], record[4], record[7])
            chunks = chunk_texts(embed_text)
            if not chunks:
                continue
            ids = [f"{paper_id}_{j}" for j in range(len(chunks))]
            # Every chunk carries its paper id so hits map back to the paper.
            paper_ids = [{"paper_id": paper_id} for _ in chunks]
            ArxivChroma.collection.add(
                documents=chunks,
                metadatas=paper_ids,
                ids=ids,
            )
        return ArxivChroma.collection.count()

    @staticmethod
    def close_connection():
        """No-op placeholder; nothing is closed here."""
        pass
|
222 |
+
|
223 |
+
# This class acts as a module
|
224 |
+
class ArxivSQL:
    """SQLite-backed store of paper records, used through static methods.

    All state lives on the class itself; call ``connect`` once before using
    the other methods. Rows have 7 columns, in insertion order:
    id, topic, title, authors, updated, published, link.
    """
    table = "arxivsql"
    con = None  # sqlite3 connection, set by connect()
    cur = None  # cursor on ``con``, set by connect()

    @staticmethod
    def connect(name="db.sqlite3"):
        """Open the SQLite database ``name`` and create a shared cursor."""
        ArxivSQL.con = sqlite3.connect(name, check_same_thread=False)
        ArxivSQL.cur = ArxivSQL.con.cursor()

    @staticmethod
    def query(title="", author=None, threshold=15):
        """Find rows by author substring and, optionally, by similar title.

        Args:
            title: Title to compare against; empty means no title filtering.
            author: List of author name fragments; a row matches if its
                author column contains any of them. Empty/None matches all.
            threshold: Rows are kept only if the Levenshtein distance between
                ``title`` and the row title is strictly below this value.

        Returns:
            A list of rows. With a title, rows are ordered from the closest
            title to the farthest; the list is empty when nothing matches.
        """
        if not author:
            authors = []
        elif isinstance(author, str):
            authors = [author]
        else:
            authors = list(author)

        # Values are bound as parameters so quotes in names cannot alter
        # the statement.
        sql = f"select * from {ArxivSQL.table}"
        params = []
        if authors:
            sql += " where " + " OR ".join("author LIKE ?" for _ in authors)
            params = [f"%{a}%" for a in authors]
        results = ArxivSQL.cur.execute(sql, params).fetchall()
        if not title:
            return results

        scored = []
        for row in results:
            score = lev_sim(title, row[2])  # row[2] is the title column
            if score < threshold:
                scored.append((score, row))
        # Smaller distance first; sorting on the score alone keeps ties in
        # their database order.
        scored.sort(key=lambda pair: pair[0])
        return [row for _, row in scored]

    @staticmethod
    def query_id(ids=None):
        """Return all rows whose id is in ``ids``.

        Returns:
            A list of rows, or ``None`` when ``ids`` is empty or the query
            fails (the error is printed).
        """
        if not ids:
            return None
        query = ""
        try:
            id_list = list(ids)
            placeholders = ",".join("?" for _ in id_list)
            query = "select * from {} where id in ({})".format(ArxivSQL.table, placeholders)
            result = ArxivSQL.cur.execute(query, id_list)
            return result.fetchall()
        except Exception as e:
            # Best-effort lookup: report the problem and signal "no result".
            print(e)
            print("Error query: ", query)
            return None

    @staticmethod
    def add(crawl_records):
        """
        Add crawl_records (list) obtained from arxiv_crawlers
        A record is a list of 8 columns:
        [topic, id, updated, published, title, author, link, summary]
        Return a string describing the records that failed to insert
        (empty when every record was stored)
        """
        results = ""
        query = f"insert into {ArxivSQL.table} values(?,?,?,?,?,?,?)"
        for record in crawl_records:
            try:
                values = (
                    # presumably drops a fixed 21-character URL prefix from
                    # the id -- confirm against the crawler output
                    record[1][21:],
                    record[0],
                    # Bound as a parameter, so the title is stored verbatim
                    # and quotes need no rewriting.
                    record[4],
                    authors_list_to_str(record[5]),
                    record[2][:10],  # first 10 characters of 'updated'
                    record[3][:10],  # first 10 characters of 'published'
                    record[6],
                )
                ArxivSQL.cur.execute(query, values)
                ArxivSQL.con.commit()
            except Exception as e:
                # Keep going: one bad record must not stop the others.
                results += str(e)
                results += "\n" + query + " " + repr(record) + "\n"
        return results
|
chat/arxiv_bot/prebuild.ipynb
CHANGED
@@ -1,354 +1,354 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "code",
|
5 |
-
"execution_count": 1,
|
6 |
-
"metadata": {},
|
7 |
-
"outputs": [
|
8 |
-
{
|
9 |
-
"name": "stderr",
|
10 |
-
"output_type": "stream",
|
11 |
-
"text": [
|
12 |
-
"d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
13 |
-
" from .autonotebook import tqdm as notebook_tqdm\n",
|
14 |
-
"d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
15 |
-
" warnings.warn(\n",
|
16 |
-
"d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
17 |
-
" warnings.warn(\n"
|
18 |
-
]
|
19 |
-
}
|
20 |
-
],
|
21 |
-
"source": [
|
22 |
-
"import google.generativeai as genai\n",
|
23 |
-
"import arxiv_bot_utils as utils\n",
|
24 |
-
"import os\n",
|
25 |
-
"from getpass import getpass\n",
|
26 |
-
"import json\n",
|
27 |
-
"#chỉ là import một cách bình thường\n",
|
28 |
-
"#nội dung là "
|
29 |
-
]
|
30 |
-
},
|
31 |
-
{
|
32 |
-
"cell_type": "code",
|
33 |
-
"execution_count": 2,
|
34 |
-
"metadata": {},
|
35 |
-
"outputs": [
|
36 |
-
{
|
37 |
-
"name": "stdout",
|
38 |
-
"output_type": "stream",
|
39 |
-
"text": [
|
40 |
-
"models/gemini-1.0-pro\n",
|
41 |
-
"models/gemini-1.0-pro-001\n",
|
42 |
-
"models/gemini-1.0-pro-latest\n",
|
43 |
-
"models/gemini-1.0-pro-vision-latest\n",
|
44 |
-
"models/gemini-1.5-pro-latest\n",
|
45 |
-
"models/gemini-pro\n",
|
46 |
-
"models/gemini-pro-vision\n"
|
47 |
-
]
|
48 |
-
}
|
49 |
-
],
|
50 |
-
"source": [
|
51 |
-
"os.environ['GEMINI_API_KEY'] = getpass(\"Input your API key: \")\n",
|
52 |
-
"# gán biến môi trường luôn\n",
|
53 |
-
"gemini_api_key = os.getenv(\"GEMINI_API_KEY\") # string trong môi trường\n",
|
54 |
-
"if not gemini_api_key:\n",
|
55 |
-
" raise ValueError(\n",
|
56 |
-
" \"Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable\"\n",
|
57 |
-
" )\n",
|
58 |
-
"genai.configure(api_key=gemini_api_key)\n",
|
59 |
-
"for m in genai.list_models():\n",
|
60 |
-
" if 'generateContent' in m.supported_generation_methods:\n",
|
61 |
-
" print(m.name)\n",
|
62 |
-
" #models nằm trên máy chủ\n"
|
63 |
-
]
|
64 |
-
},
|
65 |
-
{
|
66 |
-
"cell_type": "code",
|
67 |
-
"execution_count": 3,
|
68 |
-
"metadata": {},
|
69 |
-
"outputs": [],
|
70 |
-
"source": [
|
71 |
-
"config = genai.GenerationConfig(max_output_tokens=2048,\n",
|
72 |
-
" temperature=0.7)\n",
|
73 |
-
"safety_settings = [\n",
|
74 |
-
" {\n",
|
75 |
-
" \"category\": \"HARM_CATEGORY_DANGEROUS\",\n",
|
76 |
-
" \"threshold\": \"BLOCK_NONE\",\n",
|
77 |
-
" },\n",
|
78 |
-
" {\n",
|
79 |
-
" \"category\": \"HARM_CATEGORY_HARASSMENT\",\n",
|
80 |
-
" \"threshold\": \"BLOCK_NONE\",\n",
|
81 |
-
" },\n",
|
82 |
-
" {\n",
|
83 |
-
" \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n",
|
84 |
-
" \"threshold\": \"BLOCK_NONE\",\n",
|
85 |
-
" },\n",
|
86 |
-
" {\n",
|
87 |
-
" \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n",
|
88 |
-
" \"threshold\": \"BLOCK_NONE\",\n",
|
89 |
-
" },\n",
|
90 |
-
" {\n",
|
91 |
-
" \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n",
|
92 |
-
" \"threshold\": \"BLOCK_NONE\",\n",
|
93 |
-
" },\n",
|
94 |
-
"]\n",
|
95 |
-
"model = genai.GenerativeModel(\"gemini-pro\",\n",
|
96 |
-
" generation_config=config,\n",
|
97 |
-
" safety_settings=safety_settings)"
|
98 |
-
]
|
99 |
-
},
|
100 |
-
{
|
101 |
-
"cell_type": "code",
|
102 |
-
"execution_count": 4,
|
103 |
-
"metadata": {},
|
104 |
-
"outputs": [],
|
105 |
-
"source": [
|
106 |
-
"def extract_keyword_prompt(query):\n",
|
107 |
-
" \"\"\"A prompt that return a JSON block as arguments for querying database\"\"\"\n",
|
108 |
-
"\n",
|
109 |
-
" prompt = (\n",
|
110 |
-
" \"\"\"[INST] SYSTEM: You are an assistant that choose only one action below based on guest question.\n",
|
111 |
-
" 1. If the guest question is asking for a single specific document or article with explicit title, you need to respond the information in JSON format with 2 keys \"title\", \"author\" if found any above. The authors are separated with the word 'and'. \n",
|
112 |
-
" 2. If the guest question is asking for relevant informations about a topic, you need to respond the information in JSON format with 2 keys \"keywords\", \"description\", include a list of keywords represent the main academic topic, \\\n",
|
113 |
-
" and a description about the main topic. You may paraphrase the keywords to add more. \\\n",
|
114 |
-
" 3. If the guest is not asking for any informations or documents, you need to respond with a polite answer in JSON format with 1 key \"answer\".\n",
|
115 |
-
" QUESTION: '{query}'\n",
|
116 |
-
" [/INST]\n",
|
117 |
-
" ANSWER: \n",
|
118 |
-
" \"\"\"\n",
|
119 |
-
" ).format(query=query)\n",
|
120 |
-
"\n",
|
121 |
-
" return prompt\n",
|
122 |
-
"\n",
|
123 |
-
"def make_answer_prompt(input, contexts):\n",
|
124 |
-
" \"\"\"A prompt that return the final answer, based on the queried context\"\"\"\n",
|
125 |
-
"\n",
|
126 |
-
" prompt = (\n",
|
127 |
-
" \"\"\"[INST] You are a library assistant that help to search articles and documents based on user's question.\n",
|
128 |
-
" From guest's question, you have found some records and documents that may help. Now you need to answer the guest with the information found.\n",
|
129 |
-
" If no information found in the database, you may generate some other recommendation related to user's question using your own knowledge. Each article or paper must have a link to the pdf download page.\n",
|
130 |
-
" You should answer in a conversational form politely.\n",
|
131 |
-
" QUESTION: '{input}'\n",
|
132 |
-
" INFORMATION: '{contexts}'\n",
|
133 |
-
" [/INST]\n",
|
134 |
-
" ANSWER:\n",
|
135 |
-
" \"\"\"\n",
|
136 |
-
" ).format(input=input, contexts=contexts)\n",
|
137 |
-
"\n",
|
138 |
-
" return prompt"
|
139 |
-
]
|
140 |
-
},
|
141 |
-
{
|
142 |
-
"cell_type": "code",
|
143 |
-
"execution_count": 5,
|
144 |
-
"metadata": {},
|
145 |
-
"outputs": [],
|
146 |
-
"source": [
|
147 |
-
"def response(args):\n",
|
148 |
-
" \"\"\"Create response context, based on input arguments\"\"\"\n",
|
149 |
-
" keys = list(dict.keys(args))\n",
|
150 |
-
" if \"answer\" in keys:\n",
|
151 |
-
" return args['answer'], None # trả lời trực tiếp\n",
|
152 |
-
" \n",
|
153 |
-
" if \"keywords\" in keys:\n",
|
154 |
-
" # perform query\n",
|
155 |
-
" query_texts = args[\"description\"]\n",
|
156 |
-
" keywords = args[\"keywords\"]\n",
|
157 |
-
" results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)\n",
|
158 |
-
" # print(results)\n",
|
159 |
-
" ids = results['metadatas'][0]\n",
|
160 |
-
" if len(ids) == 0:\n",
|
161 |
-
" # go crawl some\n",
|
162 |
-
" new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)\n",
|
163 |
-
" print(\"Got new records: \",len(new_records))\n",
|
164 |
-
" if type(new_records) == str:\n",
|
165 |
-
" return \"Error occured, information not found\", new_records\n",
|
166 |
-
" utils.db.add(new_records)\n",
|
167 |
-
" utils.sqldb.add(new_records)\n",
|
168 |
-
" results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)\n",
|
169 |
-
" ids = results['metadatas'][0]\n",
|
170 |
-
" print(\"Re-queried on chromadb, results: \",ids)\n",
|
171 |
-
" paper_id = [id['paper_id'] for id in ids]\n",
|
172 |
-
" paper_info = utils.sqldb.query_id(paper_id)\n",
|
173 |
-
" print(paper_info)\n",
|
174 |
-
" records = [] # get title (2), author (3), link (6)\n",
|
175 |
-
" result_string = \"\"\n",
|
176 |
-
" if paper_info:\n",
|
177 |
-
" for i in range(len(paper_info)):\n",
|
178 |
-
" result_string += \"Title: {}, Author: {}, Link: {}\".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])\n",
|
179 |
-
" records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])\n",
|
180 |
-
" return result_string, records\n",
|
181 |
-
" else:\n",
|
182 |
-
" return \"Information not found\", \"Information not found\"\n",
|
183 |
-
" # invoke llm and return result\n",
|
184 |
-
"\n",
|
185 |
-
" if \"title\" in keys:\n",
|
186 |
-
" title = args['title']\n",
|
187 |
-
" authors = utils.authors_str_to_list(args['author'])\n",
|
188 |
-
" paper_info = utils.sqldb.query(title = title,author = authors)\n",
|
189 |
-
" # if query not found then go crawl brh\n",
|
190 |
-
" # print(paper_info)\n",
|
191 |
-
"\n",
|
192 |
-
" if len(paper_info) == 0:\n",
|
193 |
-
" new_records = utils.crawl_exact_paper(title=title,author=authors)\n",
|
194 |
-
" print(\"Got new records: \",len(new_records))\n",
|
195 |
-
" if type(new_records) == str:\n",
|
196 |
-
" # print(new_records)\n",
|
197 |
-
" return \"Error occured, information not found\", \"Information not found\"\n",
|
198 |
-
" utils.db.add(new_records)\n",
|
199 |
-
" utils.sqldb.add(new_records)\n",
|
200 |
-
" paper_info = utils.sqldb.query(title = title,author = authors)\n",
|
201 |
-
" print(\"Re-queried on chromadb, results: \",paper_info)\n",
|
202 |
-
" # -------------------------------------\n",
|
203 |
-
" records = [] # get title (2), author (3), link (6)\n",
|
204 |
-
" result_string = \"\"\n",
|
205 |
-
" for i in range(len(paper_info)):\n",
|
206 |
-
" result_string += \"Title: {}, Author: {}, Link: {}\".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])\n",
|
207 |
-
" records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])\n",
|
208 |
-
" # process results:\n",
|
209 |
-
" if len(result_string) == 0:\n",
|
210 |
-
" return \"Information not found\", \"Information not found\"\n",
|
211 |
-
" return result_string, records\n",
|
212 |
-
" # invoke llm and return result"
|
213 |
-
]
|
214 |
-
},
|
215 |
-
{
|
216 |
-
"cell_type": "code",
|
217 |
-
"execution_count": 6,
|
218 |
-
"metadata": {},
|
219 |
-
"outputs": [],
|
220 |
-
"source": [
|
221 |
-
"def full_chain_single_question(input_prompt):\n",
|
222 |
-
" try:\n",
|
223 |
-
" first_prompt = extract_keyword_prompt(input_prompt)\n",
|
224 |
-
" temp_answer = model.generate_content(first_prompt).text\n",
|
225 |
-
"\n",
|
226 |
-
" args = json.loads(utils.trimming(temp_answer))\n",
|
227 |
-
" contexts, results = response(args)\n",
|
228 |
-
" if not results:\n",
|
229 |
-
" print(contexts)\n",
|
230 |
-
" else:\n",
|
231 |
-
" output_prompt = make_answer_prompt(input_prompt,contexts)\n",
|
232 |
-
" answer = model.generate_content(output_prompt).text\n",
|
233 |
-
" return temp_answer, answer\n",
|
234 |
-
" except Exception as e:\n",
|
235 |
-
" print(e)\n",
|
236 |
-
" return temp_answer, \"Error occured: \" + str(e)"
|
237 |
-
]
|
238 |
-
},
|
239 |
-
{
|
240 |
-
"cell_type": "code",
|
241 |
-
"execution_count": 27,
|
242 |
-
"metadata": {},
|
243 |
-
"outputs": [
|
244 |
-
{
|
245 |
-
"name": "stdout",
|
246 |
-
"output_type": "stream",
|
247 |
-
"text": [
|
248 |
-
"[('1903.04824v1', 'computer science', 'Proceedings of the Fifth International Conference on Cloud and Robotics (ICCR2018)', ' Huaxi, Zhang, Jacques Malenfan', '2019-03-12', '2019-03-12', 'http://arxiv.org/pdf/1903.04824v1'), ('1709.07597v1', 'economics', 'Inverse Reinforcement Learning with Conditional Choice Probabilities', 'Mohit Sharma, Kris M. Kitani, Joachim Groege', '2017-09-22', '2017-09-22', 'http://arxiv.org/pdf/1709.07597v1')]\n",
|
249 |
-
"Sure, here are some key papers on model predictive control for nonlinear systems:\n",
|
250 |
-
"\n",
|
251 |
-
"* **Nonlinear Model Predictive Control: A Survey** by Garcia, P.D., Prett, D.M., and Morari, M. (1989)\n",
|
252 |
-
"* **Model Predictive Control for Nonlinear Systems** by Camacho, E.F. and Bordons, C. (1999)\n",
|
253 |
-
"* **Nonlinear Model Predictive Control** by Rawlings, J.B. and Mayne, D.Q. (2009)\n",
|
254 |
-
"\n",
|
255 |
-
"As for recent reviews on the application of control theory to robotics, here are a few:\n",
|
256 |
-
"\n",
|
257 |
-
"* **Control of Robot Manipulators** by Spong, M.W., Hutchinson, S., and Vidyasagar, M. (2006)\n",
|
258 |
-
"* **Robotics: Modelling, Planning and Control** by Siciliano, B., Sciavicco, L., Villani, L., and Oriolo, G. (2010)\n",
|
259 |
-
"* **Control of Robot Arms** by Featherstone, R. (2014)\n",
|
260 |
-
"\n",
|
261 |
-
"I hope this information is helpful. Please let me know if you have any other questions.\n"
|
262 |
-
]
|
263 |
-
}
|
264 |
-
],
|
265 |
-
"source": [
|
266 |
-
"# test response, second step\n",
|
267 |
-
"input_prompt = \"Can you suggest some key papers on model predictive control for nonlinear systems, and are there any recent reviews on the application of control theory to robotics?\"\n",
|
268 |
-
"args = \"{\\n \\\"keywords\\\": [\\\"Model predictive control\\\", \\\"Nonlinear systems\\\", \\\"Robotics\\\", \\\"Control theory\\\"],\\n \\\"description\\\": \\\"Model predictive control (MPC) is a control algorithm that uses a model of the system to predict future behavior and optimize the control inputs. MPC is particularly well-suited for nonlinear systems, as it can handle the complex dynamics of these systems. In recent years, MPC has been increasingly applied to robotics, as it can improve the performance and safety of robotic systems. Control theory is a branch of mathematics that deals with the analysis and design of control systems. Control theory has been applied to a wide range of problems in robotics, including motion planning, trajectory tracking, and force control.\\\"\\n}\"\n",
|
269 |
-
"args = json.loads(args)\n",
|
270 |
-
"contexts, results = response(args)\n",
|
271 |
-
"if not results:\n",
|
272 |
-
" # direct answer\n",
|
273 |
-
" print(contexts)\n",
|
274 |
-
"else:\n",
|
275 |
-
" output_prompt = make_answer_prompt(input_prompt,contexts)\n",
|
276 |
-
" answer = model.generate_content(output_prompt).text\n",
|
277 |
-
" print(answer)"
|
278 |
-
]
|
279 |
-
},
|
280 |
-
{
|
281 |
-
"cell_type": "code",
|
282 |
-
"execution_count": 7,
|
283 |
-
"metadata": {},
|
284 |
-
"outputs": [
|
285 |
-
{
|
286 |
-
"name": "stdout",
|
287 |
-
"output_type": "stream",
|
288 |
-
"text": [
|
289 |
-
"{'desired': 'Natural Language Processing (Computer Science)', 'question': 'What are some recent papers on deep learning architectures for text classification, and can you recommend any surveys or reviews on the topic?'}\n",
|
290 |
-
"0\n",
|
291 |
-
"[('1808.08121v1', 'computer science', 'An Improvement of Data Classification Using Random Multimodel Deep Learning (RMDL)', 'Mojtaba Heidarysafa, Kamran Kowsari, Donald E. Brown, Kiana Jafari Meimandi, Laura E. Barne', '2018-08-23', '2018-08-23', 'http://arxiv.org/pdf/1808.08121v1'), ('1904.08067v5', 'computer science', 'Text Classification Algorithms: A Survey', 'Kamran Kowsari, Kiana Jafari Meimandi, Mojtaba Heidarysafa, Sanjana Mendu, Laura E. Barnes, Donald E. Brow', '2020-05-20', '2019-04-17', 'http://arxiv.org/pdf/1904.08067v5'), ('2202.09144v1', 'computer science', 'Modelling the semantics of text in complex document layouts using graph transformer networks', 'Thomas Roland Barillot, Jacob Saks, Polena Lilyanova, Edward Torgas, Yachen Hu, Yuanqing Liu, Varun Balupuri, Paul Gaskel', '2022-02-18', '2022-02-18', 'http://arxiv.org/pdf/2202.09144v1')]\n",
|
292 |
-
"1\n",
|
293 |
-
"[('1601.04187v1', 'computer science', 'Conversion of Artificial Recurrent Neural Networks to Spiking Neural Networks for Low-power Neuromorphic Hardware', 'Peter U. Diehl, Guido Zarrella, Andrew Cassidy, Bruno U. Pedroni, Emre Neftc', '2016-01-16', '2016-01-16', 'http://arxiv.org/pdf/1601.04187v1'), ('1801.01093v3', 'economics', 'Comparing the Forecasting Performances of Linear Models for Electricity Prices with High RES Penetration', 'Angelica Gianfreda, Francesco Ravazzolo, Luca Rossin', '2019-11-12', '2018-01-03', 'http://arxiv.org/pdf/1801.01093v3'), ('2302.11093v1', 'electrical engineering and system science', 'Use Cases for Time-Frequency Image Representations and Deep Learning Techniques for Improved Signal Classification', 'Mehmet Parla', '2023-02-22', '2023-02-22', 'http://arxiv.org/pdf/2302.11093v1')]\n",
|
294 |
-
"2\n",
|
295 |
-
"[('1505.07907v4', 'economics', 'Linking Economic Complexity, Institutions and Income Inequality', 'D. Hartmann, M. R. Guevara, C. Jara-Figueroa, M. Aristaran, C. A. Hidalg', '2017-01-04', '2015-05-29', 'http://arxiv.org/pdf/1505.07907v4'), ('2107.06855v2', 'economics', 'Comparing Intellectual property policy in the Global North and South -- A one-size-fits-all policy for economic prosperity?', 'S Sidhartha Narayan, Malavika Ranjan, Madhumitha Raghurama', '2021-08-10', '2021-07-14', 'http://arxiv.org/pdf/2107.06855v2'), ('1910.11780v1', 'economics', 'Inequality in Turkey: Looking Beyond Growth', 'Bayram Cakir, Ipek Ergu', '2019-10-25', '2019-10-25', 'http://arxiv.org/pdf/1910.11780v1')]\n",
|
296 |
-
"3\n",
|
297 |
-
"[('1607.06583v2', 'computer science', \"Classification of Alzheimer's Disease Structural MRI Data by Deep Learning Convolutional Neural Networks\", 'Saman Sarraf, Ghassem Tofigh', '2017-05-19', '2016-07-22', 'http://arxiv.org/pdf/1607.06583v2'), ('2101.10265v1', 'computer science', 'Superiorities of Deep Extreme Learning Machines against Convolutional Neural Networks', 'Gokhan Altan, Yakup Kutl', '2021-01-21', '2021-01-21', 'http://arxiv.org/pdf/2101.10265v1'), ('2208.03143v1', 'computer science', 'Deep Learning and Health Informatics for Smart Monitoring and Diagnosis', 'Amin Gasm', '2022-08-05', '2022-08-05', 'http://arxiv.org/pdf/2208.03143v1')]\n",
|
298 |
-
"4\n",
|
299 |
-
"[('2302.06584v3', 'computer science', 'Thermodynamic AI and the fluctuation frontier', 'Patrick J. Coles, Collin Szczepanski, Denis Melanson, Kaelan Donatella, Antonio J. Martinez, Faris Sbah', '2023-06-13', '2023-02-09', 'http://arxiv.org/pdf/2302.06584v3'), ('2307.12298v1', 'computer science', 'Stabilization and Dissipative Information Transfer of a Superconducting Kerr-Cat Qubit', 'Ufuk Korkmaz, Deniz Türkpenç', '2023-07-23', '2023-07-23', 'http://arxiv.org/pdf/2307.12298v1'), ('2106.10421v1', 'computer science', 'QFCNN: Quantum Fourier Convolutional Neural Network', 'Feihong Shen, Jun Li', '2021-06-19', '2021-06-19', 'http://arxiv.org/pdf/2106.10421v1')]\n",
|
300 |
-
"5\n",
|
301 |
-
"[('2308.16539v2', 'computer science', 'On a Connection between Differential Games, Optimal Control, and Energy-based Models for Multi-Agent Interactions', 'Christopher Diehl, Tobias Klosek, Martin Krüger, Nils Murzyn, Torsten Bertra', '2023-10-16', '2023-08-31', 'http://arxiv.org/pdf/2308.16539v2'), ('2404.12474v1', 'computer science', 'Learning a Stable, Safe, Distributed Feedback Controller for a Heterogeneous Platoon of Vehicles', 'Michael H. Shaham, Taskin Padi', '2024-04-18', '2024-04-18', 'http://arxiv.org/pdf/2404.12474v1'), ('2008.13221v1', 'computer science', 'Human-in-the-Loop Methods for Data-Driven and Reinforcement Learning Systems', 'Vinicius G. Goeck', '2020-08-30', '2020-08-30', 'http://arxiv.org/pdf/2008.13221v1')]\n",
|
302 |
-
"6\n",
|
303 |
-
"[('1911.06206v3', 'economics', 'Bayesian state-space modeling for analyzing heterogeneous network effects of US monetary policy', 'Niko Hauzenberger, Michael Pfarrhofe', '2020-09-10', '2019-11-14', 'http://arxiv.org/pdf/1911.06206v3'), ('2302.14114v1', 'economics', 'Econometric assessment of the monetary policy shocks in Morocco: Evidence from a Bayesian Factor-Augmented VAR', 'Marouane Daou', '2023-02-27', '2023-02-27', 'http://arxiv.org/pdf/2302.14114v1'), ('2311.11858v1', 'economics', 'Theory coherent shrinkage of Time-Varying Parameters in VARs', 'Andrea Renzett', '2023-11-20', '2023-11-20', 'http://arxiv.org/pdf/2311.11858v1')]\n",
|
304 |
-
"7\n",
|
305 |
-
"[('2310.03365v2', 'computer science', 'Swin-Tempo: Temporal-Aware Lung Nodule Detection in CT Scans as Video Sequences Using Swin Transformer-Enhanced UNet', 'Hossein Jafari, Karim Faez, Hamidreza Amindava', '2023-10-14', '2023-10-05', 'http://arxiv.org/pdf/2310.03365v2'), ('1808.08531v1', 'computer science', 'DeepTracker: Visualizing the Training Process of Convolutional Neural Networks', 'Dongyu Liu, Weiwei Cui, Kai Jin, Yuxiao Guo, Huamin Q', '2018-08-26', '2018-08-26', 'http://arxiv.org/pdf/1808.08531v1'), ('2105.10448v1', 'computer science', 'Distinguishing artefacts: evaluating the saturation point of convolutional neural networks', 'Ric Real, James Gopsill, David Jones, Chris Snider, Ben Hick', '2021-05-21', '2021-05-21', 'http://arxiv.org/pdf/2105.10448v1')]\n",
|
306 |
-
"8\n",
|
307 |
-
"Got new records: 10\n",
|
308 |
-
"Re-queried on chromadb, results: []\n",
|
309 |
-
"None\n",
|
310 |
-
"9\n",
|
311 |
-
"[('2403.07017v1', 'computer science', 'Mathematics of multi-agent learning systems at the interface of game theory and artificial intelligence', 'Long Wang, Feng Fu, Xingru Che', '2024-03-09', '2024-03-09', 'http://arxiv.org/pdf/2403.07017v1'), ('2210.02205v1', 'computer science', 'Game Theoretic Rating in N-player general-sum games with Equilibria', 'Luke Marris, Marc Lanctot, Ian Gemp, Shayegan Omidshafiei, Stephen McAleer, Jerome Connor, Karl Tuyls, Thore Graepe', '2022-10-05', '2022-10-05', 'http://arxiv.org/pdf/2210.02205v1'), ('2212.05357v3', 'economics', 'On Blockchain We Cooperate: An Evolutionary Game Perspective', 'Luyao Zhang, Xinyu Tia', '2023-01-19', '2022-12-10', 'http://arxiv.org/pdf/2212.05357v3')]\n"
|
312 |
-
]
|
313 |
-
}
|
314 |
-
],
|
315 |
-
"source": [
|
316 |
-
"with open(\"test_questions.txt\",\"r\") as infile:\n",
|
317 |
-
" data = json.load(infile)\n",
|
318 |
-
"print(data[0])\n",
|
319 |
-
"\n",
|
320 |
-
"test_log = []\n",
|
321 |
-
"for i,t in enumerate(data):\n",
|
322 |
-
" print(i)\n",
|
323 |
-
" temp_answer, answer = full_chain_single_question(t['question'])\n",
|
324 |
-
" test_log.append({'desired topic':t['desired'],\n",
|
325 |
-
" 'question':t['question'],\n",
|
326 |
-
" 'first answer':temp_answer,\n",
|
327 |
-
" 'final answer':answer})\n",
|
328 |
-
"with open(\"test_results.json\",\"w\") as outfile:\n",
|
329 |
-
" json.dump(test_log,outfile)"
|
330 |
-
]
|
331 |
-
}
|
332 |
-
],
|
333 |
-
"metadata": {
|
334 |
-
"kernelspec": {
|
335 |
-
"display_name": "Python 3",
|
336 |
-
"language": "python",
|
337 |
-
"name": "python3"
|
338 |
-
},
|
339 |
-
"language_info": {
|
340 |
-
"codemirror_mode": {
|
341 |
-
"name": "ipython",
|
342 |
-
"version": 3
|
343 |
-
},
|
344 |
-
"file_extension": ".py",
|
345 |
-
"mimetype": "text/x-python",
|
346 |
-
"name": "python",
|
347 |
-
"nbconvert_exporter": "python",
|
348 |
-
"pygments_lexer": "ipython3",
|
349 |
-
"version": "3.10.12"
|
350 |
-
}
|
351 |
-
},
|
352 |
-
"nbformat": 4,
|
353 |
-
"nbformat_minor": 2
|
354 |
-
}
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
13 |
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
14 |
+
"d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
15 |
+
" warnings.warn(\n",
|
16 |
+
"d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
17 |
+
" warnings.warn(\n"
|
18 |
+
]
|
19 |
+
}
|
20 |
+
],
|
21 |
+
"source": [
|
22 |
+
"import google.generativeai as genai\n",
|
23 |
+
"import arxiv_bot_utils as utils\n",
|
24 |
+
"import os\n",
|
25 |
+
"from getpass import getpass\n",
|
26 |
+
"import json\n",
|
27 |
+
"#chỉ là import một cách bình thường\n",
|
28 |
+
"#nội dung là "
|
29 |
+
]
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"cell_type": "code",
|
33 |
+
"execution_count": 2,
|
34 |
+
"metadata": {},
|
35 |
+
"outputs": [
|
36 |
+
{
|
37 |
+
"name": "stdout",
|
38 |
+
"output_type": "stream",
|
39 |
+
"text": [
|
40 |
+
"models/gemini-1.0-pro\n",
|
41 |
+
"models/gemini-1.0-pro-001\n",
|
42 |
+
"models/gemini-1.0-pro-latest\n",
|
43 |
+
"models/gemini-1.0-pro-vision-latest\n",
|
44 |
+
"models/gemini-1.5-pro-latest\n",
|
45 |
+
"models/gemini-pro\n",
|
46 |
+
"models/gemini-pro-vision\n"
|
47 |
+
]
|
48 |
+
}
|
49 |
+
],
|
50 |
+
"source": [
|
51 |
+
"os.environ['GEMINI_API_KEY'] = getpass(\"Input your API key: \")\n",
|
52 |
+
"# gán biến môi trường luôn\n",
|
53 |
+
"gemini_api_key = os.getenv(\"GEMINI_API_KEY\") # string trong môi trường\n",
|
54 |
+
"if not gemini_api_key:\n",
|
55 |
+
" raise ValueError(\n",
|
56 |
+
" \"Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable\"\n",
|
57 |
+
" )\n",
|
58 |
+
"genai.configure(api_key=gemini_api_key)\n",
|
59 |
+
"for m in genai.list_models():\n",
|
60 |
+
" if 'generateContent' in m.supported_generation_methods:\n",
|
61 |
+
" print(m.name)\n",
|
62 |
+
" #models nằm trên máy chủ\n"
|
63 |
+
]
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"cell_type": "code",
|
67 |
+
"execution_count": 3,
|
68 |
+
"metadata": {},
|
69 |
+
"outputs": [],
|
70 |
+
"source": [
|
71 |
+
"config = genai.GenerationConfig(max_output_tokens=2048,\n",
|
72 |
+
" temperature=0.7)\n",
|
73 |
+
"safety_settings = [\n",
|
74 |
+
" {\n",
|
75 |
+
" \"category\": \"HARM_CATEGORY_DANGEROUS\",\n",
|
76 |
+
" \"threshold\": \"BLOCK_NONE\",\n",
|
77 |
+
" },\n",
|
78 |
+
" {\n",
|
79 |
+
" \"category\": \"HARM_CATEGORY_HARASSMENT\",\n",
|
80 |
+
" \"threshold\": \"BLOCK_NONE\",\n",
|
81 |
+
" },\n",
|
82 |
+
" {\n",
|
83 |
+
" \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n",
|
84 |
+
" \"threshold\": \"BLOCK_NONE\",\n",
|
85 |
+
" },\n",
|
86 |
+
" {\n",
|
87 |
+
" \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n",
|
88 |
+
" \"threshold\": \"BLOCK_NONE\",\n",
|
89 |
+
" },\n",
|
90 |
+
" {\n",
|
91 |
+
" \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n",
|
92 |
+
" \"threshold\": \"BLOCK_NONE\",\n",
|
93 |
+
" },\n",
|
94 |
+
"]\n",
|
95 |
+
"model = genai.GenerativeModel(\"gemini-pro\",\n",
|
96 |
+
" generation_config=config,\n",
|
97 |
+
" safety_settings=safety_settings)"
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "code",
|
102 |
+
"execution_count": 4,
|
103 |
+
"metadata": {},
|
104 |
+
"outputs": [],
|
105 |
+
"source": [
|
106 |
+
"def extract_keyword_prompt(query):\n",
|
107 |
+
" \"\"\"A prompt that return a JSON block as arguments for querying database\"\"\"\n",
|
108 |
+
"\n",
|
109 |
+
" prompt = (\n",
|
110 |
+
" \"\"\"[INST] SYSTEM: You are an assistant that choose only one action below based on guest question.\n",
|
111 |
+
" 1. If the guest question is asking for a single specific document or article with explicit title, you need to respond the information in JSON format with 2 keys \"title\", \"author\" if found any above. The authors are separated with the word 'and'. \n",
|
112 |
+
" 2. If the guest question is asking for relevant informations about a topic, you need to respond the information in JSON format with 2 keys \"keywords\", \"description\", include a list of keywords represent the main academic topic, \\\n",
|
113 |
+
" and a description about the main topic. You may paraphrase the keywords to add more. \\\n",
|
114 |
+
" 3. If the guest is not asking for any informations or documents, you need to respond with a polite answer in JSON format with 1 key \"answer\".\n",
|
115 |
+
" QUESTION: '{query}'\n",
|
116 |
+
" [/INST]\n",
|
117 |
+
" ANSWER: \n",
|
118 |
+
" \"\"\"\n",
|
119 |
+
" ).format(query=query)\n",
|
120 |
+
"\n",
|
121 |
+
" return prompt\n",
|
122 |
+
"\n",
|
123 |
+
"def make_answer_prompt(input, contexts):\n",
|
124 |
+
" \"\"\"A prompt that return the final answer, based on the queried context\"\"\"\n",
|
125 |
+
"\n",
|
126 |
+
" prompt = (\n",
|
127 |
+
" \"\"\"[INST] You are a library assistant that help to search articles and documents based on user's question.\n",
|
128 |
+
" From guest's question, you have found some records and documents that may help. Now you need to answer the guest with the information found.\n",
|
129 |
+
" If no information found in the database, you may generate some other recommendation related to user's question using your own knowledge. Each article or paper must have a link to the pdf download page.\n",
|
130 |
+
" You should answer in a conversational form politely.\n",
|
131 |
+
" QUESTION: '{input}'\n",
|
132 |
+
" INFORMATION: '{contexts}'\n",
|
133 |
+
" [/INST]\n",
|
134 |
+
" ANSWER:\n",
|
135 |
+
" \"\"\"\n",
|
136 |
+
" ).format(input=input, contexts=contexts)\n",
|
137 |
+
"\n",
|
138 |
+
" return prompt"
|
139 |
+
]
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"cell_type": "code",
|
143 |
+
"execution_count": 5,
|
144 |
+
"metadata": {},
|
145 |
+
"outputs": [],
|
146 |
+
"source": [
|
147 |
+
"def response(args):\n",
|
148 |
+
" \"\"\"Create response context, based on input arguments\"\"\"\n",
|
149 |
+
" keys = list(dict.keys(args))\n",
|
150 |
+
" if \"answer\" in keys:\n",
|
151 |
+
" return args['answer'], None # trả lời trực tiếp\n",
|
152 |
+
" \n",
|
153 |
+
" if \"keywords\" in keys:\n",
|
154 |
+
" # perform query\n",
|
155 |
+
" query_texts = args[\"description\"]\n",
|
156 |
+
" keywords = args[\"keywords\"]\n",
|
157 |
+
" results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)\n",
|
158 |
+
" # print(results)\n",
|
159 |
+
" ids = results['metadatas'][0]\n",
|
160 |
+
" if len(ids) == 0:\n",
|
161 |
+
" # go crawl some\n",
|
162 |
+
" new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)\n",
|
163 |
+
" print(\"Got new records: \",len(new_records))\n",
|
164 |
+
" if type(new_records) == str:\n",
|
165 |
+
" return \"Error occured, information not found\", new_records\n",
|
166 |
+
" utils.db.add(new_records)\n",
|
167 |
+
" utils.sqldb.add(new_records)\n",
|
168 |
+
" results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)\n",
|
169 |
+
" ids = results['metadatas'][0]\n",
|
170 |
+
" print(\"Re-queried on chromadb, results: \",ids)\n",
|
171 |
+
" paper_id = [id['paper_id'] for id in ids]\n",
|
172 |
+
" paper_info = utils.sqldb.query_id(paper_id)\n",
|
173 |
+
" print(paper_info)\n",
|
174 |
+
" records = [] # get title (2), author (3), link (6)\n",
|
175 |
+
" result_string = \"\"\n",
|
176 |
+
" if paper_info:\n",
|
177 |
+
" for i in range(len(paper_info)):\n",
|
178 |
+
" result_string += \"Title: {}, Author: {}, Link: {}\".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])\n",
|
179 |
+
" records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])\n",
|
180 |
+
" return result_string, records\n",
|
181 |
+
" else:\n",
|
182 |
+
" return \"Information not found\", \"Information not found\"\n",
|
183 |
+
" # invoke llm and return result\n",
|
184 |
+
"\n",
|
185 |
+
" if \"title\" in keys:\n",
|
186 |
+
" title = args['title']\n",
|
187 |
+
" authors = utils.authors_str_to_list(args['author'])\n",
|
188 |
+
" paper_info = utils.sqldb.query(title = title,author = authors)\n",
|
189 |
+
" # if query not found then go crawl brh\n",
|
190 |
+
" # print(paper_info)\n",
|
191 |
+
"\n",
|
192 |
+
" if len(paper_info) == 0:\n",
|
193 |
+
" new_records = utils.crawl_exact_paper(title=title,author=authors)\n",
|
194 |
+
" print(\"Got new records: \",len(new_records))\n",
|
195 |
+
" if type(new_records) == str:\n",
|
196 |
+
" # print(new_records)\n",
|
197 |
+
" return \"Error occured, information not found\", \"Information not found\"\n",
|
198 |
+
" utils.db.add(new_records)\n",
|
199 |
+
" utils.sqldb.add(new_records)\n",
|
200 |
+
" paper_info = utils.sqldb.query(title = title,author = authors)\n",
|
201 |
+
" print(\"Re-queried on chromadb, results: \",paper_info)\n",
|
202 |
+
" # -------------------------------------\n",
|
203 |
+
" records = [] # get title (2), author (3), link (6)\n",
|
204 |
+
" result_string = \"\"\n",
|
205 |
+
" for i in range(len(paper_info)):\n",
|
206 |
+
" result_string += \"Title: {}, Author: {}, Link: {}\".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])\n",
|
207 |
+
" records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])\n",
|
208 |
+
" # process results:\n",
|
209 |
+
" if len(result_string) == 0:\n",
|
210 |
+
" return \"Information not found\", \"Information not found\"\n",
|
211 |
+
" return result_string, records\n",
|
212 |
+
" # invoke llm and return result"
|
213 |
+
]
|
214 |
+
},
|
215 |
+
{
|
216 |
+
"cell_type": "code",
|
217 |
+
"execution_count": 6,
|
218 |
+
"metadata": {},
|
219 |
+
"outputs": [],
|
220 |
+
"source": [
|
221 |
+
"def full_chain_single_question(input_prompt):\n",
|
222 |
+
" try:\n",
|
223 |
+
" first_prompt = extract_keyword_prompt(input_prompt)\n",
|
224 |
+
" temp_answer = model.generate_content(first_prompt).text\n",
|
225 |
+
"\n",
|
226 |
+
" args = json.loads(utils.trimming(temp_answer))\n",
|
227 |
+
" contexts, results = response(args)\n",
|
228 |
+
" if not results:\n",
|
229 |
+
" print(contexts)\n",
|
230 |
+
" else:\n",
|
231 |
+
" output_prompt = make_answer_prompt(input_prompt,contexts)\n",
|
232 |
+
" answer = model.generate_content(output_prompt).text\n",
|
233 |
+
" return temp_answer, answer\n",
|
234 |
+
" except Exception as e:\n",
|
235 |
+
" print(e)\n",
|
236 |
+
" return temp_answer, \"Error occured: \" + str(e)"
|
237 |
+
]
|
238 |
+
},
|
239 |
+
{
|
240 |
+
"cell_type": "code",
|
241 |
+
"execution_count": 27,
|
242 |
+
"metadata": {},
|
243 |
+
"outputs": [
|
244 |
+
{
|
245 |
+
"name": "stdout",
|
246 |
+
"output_type": "stream",
|
247 |
+
"text": [
|
248 |
+
"[('1903.04824v1', 'computer science', 'Proceedings of the Fifth International Conference on Cloud and Robotics (ICCR2018)', ' Huaxi, Zhang, Jacques Malenfan', '2019-03-12', '2019-03-12', 'http://arxiv.org/pdf/1903.04824v1'), ('1709.07597v1', 'economics', 'Inverse Reinforcement Learning with Conditional Choice Probabilities', 'Mohit Sharma, Kris M. Kitani, Joachim Groege', '2017-09-22', '2017-09-22', 'http://arxiv.org/pdf/1709.07597v1')]\n",
|
249 |
+
"Sure, here are some key papers on model predictive control for nonlinear systems:\n",
|
250 |
+
"\n",
|
251 |
+
"* **Nonlinear Model Predictive Control: A Survey** by Garcia, P.D., Prett, D.M., and Morari, M. (1989)\n",
|
252 |
+
"* **Model Predictive Control for Nonlinear Systems** by Camacho, E.F. and Bordons, C. (1999)\n",
|
253 |
+
"* **Nonlinear Model Predictive Control** by Rawlings, J.B. and Mayne, D.Q. (2009)\n",
|
254 |
+
"\n",
|
255 |
+
"As for recent reviews on the application of control theory to robotics, here are a few:\n",
|
256 |
+
"\n",
|
257 |
+
"* **Control of Robot Manipulators** by Spong, M.W., Hutchinson, S., and Vidyasagar, M. (2006)\n",
|
258 |
+
"* **Robotics: Modelling, Planning and Control** by Siciliano, B., Sciavicco, L., Villani, L., and Oriolo, G. (2010)\n",
|
259 |
+
"* **Control of Robot Arms** by Featherstone, R. (2014)\n",
|
260 |
+
"\n",
|
261 |
+
"I hope this information is helpful. Please let me know if you have any other questions.\n"
|
262 |
+
]
|
263 |
+
}
|
264 |
+
],
|
265 |
+
"source": [
|
266 |
+
"# test response, second step\n",
|
267 |
+
"input_prompt = \"Can you suggest some key papers on model predictive control for nonlinear systems, and are there any recent reviews on the application of control theory to robotics?\"\n",
|
268 |
+
"args = \"{\\n \\\"keywords\\\": [\\\"Model predictive control\\\", \\\"Nonlinear systems\\\", \\\"Robotics\\\", \\\"Control theory\\\"],\\n \\\"description\\\": \\\"Model predictive control (MPC) is a control algorithm that uses a model of the system to predict future behavior and optimize the control inputs. MPC is particularly well-suited for nonlinear systems, as it can handle the complex dynamics of these systems. In recent years, MPC has been increasingly applied to robotics, as it can improve the performance and safety of robotic systems. Control theory is a branch of mathematics that deals with the analysis and design of control systems. Control theory has been applied to a wide range of problems in robotics, including motion planning, trajectory tracking, and force control.\\\"\\n}\"\n",
|
269 |
+
"args = json.loads(args)\n",
|
270 |
+
"contexts, results = response(args)\n",
|
271 |
+
"if not results:\n",
|
272 |
+
" # direct answer\n",
|
273 |
+
" print(contexts)\n",
|
274 |
+
"else:\n",
|
275 |
+
" output_prompt = make_answer_prompt(input_prompt,contexts)\n",
|
276 |
+
" answer = model.generate_content(output_prompt).text\n",
|
277 |
+
" print(answer)"
|
278 |
+
]
|
279 |
+
},
|
280 |
+
{
|
281 |
+
"cell_type": "code",
|
282 |
+
"execution_count": 7,
|
283 |
+
"metadata": {},
|
284 |
+
"outputs": [
|
285 |
+
{
|
286 |
+
"name": "stdout",
|
287 |
+
"output_type": "stream",
|
288 |
+
"text": [
|
289 |
+
"{'desired': 'Natural Language Processing (Computer Science)', 'question': 'What are some recent papers on deep learning architectures for text classification, and can you recommend any surveys or reviews on the topic?'}\n",
|
290 |
+
"0\n",
|
291 |
+
"[('1808.08121v1', 'computer science', 'An Improvement of Data Classification Using Random Multimodel Deep Learning (RMDL)', 'Mojtaba Heidarysafa, Kamran Kowsari, Donald E. Brown, Kiana Jafari Meimandi, Laura E. Barne', '2018-08-23', '2018-08-23', 'http://arxiv.org/pdf/1808.08121v1'), ('1904.08067v5', 'computer science', 'Text Classification Algorithms: A Survey', 'Kamran Kowsari, Kiana Jafari Meimandi, Mojtaba Heidarysafa, Sanjana Mendu, Laura E. Barnes, Donald E. Brow', '2020-05-20', '2019-04-17', 'http://arxiv.org/pdf/1904.08067v5'), ('2202.09144v1', 'computer science', 'Modelling the semantics of text in complex document layouts using graph transformer networks', 'Thomas Roland Barillot, Jacob Saks, Polena Lilyanova, Edward Torgas, Yachen Hu, Yuanqing Liu, Varun Balupuri, Paul Gaskel', '2022-02-18', '2022-02-18', 'http://arxiv.org/pdf/2202.09144v1')]\n",
|
292 |
+
"1\n",
|
293 |
+
"[('1601.04187v1', 'computer science', 'Conversion of Artificial Recurrent Neural Networks to Spiking Neural Networks for Low-power Neuromorphic Hardware', 'Peter U. Diehl, Guido Zarrella, Andrew Cassidy, Bruno U. Pedroni, Emre Neftc', '2016-01-16', '2016-01-16', 'http://arxiv.org/pdf/1601.04187v1'), ('1801.01093v3', 'economics', 'Comparing the Forecasting Performances of Linear Models for Electricity Prices with High RES Penetration', 'Angelica Gianfreda, Francesco Ravazzolo, Luca Rossin', '2019-11-12', '2018-01-03', 'http://arxiv.org/pdf/1801.01093v3'), ('2302.11093v1', 'electrical engineering and system science', 'Use Cases for Time-Frequency Image Representations and Deep Learning Techniques for Improved Signal Classification', 'Mehmet Parla', '2023-02-22', '2023-02-22', 'http://arxiv.org/pdf/2302.11093v1')]\n",
|
294 |
+
"2\n",
|
295 |
+
"[('1505.07907v4', 'economics', 'Linking Economic Complexity, Institutions and Income Inequality', 'D. Hartmann, M. R. Guevara, C. Jara-Figueroa, M. Aristaran, C. A. Hidalg', '2017-01-04', '2015-05-29', 'http://arxiv.org/pdf/1505.07907v4'), ('2107.06855v2', 'economics', 'Comparing Intellectual property policy in the Global North and South -- A one-size-fits-all policy for economic prosperity?', 'S Sidhartha Narayan, Malavika Ranjan, Madhumitha Raghurama', '2021-08-10', '2021-07-14', 'http://arxiv.org/pdf/2107.06855v2'), ('1910.11780v1', 'economics', 'Inequality in Turkey: Looking Beyond Growth', 'Bayram Cakir, Ipek Ergu', '2019-10-25', '2019-10-25', 'http://arxiv.org/pdf/1910.11780v1')]\n",
|
296 |
+
"3\n",
|
297 |
+
"[('1607.06583v2', 'computer science', \"Classification of Alzheimer's Disease Structural MRI Data by Deep Learning Convolutional Neural Networks\", 'Saman Sarraf, Ghassem Tofigh', '2017-05-19', '2016-07-22', 'http://arxiv.org/pdf/1607.06583v2'), ('2101.10265v1', 'computer science', 'Superiorities of Deep Extreme Learning Machines against Convolutional Neural Networks', 'Gokhan Altan, Yakup Kutl', '2021-01-21', '2021-01-21', 'http://arxiv.org/pdf/2101.10265v1'), ('2208.03143v1', 'computer science', 'Deep Learning and Health Informatics for Smart Monitoring and Diagnosis', 'Amin Gasm', '2022-08-05', '2022-08-05', 'http://arxiv.org/pdf/2208.03143v1')]\n",
|
298 |
+
"4\n",
|
299 |
+
"[('2302.06584v3', 'computer science', 'Thermodynamic AI and the fluctuation frontier', 'Patrick J. Coles, Collin Szczepanski, Denis Melanson, Kaelan Donatella, Antonio J. Martinez, Faris Sbah', '2023-06-13', '2023-02-09', 'http://arxiv.org/pdf/2302.06584v3'), ('2307.12298v1', 'computer science', 'Stabilization and Dissipative Information Transfer of a Superconducting Kerr-Cat Qubit', 'Ufuk Korkmaz, Deniz Türkpenç', '2023-07-23', '2023-07-23', 'http://arxiv.org/pdf/2307.12298v1'), ('2106.10421v1', 'computer science', 'QFCNN: Quantum Fourier Convolutional Neural Network', 'Feihong Shen, Jun Li', '2021-06-19', '2021-06-19', 'http://arxiv.org/pdf/2106.10421v1')]\n",
|
300 |
+
"5\n",
|
301 |
+
"[('2308.16539v2', 'computer science', 'On a Connection between Differential Games, Optimal Control, and Energy-based Models for Multi-Agent Interactions', 'Christopher Diehl, Tobias Klosek, Martin Krüger, Nils Murzyn, Torsten Bertra', '2023-10-16', '2023-08-31', 'http://arxiv.org/pdf/2308.16539v2'), ('2404.12474v1', 'computer science', 'Learning a Stable, Safe, Distributed Feedback Controller for a Heterogeneous Platoon of Vehicles', 'Michael H. Shaham, Taskin Padi', '2024-04-18', '2024-04-18', 'http://arxiv.org/pdf/2404.12474v1'), ('2008.13221v1', 'computer science', 'Human-in-the-Loop Methods for Data-Driven and Reinforcement Learning Systems', 'Vinicius G. Goeck', '2020-08-30', '2020-08-30', 'http://arxiv.org/pdf/2008.13221v1')]\n",
|
302 |
+
"6\n",
|
303 |
+
"[('1911.06206v3', 'economics', 'Bayesian state-space modeling for analyzing heterogeneous network effects of US monetary policy', 'Niko Hauzenberger, Michael Pfarrhofe', '2020-09-10', '2019-11-14', 'http://arxiv.org/pdf/1911.06206v3'), ('2302.14114v1', 'economics', 'Econometric assessment of the monetary policy shocks in Morocco: Evidence from a Bayesian Factor-Augmented VAR', 'Marouane Daou', '2023-02-27', '2023-02-27', 'http://arxiv.org/pdf/2302.14114v1'), ('2311.11858v1', 'economics', 'Theory coherent shrinkage of Time-Varying Parameters in VARs', 'Andrea Renzett', '2023-11-20', '2023-11-20', 'http://arxiv.org/pdf/2311.11858v1')]\n",
|
304 |
+
"7\n",
|
305 |
+
"[('2310.03365v2', 'computer science', 'Swin-Tempo: Temporal-Aware Lung Nodule Detection in CT Scans as Video Sequences Using Swin Transformer-Enhanced UNet', 'Hossein Jafari, Karim Faez, Hamidreza Amindava', '2023-10-14', '2023-10-05', 'http://arxiv.org/pdf/2310.03365v2'), ('1808.08531v1', 'computer science', 'DeepTracker: Visualizing the Training Process of Convolutional Neural Networks', 'Dongyu Liu, Weiwei Cui, Kai Jin, Yuxiao Guo, Huamin Q', '2018-08-26', '2018-08-26', 'http://arxiv.org/pdf/1808.08531v1'), ('2105.10448v1', 'computer science', 'Distinguishing artefacts: evaluating the saturation point of convolutional neural networks', 'Ric Real, James Gopsill, David Jones, Chris Snider, Ben Hick', '2021-05-21', '2021-05-21', 'http://arxiv.org/pdf/2105.10448v1')]\n",
|
306 |
+
"8\n",
|
307 |
+
"Got new records: 10\n",
|
308 |
+
"Re-queried on chromadb, results: []\n",
|
309 |
+
"None\n",
|
310 |
+
"9\n",
|
311 |
+
"[('2403.07017v1', 'computer science', 'Mathematics of multi-agent learning systems at the interface of game theory and artificial intelligence', 'Long Wang, Feng Fu, Xingru Che', '2024-03-09', '2024-03-09', 'http://arxiv.org/pdf/2403.07017v1'), ('2210.02205v1', 'computer science', 'Game Theoretic Rating in N-player general-sum games with Equilibria', 'Luke Marris, Marc Lanctot, Ian Gemp, Shayegan Omidshafiei, Stephen McAleer, Jerome Connor, Karl Tuyls, Thore Graepe', '2022-10-05', '2022-10-05', 'http://arxiv.org/pdf/2210.02205v1'), ('2212.05357v3', 'economics', 'On Blockchain We Cooperate: An Evolutionary Game Perspective', 'Luyao Zhang, Xinyu Tia', '2023-01-19', '2022-12-10', 'http://arxiv.org/pdf/2212.05357v3')]\n"
|
312 |
+
]
|
313 |
+
}
|
314 |
+
],
|
315 |
+
"source": [
|
316 |
+
"with open(\"test_questions.txt\",\"r\") as infile:\n",
|
317 |
+
" data = json.load(infile)\n",
|
318 |
+
"print(data[0])\n",
|
319 |
+
"\n",
|
320 |
+
"test_log = []\n",
|
321 |
+
"for i,t in enumerate(data):\n",
|
322 |
+
" print(i)\n",
|
323 |
+
" temp_answer, answer = full_chain_single_question(t['question'])\n",
|
324 |
+
" test_log.append({'desired topic':t['desired'],\n",
|
325 |
+
" 'question':t['question'],\n",
|
326 |
+
" 'first answer':temp_answer,\n",
|
327 |
+
" 'final answer':answer})\n",
|
328 |
+
"with open(\"test_results.json\",\"w\") as outfile:\n",
|
329 |
+
" json.dump(test_log,outfile)"
|
330 |
+
]
|
331 |
+
}
|
332 |
+
],
|
333 |
+
"metadata": {
|
334 |
+
"kernelspec": {
|
335 |
+
"display_name": "Python 3",
|
336 |
+
"language": "python",
|
337 |
+
"name": "python3"
|
338 |
+
},
|
339 |
+
"language_info": {
|
340 |
+
"codemirror_mode": {
|
341 |
+
"name": "ipython",
|
342 |
+
"version": 3
|
343 |
+
},
|
344 |
+
"file_extension": ".py",
|
345 |
+
"mimetype": "text/x-python",
|
346 |
+
"name": "python",
|
347 |
+
"nbconvert_exporter": "python",
|
348 |
+
"pygments_lexer": "ipython3",
|
349 |
+
"version": "3.10.12"
|
350 |
+
}
|
351 |
+
},
|
352 |
+
"nbformat": 4,
|
353 |
+
"nbformat_minor": 2
|
354 |
+
}
|
chat/consumers.py
CHANGED
@@ -1,21 +1,25 @@
|
|
1 |
import json
|
2 |
-
from . import
|
3 |
-
from chat.arxiv_bot.arxiv_bot_utils import ArxivSQL
|
4 |
from channels.generic.websocket import WebsocketConsumer
|
5 |
|
6 |
|
7 |
class ChatConsumer(WebsocketConsumer):
|
8 |
def connect(self):
|
9 |
self.accept()
|
10 |
-
self.
|
11 |
|
12 |
def disconnect(self, close_code):
|
|
|
13 |
pass
|
|
|
14 |
def receive(self, text_data):
|
15 |
text_data_json = json.loads(text_data)
|
16 |
message = text_data_json["messages"]
|
17 |
print(message)
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
21 |
|
|
|
1 |
import json
|
2 |
+
from . import model_manage2 as md
|
|
|
3 |
from channels.generic.websocket import WebsocketConsumer
|
4 |
|
5 |
|
6 |
class ChatConsumer(WebsocketConsumer):
|
7 |
def connect(self):
|
8 |
self.accept()
|
9 |
+
self.model, self.session = md.init_model("auto")
|
10 |
|
11 |
def disconnect(self, close_code):
|
12 |
+
del self.model, self.session
|
13 |
pass
|
14 |
+
|
15 |
def receive(self, text_data):
|
16 |
text_data_json = json.loads(text_data)
|
17 |
message = text_data_json["messages"]
|
18 |
print(message)
|
19 |
+
question = message[-1]['content']
|
20 |
+
response, history_state = md.full_chain_history_question(question, self.session, mode="auto")
|
21 |
+
# print("First answer: ",response)
|
22 |
+
print("Session history:")
|
23 |
+
md.print_history(history_state)
|
24 |
+
self.send(text_data=json.dumps({"message": response}))
|
25 |
|
chat/migrations/__pycache__/0001_initial.cpython-311.pyc
ADDED
Binary file (1.01 kB). View file
|
|
chat/migrations/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (171 Bytes). View file
|
|
chat/model_manage.py
CHANGED
@@ -1,271 +1,271 @@
|
|
1 |
-
# my_app/model_manager.py
|
2 |
-
import google.generativeai as genai
|
3 |
-
import chat.arxiv_bot.arxiv_bot_utils as utils
|
4 |
-
import json
|
5 |
|
6 |
-
model = None
|
7 |
|
8 |
-
model_retrieval = None
|
9 |
|
10 |
-
model_answer = None
|
11 |
|
12 |
-
RETRIEVAL_INSTRUCT = """You are an auto chatbot that response with only one action below based on user question.
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
|
23 |
-
ANSWER_INSTRUCT = """You are a library assistant that help answering customer question based on the information given.
|
24 |
-
|
25 |
-
|
26 |
|
27 |
-
def create_model():
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
|
73 |
-
def get_model():
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
|
80 |
-
def extract_keyword_prompt(query):
|
81 |
-
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
|
97 |
-
def make_answer_prompt(input, contexts):
|
98 |
-
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
|
112 |
-
def retrieval_chat_template(question):
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
|
118 |
-
def answer_chat_template(question, contexts):
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
|
124 |
-
def response(args, db_instance):
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
# if "title" in keys:
|
169 |
-
# title = args['title']
|
170 |
-
# authors = utils.authors_str_to_list(args['author'])
|
171 |
-
# paper_info = db_instance.query(title = title,author = authors)
|
172 |
-
# # if query not found then go crawl brh
|
173 |
-
# # print(paper_info)
|
174 |
-
|
175 |
-
# if len(paper_info) == 0:
|
176 |
-
# new_records = utils.crawl_exact_paper(title=title,author=authors)
|
177 |
-
# print("Got new records: ",len(new_records))
|
178 |
-
# if type(new_records) == str:
|
179 |
-
# # print(new_records)
|
180 |
-
# return "Error occured, information not found", "Information not found"
|
181 |
-
# utils.db.add(new_records)
|
182 |
-
# db_instance.add(new_records)
|
183 |
-
# paper_info = db_instance.query(title = title,author = authors)
|
184 |
-
# print("Re-queried on chromadb, results: ",paper_info)
|
185 |
-
# # -------------------------------------
|
186 |
-
# records = [] # get title (2), author (3), link (6)
|
187 |
-
# result_string = ""
|
188 |
-
# for i in range(len(paper_info)):
|
189 |
-
# result_string += "Title: {}, Author: {}, Link: {}".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])
|
190 |
-
# records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])
|
191 |
-
# # process results:
|
192 |
-
# if len(result_string) == 0:
|
193 |
-
# return "Information not found", "Information not found"
|
194 |
-
# return result_string, records
|
195 |
-
# invoke llm and return result
|
196 |
-
|
197 |
-
def full_chain_single_question(input_prompt, db_instance):
|
198 |
-
try:
|
199 |
-
first_prompt = extract_keyword_prompt(input_prompt)
|
200 |
-
temp_answer = model.generate_content(first_prompt).text
|
201 |
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
output_prompt = make_answer_prompt(input_prompt,contexts)
|
209 |
-
answer = model.generate_content(output_prompt).text
|
210 |
-
return temp_answer, answer
|
211 |
-
except Exception as e:
|
212 |
-
# print(e)
|
213 |
-
return temp_answer, "Error occured: " + str(e)
|
214 |
-
|
215 |
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
|
227 |
-
# def
|
228 |
# try:
|
229 |
-
#
|
230 |
-
# print('Extracted temp chat: ',temp_chat)
|
231 |
-
# first_prompt = extract_keyword_prompt(temp_chat[-1]["parts"][0])
|
232 |
# temp_answer = model.generate_content(first_prompt).text
|
233 |
|
234 |
# args = json.loads(utils.trimming(temp_answer))
|
235 |
# contexts, results = response(args, db_instance)
|
236 |
-
# print('Context extracted: ',contexts)
|
237 |
# if not results:
|
|
|
238 |
# return "Random question, direct return", contexts
|
239 |
# else:
|
240 |
-
#
|
241 |
-
#
|
242 |
-
# print(temp_chat)
|
243 |
-
# answer = model.generate_content(temp_chat).text
|
244 |
# return temp_answer, answer
|
245 |
# except Exception as e:
|
246 |
# # print(e)
|
247 |
# return temp_answer, "Error occured: " + str(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
|
255 |
-
|
256 |
-
|
257 |
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
|
|
1 |
+
# # my_app/model_manager.py
|
2 |
+
# import google.generativeai as genai
|
3 |
+
# import chat.arxiv_bot.arxiv_bot_utils as utils
|
4 |
+
# import json
|
5 |
|
6 |
+
# model = None
|
7 |
|
8 |
+
# model_retrieval = None
|
9 |
|
10 |
+
# model_answer = None
|
11 |
|
12 |
+
# RETRIEVAL_INSTRUCT = """You are an auto chatbot that response with only one action below based on user question.
|
13 |
+
# 1. If the guest question is asking about a science topic, you need to respond the information in JSON schema below:
|
14 |
+
# {
|
15 |
+
# "keywords": [a list of string keywords about the topic],
|
16 |
+
# "description": "a paragraph describing the topic in about 50 to 100 words"
|
17 |
+
# }
|
18 |
+
# 2. If the guest is not asking for any informations or documents, you need to respond in JSON schema below:
|
19 |
+
# {
|
20 |
+
# "answer": "your answer to the user question"
|
21 |
+
# }"""
|
22 |
|
23 |
+
# ANSWER_INSTRUCT = """You are a library assistant that help answering customer question based on the information given.
|
24 |
+
# You always answer in a conversational form naturally and politely.
|
25 |
+
# You must introduce all the records given, each must contain title, authors and the link to the pdf file."""
|
26 |
|
27 |
+
# def create_model():
|
28 |
+
# with open("apikey.txt","r") as apikey:
|
29 |
+
# key = apikey.readline()
|
30 |
+
# genai.configure(api_key=key)
|
31 |
+
# for m in genai.list_models():
|
32 |
+
# if 'generateContent' in m.supported_generation_methods:
|
33 |
+
# print(m.name)
|
34 |
+
# print("He was there")
|
35 |
+
# config = genai.GenerationConfig(max_output_tokens=2048,
|
36 |
+
# temperature=1.0)
|
37 |
+
# safety_settings = [
|
38 |
+
# {
|
39 |
+
# "category": "HARM_CATEGORY_DANGEROUS",
|
40 |
+
# "threshold": "BLOCK_NONE",
|
41 |
+
# },
|
42 |
+
# {
|
43 |
+
# "category": "HARM_CATEGORY_HARASSMENT",
|
44 |
+
# "threshold": "BLOCK_NONE",
|
45 |
+
# },
|
46 |
+
# {
|
47 |
+
# "category": "HARM_CATEGORY_HATE_SPEECH",
|
48 |
+
# "threshold": "BLOCK_NONE",
|
49 |
+
# },
|
50 |
+
# {
|
51 |
+
# "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
|
52 |
+
# "threshold": "BLOCK_NONE",
|
53 |
+
# },
|
54 |
+
# {
|
55 |
+
# "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
|
56 |
+
# "threshold": "BLOCK_NONE",
|
57 |
+
# },
|
58 |
+
# ]
|
59 |
+
# global model, model_retrieval, model_answer
|
60 |
+
# model = genai.GenerativeModel("gemini-1.5-pro-latest",
|
61 |
+
# generation_config=config,
|
62 |
+
# safety_settings=safety_settings)
|
63 |
+
# model_retrieval = genai.GenerativeModel("gemini-1.5-pro-latest",
|
64 |
+
# generation_config=config,
|
65 |
+
# safety_settings=safety_settings,
|
66 |
+
# system_instruction=RETRIEVAL_INSTRUCT)
|
67 |
+
# model_answer = genai.GenerativeModel("gemini-1.5-pro-latest",
|
68 |
+
# generation_config=config,
|
69 |
+
# safety_settings=safety_settings,
|
70 |
+
# system_instruction=ANSWER_INSTRUCT)
|
71 |
+
# return model, model_answer, model_retrieval
|
72 |
|
73 |
+
# def get_model():
|
74 |
+
# global model, model_answer, model_retrieval
|
75 |
+
# if model is None:
|
76 |
+
# # Khởi tạo model ở đây
|
77 |
+
# model, model_answer, model_retrieval = create_model() # Giả sử create_model là hàm tạo model của bạn
|
78 |
+
# return model, model_answer, model_retrieval
|
79 |
|
80 |
+
# def extract_keyword_prompt(query):
|
81 |
+
# """A prompt that return a JSON block as arguments for querying database"""
|
82 |
|
83 |
+
# prompt = """[INST] SYSTEM: You are an auto chatbot that response with only one action below based on user question.
|
84 |
+
# 1. If the guest question is asking about a science topic, you need to respond the information in JSON schema below:
|
85 |
+
# {
|
86 |
+
# "keywords": [a list of string keywords about the topic],
|
87 |
+
# "description": "a paragraph describing the topic in about 50 to 100 words"
|
88 |
+
# }
|
89 |
+
# 2. If the guest is not asking for any informations or documents, you need to respond in JSON schema below:
|
90 |
+
# {
|
91 |
+
# "answer": "your answer to the user question"
|
92 |
+
# }
|
93 |
+
# QUESTION: """ + query + """[/INST]
|
94 |
+
# ANSWER: """
|
95 |
+
# return prompt
|
96 |
|
97 |
+
# def make_answer_prompt(input, contexts):
|
98 |
+
# """A prompt that return the final answer, based on the queried context"""
|
99 |
|
100 |
+
# prompt = (
|
101 |
+
# """[INST] You are a library assistant that help answering customer QUESTION based on the INFORMATION given.
|
102 |
+
# You always answer in a conversational form naturally and politely.
|
103 |
+
# You must introduce all the records given, each must contain title, authors and the link to the pdf file.
|
104 |
+
# QUESTION: {input}
|
105 |
+
# INFORMATION: '{contexts}'
|
106 |
+
# [/INST]
|
107 |
+
# ANSWER:
|
108 |
+
# """
|
109 |
+
# ).format(input=input, contexts=contexts)
|
110 |
+
# return prompt
|
111 |
|
112 |
+
# def retrieval_chat_template(question):
|
113 |
+
# return {
|
114 |
+
# "role":"user",
|
115 |
+
# "parts":[f"QUESTION: {question} \n ANSWER:"]
|
116 |
+
# }
|
117 |
|
118 |
+
# def answer_chat_template(question, contexts):
|
119 |
+
# return {
|
120 |
+
# "role":"user",
|
121 |
+
# "parts":[f"QUESTION: {question} \n INFORMATION: {contexts} \n ANSWER:"]
|
122 |
+
# }
|
123 |
|
124 |
+
# def response(args, db_instance):
|
125 |
+
# """Create response context, based on input arguments"""
|
126 |
+
# keys = list(dict.keys(args))
|
127 |
+
# if "answer" in keys:
|
128 |
+
# return args['answer'], None # trả lời trực tiếp
|
129 |
|
130 |
+
# if "keywords" in keys:
|
131 |
+
# # perform query
|
132 |
+
# query_texts = args["description"]
|
133 |
+
# keywords = args["keywords"]
|
134 |
+
# results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)
|
135 |
+
# # print(results)
|
136 |
+
# ids = results['metadatas'][0]
|
137 |
+
# if len(ids) == 0:
|
138 |
+
# # go crawl some
|
139 |
+
# new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)
|
140 |
+
# print("Got new records: ",len(new_records))
|
141 |
+
# if type(new_records) == str:
|
142 |
+
# return "Error occured, information not found", new_records
|
143 |
+
# utils.db.add(new_records)
|
144 |
+
# db_instance.add(new_records)
|
145 |
+
# results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)
|
146 |
+
# ids = results['metadatas'][0]
|
147 |
+
# print("Re-queried on chromadb, results: ",ids)
|
148 |
+
# paper_id = [id['paper_id'] for id in ids]
|
149 |
+
# paper_info = db_instance.query_id(paper_id)
|
150 |
+
# print(paper_info)
|
151 |
+
# records = [] # get title (2), author (3), link (6)
|
152 |
+
# result_string = ""
|
153 |
+
# if paper_info:
|
154 |
+
# for i in range(len(paper_info)):
|
155 |
+
# result_string += "Record no.{} - Title: {}, Author: {}, Link: {}, ".format(i+1,paper_info[i][2],paper_info[i][3],paper_info[i][6])
|
156 |
+
# id = paper_info[i][0]
|
157 |
+
# selected_document = utils.db.query_exact(id)["documents"]
|
158 |
+
# doc_str = "Summary:"
|
159 |
+
# for doc in selected_document:
|
160 |
+
# doc_str+= doc + " "
|
161 |
+
# result_string += doc_str
|
162 |
+
# records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])
|
163 |
+
# return result_string, records
|
164 |
+
# else:
|
165 |
+
# return "Information not found", "Information not found"
|
166 |
+
# # invoke llm and return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
|
168 |
+
# # if "title" in keys:
|
169 |
+
# # title = args['title']
|
170 |
+
# # authors = utils.authors_str_to_list(args['author'])
|
171 |
+
# # paper_info = db_instance.query(title = title,author = authors)
|
172 |
+
# # # if query not found then go crawl brh
|
173 |
+
# # # print(paper_info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
|
175 |
+
# # if len(paper_info) == 0:
|
176 |
+
# # new_records = utils.crawl_exact_paper(title=title,author=authors)
|
177 |
+
# # print("Got new records: ",len(new_records))
|
178 |
+
# # if type(new_records) == str:
|
179 |
+
# # # print(new_records)
|
180 |
+
# # return "Error occured, information not found", "Information not found"
|
181 |
+
# # utils.db.add(new_records)
|
182 |
+
# # db_instance.add(new_records)
|
183 |
+
# # paper_info = db_instance.query(title = title,author = authors)
|
184 |
+
# # print("Re-queried on chromadb, results: ",paper_info)
|
185 |
+
# # # -------------------------------------
|
186 |
+
# # records = [] # get title (2), author (3), link (6)
|
187 |
+
# # result_string = ""
|
188 |
+
# # for i in range(len(paper_info)):
|
189 |
+
# # result_string += "Title: {}, Author: {}, Link: {}".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])
|
190 |
+
# # records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])
|
191 |
+
# # # process results:
|
192 |
+
# # if len(result_string) == 0:
|
193 |
+
# # return "Information not found", "Information not found"
|
194 |
+
# # return result_string, records
|
195 |
+
# # invoke llm and return result
|
196 |
|
197 |
+
# def full_chain_single_question(input_prompt, db_instance):
|
198 |
# try:
|
199 |
+
# first_prompt = extract_keyword_prompt(input_prompt)
|
|
|
|
|
200 |
# temp_answer = model.generate_content(first_prompt).text
|
201 |
|
202 |
# args = json.loads(utils.trimming(temp_answer))
|
203 |
# contexts, results = response(args, db_instance)
|
|
|
204 |
# if not results:
|
205 |
+
# # print(contexts)
|
206 |
# return "Random question, direct return", contexts
|
207 |
# else:
|
208 |
+
# output_prompt = make_answer_prompt(input_prompt,contexts)
|
209 |
+
# answer = model.generate_content(output_prompt).text
|
|
|
|
|
210 |
# return temp_answer, answer
|
211 |
# except Exception as e:
|
212 |
# # print(e)
|
213 |
# return temp_answer, "Error occured: " + str(e)
|
214 |
+
|
215 |
+
|
216 |
+
# def format_chat_history_from_web(chat_history: list):
|
217 |
+
# temp_chat = []
|
218 |
+
# for message in chat_history:
|
219 |
+
# temp_chat.append(
|
220 |
+
# {
|
221 |
+
# "role": message["role"],
|
222 |
+
# "parts": [message["content"]]
|
223 |
+
# }
|
224 |
+
# )
|
225 |
+
# return temp_chat
|
226 |
+
|
227 |
+
# # def full_chain_history_question(chat_history: list, db_instance):
|
228 |
+
# # try:
|
229 |
+
# # temp_chat = format_chat_history_from_web(chat_history)
|
230 |
+
# # print('Extracted temp chat: ',temp_chat)
|
231 |
+
# # first_prompt = extract_keyword_prompt(temp_chat[-1]["parts"][0])
|
232 |
+
# # temp_answer = model.generate_content(first_prompt).text
|
233 |
|
234 |
+
# # args = json.loads(utils.trimming(temp_answer))
|
235 |
+
# # contexts, results = response(args, db_instance)
|
236 |
+
# # print('Context extracted: ',contexts)
|
237 |
+
# # if not results:
|
238 |
+
# # return "Random question, direct return", contexts
|
239 |
+
# # else:
|
240 |
+
# # QA_Prompt = make_answer_prompt(temp_chat[-1]["parts"][0], contexts)
|
241 |
+
# # temp_chat[-1]["parts"] = QA_Prompt
|
242 |
+
# # print(temp_chat)
|
243 |
+
# # answer = model.generate_content(temp_chat).text
|
244 |
+
# # return temp_answer, answer
|
245 |
+
# # except Exception as e:
|
246 |
+
# # # print(e)
|
247 |
+
# # return temp_answer, "Error occured: " + str(e)
|
248 |
+
|
249 |
+
# def full_chain_history_question(chat_history: list, db_instance):
|
250 |
+
# try:
|
251 |
+
# temp_chat = format_chat_history_from_web(chat_history)
|
252 |
+
# question = temp_chat[-1]['parts'][0]
|
253 |
+
# first_answer = model_retrieval.generate_content(temp_chat).text
|
254 |
|
255 |
+
# print(first_answer)
|
256 |
+
# args = json.loads(utils.trimming(first_answer))
|
257 |
|
258 |
+
# contexts, results = response(args, db_instance)
|
259 |
+
# if not results:
|
260 |
+
# return "Random question, direct return", contexts
|
261 |
+
# else:
|
262 |
+
# print('Context to answers: ',contexts)
|
263 |
+
# answer_chat = answer_chat_template(question, contexts)
|
264 |
+
# temp_chat[-1] = answer_chat
|
265 |
+
# answer = model_answer.generate_content(temp_chat).text
|
266 |
+
# return first_answer, answer
|
267 |
+
# except Exception as e:
|
268 |
+
# if first_answer:
|
269 |
+
# return first_answer, "Error occured: " + str(e)
|
270 |
+
# else:
|
271 |
+
# return "No answer", "Error occured: " + str(e)
|
chat/model_manage2.py
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chat.arxiv_bot.arxiv_bot_utils2 as utils
|
2 |
+
import google.generativeai as genai
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
from google.generativeai.types import content_types
|
6 |
+
from collections.abc import Iterable
|
7 |
+
from IPython import display
|
8 |
+
from IPython.display import Markdown
|
9 |
+
|
10 |
+
# ----------------------- define instructions -----------------------
|
11 |
+
# System prompt handed to the Gemini model (see init_model). The text is part
# of the program's behavior, so it is kept exactly as written; the four
# sentences are joined with single newlines.
system_instruction = "\n".join((
    "You are a library chatbot that help people to find relevant articles about a topic, or find a specific article with given title and authors.",
    "Your job is to analyze the user question, generate enough parameters based on the user question and use the tools that are given to you.",
    "Also, after the function call is done, you must post-process the results in a more conversational form, providing some explanation about the paper based on its summary to avoid recitation.",
    "You must provide the link to its Arxiv pdf page.",
))
|
15 |
+
|
16 |
+
# --------------------------- define tools --------------------------
|
17 |
+
def search_for_relevant_article(keywords: list['str'], topic_description: str) -> str:
    """This tool is used to search for articles from the database which is relevant to a topic, using a list of more than 3 keywords and a long sentence topic description.
    If there is not enough 3 keywords from the question, the model must generate more keywords related to the topic.
    If there is no description about the topic, the model must generate a description for the function call.
    \nThe result is a string describe the records found from the database: 'Record no. - Title: <title>, Author: <authors>, Link: <link to the pdf file>, Summary: <summary of the article>'. There can be many records.
    \nIf the result is 'Information not found' it means some error has occured, or the database has no relevant article"""
    # NOTE(review): the docstring above is intentionally left word-for-word.
    # This function is listed in `tools` and passed to the model, so the SDK
    # presumably forwards the docstring as the tool description -- confirm
    # before rewording it.
    #
    # Flow: query the vector store; if nothing matches, crawl arXiv for the
    # keywords, store the new records in both databases and query again; then
    # resolve the matched paper ids in the SQL store and format one text
    # record per paper, followed by its stored summary documents.

    print('Keywords: {}, description: {}'.format(keywords,topic_description))

    results = utils.ArxivChroma.query_relevant(keywords=keywords, query_texts=topic_description)
    matches = results['metadatas'][0]
    if not matches:
        # Nothing indexed for this topic yet: try to fetch fresh papers.
        new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)
        if isinstance(new_records, str):
            # A string result from the crawler is treated as a failure.
            return "Information not found"

        utils.ArxivChroma.add(new_records)
        utils.ArxivSQL.add(new_records)
        results = utils.ArxivChroma.query_relevant(keywords=keywords, query_texts=topic_description)
        matches = results['metadatas'][0]

    paper_ids = [match['paper_id'] for match in matches]
    paper_info = utils.ArxivSQL.query_id(paper_ids)
    if not paper_info:
        return "Information not found"

    pieces = []
    for number, row in enumerate(paper_info, start=1):
        # Row layout used here: id (0), title (2), author (3), link (6).
        pieces.append(
            "Record no.{} - Title: {}, Author: {}, Link: {}, ".format(number, row[2], row[3], row[6])
        )
        documents = utils.ArxivChroma.query_exact(row[0])["documents"]
        # Each document is followed by a single space, as before.
        pieces.append("Summary:" + "".join(doc + " " for doc in documents))
    return "".join(pieces)
|
60 |
+
|
61 |
+
def search_for_specific_article(title: str, authors: list['str']) -> str:
    """This tool is used to search for a specific article from the database, with its name and authors given.
    \nThe result is a string describe the records found from the database: 'Record no. - Title: <title>, Author: <authors>, Link: <link to the pdf file>, Summary: <summary of the article>'. There can be many records.
    \nIf the result is 'Information not found' it means some error has occured, or the database has no relevant article"""
    # NOTE(review): the docstring above is intentionally left word-for-word.
    # This function is listed in `tools` and passed to the model, so the SDK
    # presumably forwards the docstring as the tool description -- confirm
    # before rewording it.
    #
    # Flow: look the paper up in the SQL store by title/authors; if it is not
    # there, crawl for that exact paper, store the new records in both
    # databases and look it up again; then format one text record per row,
    # followed by its stored summary documents.

    # The debug label now names what is actually printed (it used to say
    # "Keywords/description", copied from the relevant-article search).
    print('Title: {}, authors: {}'.format(title,authors))

    paper_info = utils.ArxivSQL.query(title = title,author = authors)
    if not paper_info:
        new_records = utils.crawl_exact_paper(title=title,author=authors)
        if isinstance(new_records, str):
            # A string result from the crawler is treated as a failure.
            return "Information not found"
        utils.ArxivChroma.add(new_records)
        utils.ArxivSQL.add(new_records)
        paper_info = utils.ArxivSQL.query(title = title,author = authors)

    if not paper_info:
        return "Information not found"

    pieces = []
    for number, row in enumerate(paper_info, start=1):
        # Row layout used here: id (0), title (2), author (3), link (6).
        pieces.append(
            "Record no.{} - Title: {}, Author: {}, Link: {}, ".format(number, row[2], row[3], row[6])
        )
        documents = utils.ArxivChroma.query_exact(row[0])["documents"]
        # Each document is followed by a single space, as before.
        pieces.append("Summary:" + "".join(doc + " " for doc in documents))
    return "".join(pieces)
|
96 |
+
|
97 |
+
def answer_others_questions(question: str) -> str:
    """This tool is the default option for other questions that are not related to article or paper request. The model will response the question with its own answer."""
    # Pass-through tool: the question is handed back unchanged. The docstring
    # is kept word-for-word because this function is registered in `tools`.
    echoed = question
    return echoed
|
100 |
+
|
101 |
+
# Callables offered to the model as tools, and their names in the same order.
tools = [
    search_for_relevant_article,
    search_for_specific_article,
    answer_others_questions,
]
tools_name = [tool.__name__ for tool in tools]
|
103 |
+
|
104 |
+
# load key, prepare config ------------------------
|
105 |
+
# Read the API key from the first line of apikey.txt. readline() keeps the
# line terminator, so strip surrounding whitespace before using the key.
with open("apikey.txt","r") as apikey:
    key = apikey.readline().strip()
genai.configure(api_key=key)

# Sampling / output settings passed to the model in init_model.
generation_config = {
    "temperature": 1,
    "top_p": 1,
    "top_k": 0,
    "max_output_tokens": 2048,
    "response_mime_type": "text/plain",
}

# One entry per harm category, all with the same "BLOCK_NONE" threshold.
# The categories and their order are unchanged.
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_DANGEROUS",
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]
|
137 |
+
# this function returns a tool_config for the given mode: 'none', 'any' or 'auto'
|
138 |
+
def tool_config_from_mode(mode: str, fns: Iterable[str] = ()):
    """Build a tool config for the requested function-calling mode.

    `mode` and `fns` are placed under the "function_calling_config" key
    (as "mode" and "allowed_function_names") and the resulting dict is
    converted with `content_types.to_tool_config`.
    """
    calling_config = {"mode": mode, "allowed_function_names": fns}
    return content_types.to_tool_config({"function_calling_config": calling_config})
|
143 |
+
|
144 |
+
def init_model(mode = "auto"):
    """Create a Gemini model together with a fresh chat session.

    Intended to be called once per socket session, when the socket is
    initialised, so that every session holds its own model and chat history.

    Returns a `(model, chat_session)` tuple; the chat session is started
    with automatic function calling enabled.
    """
    model_options = {
        "model_name": "gemini-1.5-pro-latest",
        "safety_settings": safety_settings,
        "generation_config": generation_config,
        "tools": tools,
        "tool_config": tool_config_from_mode(mode),
        "system_instruction": system_instruction,
    }
    gemini = genai.GenerativeModel(**model_options)
    session = gemini.start_chat(enable_automatic_function_calling=True)
    return gemini, session
|
156 |
+
|
157 |
+
# handle tool call and chatsession
|
158 |
+
def full_chain_history_question(user_input, chat_instance: genai.ChatSession, mode="auto"):
    """Send one user message through the chat session and return the reply.

    Returns a `(text, history)` tuple, where `history` is
    `chat_instance.history`. If anything raises while sending the message
    or reading the reply text, the exception is printed and the first
    element is an error message instead.
    """
    try:
        call_config = tool_config_from_mode(mode)
        reply = chat_instance.send_message(user_input, tool_config=call_config)
        reply_text = reply.text
        return reply_text, chat_instance.history
    except Exception as e:
        print(e)
        error_text = f'Error occured during call: {e}'
        return error_text, chat_instance.history
|
165 |
+
|
166 |
+
# for printing log session
|
167 |
+
def print_history(history):
    """Print a log of a chat history.

    For every entry, prints its role and the dict form of its first part
    (obtained via the part type's `to_dict`), followed by a line of 80 dashes.
    """
    divider = '-' * 80
    for entry in history:
        first_part = entry.parts[0]
        speaker = entry.role
        part_as_dict = type(first_part).to_dict(first_part)
        print(speaker, "->", part_as_dict)
        print(divider)
|
172 |
+
|
173 |
+
# Connect both stores when this module is imported: the Chroma store first,
# then the SQL store.
for store in (utils.ArxivChroma, utils.ArxivSQL):
    store.connect()
|
chatbot_django/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (170 Bytes). View file
|
|
chatbot_django/__pycache__/asgi.cpython-311.pyc
ADDED
Binary file (1.24 kB). View file
|
|
chatbot_django/__pycache__/settings.cpython-311.pyc
ADDED
Binary file (2.7 kB). View file
|
|
chatbot_django/__pycache__/urls.cpython-311.pyc
ADDED
Binary file (1.25 kB). View file
|
|
concat.txt
ADDED
Binary file (32.7 kB). View file
|
|
db.sqlite3
CHANGED
Binary files a/db.sqlite3 and b/db.sqlite3 differ
|
|
models/models--jinaai--jina-bert-implementation/blobs/64b6ce6fe4477c320b0ab303e2f26ae98beae1f7
ADDED
The diff for this file is too large to render.
See raw diff
|
|