lingyit1108 committed on
Commit
8c107a7
‒
1 Parent(s): 22585fc

swap to new embedding model and handle user 'i dont know' scenario

Browse files
config/model_config_advanced.yml CHANGED
@@ -14,4 +14,4 @@ vector_store:
14
  persisted_path: './models/chroma_db_advanced'
15
 
16
  questionaire_data:
17
- db_path: './database/mock_qna_advanced.sqlite'
 
14
  persisted_path: './models/chroma_db_advanced'
15
 
16
  questionaire_data:
17
+ db_path: './database/mock_qna.sqlite'
database/mock_qna.sqlite CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bae92f8657dd47a86cfb157a8b9a829115282746b04049a4d056a38af1b8ab4f
3
  size 40960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d51005d26f568ee304005ab7cf52cdc58a55f528230ae914a11dc9b75219623
3
  size 40960
models/chroma_db_advanced/a88943fe-4428-425d-8b9c-7bb8665a0c79/link_lists.bin DELETED
File without changes
models/chroma_db_advanced/af9795b7-8b5f-4493-adbc-40aedf3c96ed/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:453d35bee81975816ce0a286e796c4884c609c148e52d0605ac221daa46bf3d7
3
+ size 10056000
models/chroma_db_advanced/{a88943fe-4428-425d-8b9c-7bb8665a0c79 → af9795b7-8b5f-4493-adbc-40aedf3c96ed}/header.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
3
  size 100
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89bd0cf182f20a10a0d7faa81bf3304c0565bc9b6f4705056ae63c061b9269ff
3
  size 100
models/chroma_db_advanced/{a88943fe-4428-425d-8b9c-7bb8665a0c79/data_level0.bin → af9795b7-8b5f-4493-adbc-40aedf3c96ed/index_metadata.pickle} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2eec38a208011f4f233e59d2618152fa02e42d91757412778a5db814fe80bf2f
3
- size 1676000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5ecccac152d2deee938b41b1533b454bb8d5778a0befcd855529538a1a17bdf
3
+ size 346049
models/chroma_db_advanced/{a88943fe-4428-425d-8b9c-7bb8665a0c79 → af9795b7-8b5f-4493-adbc-40aedf3c96ed}/length.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
3
- size 4000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae1fb78e4b679db0ad051360ddb549f4584c14a8b45f99d8d052f7d67067acb3
3
+ size 24000
models/chroma_db_advanced/af9795b7-8b5f-4493-adbc-40aedf3c96ed/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:535d672bfbbeec1181b50015d78bc1e776088cbbb0738d04bc725a76249eb744
3
+ size 52152
models/chroma_db_advanced/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51aba6bb0bf5e5851de1e4e6cf53215b874c11b7194b3b765a2edfbc59ce9313
3
- size 15937536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74c0d3543bf7cab83459feda7fad58a984a7c018fc566f79e937038b3756fcca
3
+ size 101720064
notebooks/002_persisted-embedding-model-advanced.ipynb CHANGED
@@ -10,11 +10,16 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": null,
14
  "id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97",
15
  "metadata": {},
16
  "outputs": [],
17
  "source": [
 
 
 
 
 
18
  "import chromadb\n",
19
  "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
20
  "from llama_index.vector_stores.chroma.base import ChromaVectorStore\n",
@@ -31,27 +36,78 @@
31
  "import time"
32
  ]
33
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  {
35
  "cell_type": "code",
36
  "execution_count": null,
 
 
 
 
 
 
 
 
37
  "id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb",
38
  "metadata": {},
39
  "outputs": [],
40
  "source": [
41
  "# load some documents\n",
42
  "documents = SimpleDirectoryReader(input_files=[\n",
43
- " \"../raw_documents/qna.txt\",\n",
44
  " \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
45
  " \"../raw_documents/conversation_examples.txt\",\n",
46
  " \"../raw_documents/HI_Knowledge_Base.pdf\",\n",
47
- " \"../raw_documents/answers.txt\",\n",
48
- " ]).load_data()\n",
49
  "document = Document(text=\"\\n\\n\".join([doc.text for doc in documents]))"
50
  ]
51
  },
52
  {
53
  "cell_type": "code",
54
- "execution_count": null,
55
  "id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d",
56
  "metadata": {},
57
  "outputs": [],
@@ -62,7 +118,7 @@
62
  },
63
  {
64
  "cell_type": "code",
65
- "execution_count": null,
66
  "id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed",
67
  "metadata": {},
68
  "outputs": [],
@@ -73,7 +129,7 @@
73
  },
74
  {
75
  "cell_type": "code",
76
- "execution_count": null,
77
  "id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672",
78
  "metadata": {},
79
  "outputs": [],
@@ -92,19 +148,28 @@
92
  },
93
  {
94
  "cell_type": "code",
95
- "execution_count": null,
96
  "id": "0946b6ce-96ab-44de-ad75-e424a8429f67",
97
  "metadata": {},
98
- "outputs": [],
 
 
 
 
 
 
 
 
99
  "source": [
100
  "Settings.llm = None\n",
101
  "Settings.chunk_size = 1024\n",
 
102
  "Settings.embed_model = \"local:../models/fine-tuned-embeddings-advanced\""
103
  ]
104
  },
105
  {
106
  "cell_type": "code",
107
- "execution_count": null,
108
  "id": "b8c73a2c-1129-406a-8046-085afcaf9cbb",
109
  "metadata": {},
110
  "outputs": [],
@@ -114,10 +179,21 @@
114
  },
115
  {
116
  "cell_type": "code",
117
- "execution_count": null,
118
  "id": "75f1c76f-d3e5-4b69-818c-98865adb1457",
119
  "metadata": {},
120
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
121
  "source": [
122
  "len(nodes)"
123
  ]
@@ -132,7 +208,7 @@
132
  },
133
  {
134
  "cell_type": "code",
135
- "execution_count": null,
136
  "id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4",
137
  "metadata": {},
138
  "outputs": [],
@@ -142,7 +218,7 @@
142
  },
143
  {
144
  "cell_type": "code",
145
- "execution_count": null,
146
  "id": "6a764113-ad7e-4674-aa57-ebbf405902a8",
147
  "metadata": {},
148
  "outputs": [],
@@ -160,7 +236,7 @@
160
  },
161
  {
162
  "cell_type": "code",
163
- "execution_count": null,
164
  "id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05",
165
  "metadata": {},
166
  "outputs": [],
@@ -170,7 +246,7 @@
170
  },
171
  {
172
  "cell_type": "code",
173
- "execution_count": null,
174
  "id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb",
175
  "metadata": {},
176
  "outputs": [],
@@ -180,39 +256,88 @@
180
  },
181
  {
182
  "cell_type": "code",
183
- "execution_count": null,
184
- "id": "082a0d7e-b025-4db1-be2a-7a0b7bc453b9",
185
  "metadata": {},
186
- "outputs": [],
 
 
 
 
 
 
 
 
187
  "source": [
188
- "vector_query_engine = vector_index.as_query_engine()"
 
 
189
  ]
190
  },
191
  {
192
  "cell_type": "code",
193
- "execution_count": null,
194
- "id": "d3bd848d-9985-4a3d-bdc4-ec340cc69ef3",
195
  "metadata": {},
196
  "outputs": [],
197
  "source": [
198
- "indexing_cost = time.time() - start_time\n",
199
- "indexing_cost = indexing_cost / 60\n",
200
- "print(f\"Indexing time: {indexing_cost:.1f} mins\")"
201
  ]
202
  },
203
  {
204
  "cell_type": "code",
205
- "execution_count": null,
206
  "id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e",
207
  "metadata": {
208
  "scrolled": true
209
  },
210
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
211
  "source": [
212
  "response = vector_query_engine.query(\"Healthcare System in Singapore consists of?\")\n",
213
  "response"
214
  ]
215
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  {
217
  "cell_type": "code",
218
  "execution_count": null,
@@ -239,7 +364,7 @@
239
  },
240
  {
241
  "cell_type": "code",
242
- "execution_count": null,
243
  "id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5",
244
  "metadata": {},
245
  "outputs": [],
@@ -269,7 +394,7 @@
269
  },
270
  {
271
  "cell_type": "code",
272
- "execution_count": null,
273
  "id": "d38dc953-b923-4128-86a1-c8c6f69af0ed",
274
  "metadata": {},
275
  "outputs": [],
@@ -279,7 +404,7 @@
279
  },
280
  {
281
  "cell_type": "code",
282
- "execution_count": null,
283
  "id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e",
284
  "metadata": {},
285
  "outputs": [],
@@ -289,7 +414,7 @@
289
  },
290
  {
291
  "cell_type": "code",
292
- "execution_count": null,
293
  "id": "0583e9b0-d977-488c-8331-46dfa749924c",
294
  "metadata": {},
295
  "outputs": [],
@@ -308,7 +433,7 @@
308
  },
309
  {
310
  "cell_type": "code",
311
- "execution_count": null,
312
  "id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba",
313
  "metadata": {},
314
  "outputs": [],
@@ -318,7 +443,7 @@
318
  },
319
  {
320
  "cell_type": "code",
321
- "execution_count": null,
322
  "id": "1b385644-b46e-4d13-88fa-9f4af39db405",
323
  "metadata": {},
324
  "outputs": [],
@@ -328,7 +453,7 @@
328
  },
329
  {
330
  "cell_type": "code",
331
- "execution_count": null,
332
  "id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2",
333
  "metadata": {},
334
  "outputs": [],
@@ -340,7 +465,7 @@
340
  },
341
  {
342
  "cell_type": "code",
343
- "execution_count": null,
344
  "id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae",
345
  "metadata": {},
346
  "outputs": [],
@@ -362,7 +487,7 @@
362
  },
363
  {
364
  "cell_type": "code",
365
- "execution_count": null,
366
  "id": "1a506940-c2b4-4d14-ad93-fd451331c582",
367
  "metadata": {},
368
  "outputs": [],
@@ -375,7 +500,7 @@
375
  },
376
  {
377
  "cell_type": "code",
378
- "execution_count": null,
379
  "id": "3f592848-8536-4b4d-b34a-adc32d043432",
380
  "metadata": {},
381
  "outputs": [],
@@ -385,7 +510,7 @@
385
  },
386
  {
387
  "cell_type": "code",
388
- "execution_count": null,
389
  "id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252",
390
  "metadata": {},
391
  "outputs": [],
@@ -399,58 +524,66 @@
399
  },
400
  {
401
  "cell_type": "code",
402
- "execution_count": null,
403
- "id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612",
404
  "metadata": {},
405
  "outputs": [],
406
  "source": [
407
- "prompt = \"\"\"\n",
408
- "Question: Which of the following is NOT a characteristic of medical expense insurance?\n",
409
- "A. Pro ration factor and co-insurance.\n",
410
- "B. Deductibles apply for all treatments.\n",
411
- "C. Impose Sub- Limits.\n",
412
- "D. Can be issued as a rider or stand-alone.\n",
413
- "\"\"\""
414
  ]
415
  },
416
  {
417
  "cell_type": "code",
418
  "execution_count": null,
419
- "id": "78abaf95-e52d-445c-9d8e-bc51efb20f06",
420
  "metadata": {},
421
  "outputs": [],
422
- "source": [
423
- "res = chat_engine.chat(prompt)\n",
424
- "print(res.response)"
425
- ]
426
  },
427
  {
428
  "cell_type": "code",
429
- "execution_count": null,
430
- "id": "1e62303c-3a00-448f-ad93-15cb6cee1f24",
431
  "metadata": {},
432
  "outputs": [],
433
- "source": []
 
 
 
 
 
 
 
 
434
  },
435
  {
436
  "cell_type": "code",
437
- "execution_count": null,
438
- "id": "dad72f9f-7f86-407d-93be-f5724cb30d5c",
439
  "metadata": {},
440
- "outputs": [],
 
 
 
 
 
 
 
 
441
  "source": [
442
- "hi_engine = index.as_query_engine(\n",
443
- " memory=memory,\n",
444
- " system_prompt=system_content,\n",
445
- " similarity_top_k=3,\n",
446
- " streaming=True\n",
447
- ")"
448
  ]
449
  },
450
  {
451
  "cell_type": "code",
452
  "execution_count": null,
453
- "id": "ab778a5d-d438-4f39-88f5-c67a1f1d575e",
454
  "metadata": {},
455
  "outputs": [],
456
  "source": []
@@ -458,18 +591,44 @@
458
  {
459
  "cell_type": "code",
460
  "execution_count": null,
461
- "id": "7bb7c21a-7461-40c1-87a7-4a1f92f70153",
462
  "metadata": {},
463
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  "source": [
465
- "res = hi_engine.query(\"may I know what is the rationale?\")\n",
466
- "print(res)"
467
  ]
468
  },
469
  {
470
  "cell_type": "code",
471
  "execution_count": null,
472
- "id": "874a39ce-e682-42fa-8085-646bacea6cdb",
473
  "metadata": {},
474
  "outputs": [],
475
  "source": []
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": 1,
14
  "id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97",
15
  "metadata": {},
16
  "outputs": [],
17
  "source": [
18
+ "import sys, os, shutil\n",
19
+ "sys.path.insert(0, \"../\")\n",
20
+ "\n",
21
+ "from preprocess_raw_documents import split_content\n",
22
+ "\n",
23
  "import chromadb\n",
24
  "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
25
  "from llama_index.vector_stores.chroma.base import ChromaVectorStore\n",
 
36
  "import time"
37
  ]
38
  },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 2,
42
+ "id": "978152ce-4d87-44b5-b521-dbaff60b32b0",
43
+ "metadata": {},
44
+ "outputs": [
45
+ {
46
+ "name": "stderr",
47
+ "output_type": "stream",
48
+ "text": [
49
+ "199it [00:00, 8821.71it/s]\n",
50
+ "200it [00:00, 12584.17it/s]\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "split_content(filepath=\"../raw_documents/answers.txt\", \n",
56
+ " separator=\"\\n\\n\", \n",
57
+ " tmp_folder=\"../raw_documents/answers_temp\")\n",
58
+ "\n",
59
+ "split_content(filepath=\"../raw_documents/qna.txt\", \n",
60
+ " separator=\"\\n\\n\\n\", \n",
61
+ " tmp_folder=\"../raw_documents/qna_temp\")"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 5,
67
+ "id": "d925371b-8777-4f5b-a7f2-ec3f228ef266",
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
71
+ "answers_temp_files = []\n",
72
+ "folder_path = \"../raw_documents/answers_temp\"\n",
73
+ "for f in os.listdir(folder_path):\n",
74
+ " fpath = os.path.join(folder_path, f)\n",
75
+ " answers_temp_files.append(fpath)\n",
76
+ " \n",
77
+ "qna_temp_files = []\n",
78
+ "folder_path = \"../raw_documents/qna_temp\"\n",
79
+ "for f in os.listdir(folder_path):\n",
80
+ " fpath = os.path.join(folder_path, f)\n",
81
+ " qna_temp_files.append(fpath)"
82
+ ]
83
+ },
84
  {
85
  "cell_type": "code",
86
  "execution_count": null,
87
+ "id": "e876a26b-822d-44d6-a3dd-ccdcc04933cf",
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": []
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 7,
95
  "id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb",
96
  "metadata": {},
97
  "outputs": [],
98
  "source": [
99
  "# load some documents\n",
100
  "documents = SimpleDirectoryReader(input_files=[\n",
 
101
  " \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
102
  " \"../raw_documents/conversation_examples.txt\",\n",
103
  " \"../raw_documents/HI_Knowledge_Base.pdf\",\n",
104
+ " ] + answers_temp_files + qna_temp_files ).load_data()\n",
 
105
  "document = Document(text=\"\\n\\n\".join([doc.text for doc in documents]))"
106
  ]
107
  },
108
  {
109
  "cell_type": "code",
110
+ "execution_count": 8,
111
  "id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d",
112
  "metadata": {},
113
  "outputs": [],
 
118
  },
119
  {
120
  "cell_type": "code",
121
+ "execution_count": 9,
122
  "id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed",
123
  "metadata": {},
124
  "outputs": [],
 
129
  },
130
  {
131
  "cell_type": "code",
132
+ "execution_count": 10,
133
  "id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672",
134
  "metadata": {},
135
  "outputs": [],
 
148
  },
149
  {
150
  "cell_type": "code",
151
+ "execution_count": 11,
152
  "id": "0946b6ce-96ab-44de-ad75-e424a8429f67",
153
  "metadata": {},
154
+ "outputs": [
155
+ {
156
+ "name": "stdout",
157
+ "output_type": "stream",
158
+ "text": [
159
+ "LLM is explicitly disabled. Using MockLLM.\n"
160
+ ]
161
+ }
162
+ ],
163
  "source": [
164
  "Settings.llm = None\n",
165
  "Settings.chunk_size = 1024\n",
166
+ "Settings.chunk_overlap = 50\n",
167
  "Settings.embed_model = \"local:../models/fine-tuned-embeddings-advanced\""
168
  ]
169
  },
170
  {
171
  "cell_type": "code",
172
+ "execution_count": 12,
173
  "id": "b8c73a2c-1129-406a-8046-085afcaf9cbb",
174
  "metadata": {},
175
  "outputs": [],
 
179
  },
180
  {
181
  "cell_type": "code",
182
+ "execution_count": 13,
183
  "id": "75f1c76f-d3e5-4b69-818c-98865adb1457",
184
  "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "data": {
188
+ "text/plain": [
189
+ "6814"
190
+ ]
191
+ },
192
+ "execution_count": 13,
193
+ "metadata": {},
194
+ "output_type": "execute_result"
195
+ }
196
+ ],
197
  "source": [
198
  "len(nodes)"
199
  ]
 
208
  },
209
  {
210
  "cell_type": "code",
211
+ "execution_count": 14,
212
  "id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4",
213
  "metadata": {},
214
  "outputs": [],
 
218
  },
219
  {
220
  "cell_type": "code",
221
+ "execution_count": 15,
222
  "id": "6a764113-ad7e-4674-aa57-ebbf405902a8",
223
  "metadata": {},
224
  "outputs": [],
 
236
  },
237
  {
238
  "cell_type": "code",
239
+ "execution_count": 16,
240
  "id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05",
241
  "metadata": {},
242
  "outputs": [],
 
246
  },
247
  {
248
  "cell_type": "code",
249
+ "execution_count": 17,
250
  "id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb",
251
  "metadata": {},
252
  "outputs": [],
 
256
  },
257
  {
258
  "cell_type": "code",
259
+ "execution_count": 18,
260
+ "id": "d3bd848d-9985-4a3d-bdc4-ec340cc69ef3",
261
  "metadata": {},
262
+ "outputs": [
263
+ {
264
+ "name": "stdout",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "Indexing time: 2.3 mins\n"
268
+ ]
269
+ }
270
+ ],
271
  "source": [
272
+ "indexing_cost = time.time() - start_time\n",
273
+ "indexing_cost = indexing_cost / 60\n",
274
+ "print(f\"Indexing time: {indexing_cost:.1f} mins\")"
275
  ]
276
  },
277
  {
278
  "cell_type": "code",
279
+ "execution_count": 19,
280
+ "id": "f16cca33-71fb-437d-a033-671b9fd44054",
281
  "metadata": {},
282
  "outputs": [],
283
  "source": [
284
+ "vector_query_engine = vector_index.as_query_engine()"
 
 
285
  ]
286
  },
287
  {
288
  "cell_type": "code",
289
+ "execution_count": 20,
290
  "id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e",
291
  "metadata": {
292
  "scrolled": true
293
  },
294
+ "outputs": [
295
+ {
296
+ "data": {
297
+ "text/plain": [
298
+ "Response(response='Context information is below.\\n---------------------\\nfile_path: ../raw_documents/answers_temp/answers_050.txt\\n\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".\\n\\nfile_path: ../raw_documents/qna_temp/qna_050.txt\\n\\nC1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3’s M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. The answer is \"Individual Savings\".\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: Healthcare System in Singapore consists of?\\nAnswer: ', source_nodes=[NodeWithScore(node=TextNode(id_='536fef67-6a3f-4054-a94a-cc9143599510', embedding=None, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='2b0f7dad-c532-4abd-8c42-f53383a4fc76', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='5b1d1dc729a663e4ccfacc0f18adf0f6644a2a7d2991490fd962d1550c83f2ff'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='6d93c092-b4cc-4b5b-b379-080d777d3908', 
node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '../raw_documents/answers_temp/answers_044.txt', 'file_name': 'answers_044.txt', 'file_type': 'text/plain', 'file_size': 164, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='caeb59043b8daa56ed472941882947570abff951f64aa0498672aba5921fac1d'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='859a9958-6f5d-4581-95d0-39edfc950ef5', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='8416454b2fbad3e6122c5151d2b3d1eadf0afde3514ba09374c71e96baf712bc')}, text='Question: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=130, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4159636550867191), NodeWithScore(node=TextNode(id_='472000ae-a0aa-4464-a200-72fe67a3fbde', embedding=None, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='506fb715-d3b0-4ca7-b7ca-011a1e1a1f0d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='7461ffa12ff6729003131976b82995b7254ab10f8dc7d79c65988ec9e3b7b631'), <NodeRelationship.PREVIOUS: '2'>: 
RelatedNodeInfo(node_id='d8232b90-d641-4966-b98f-4ca0821db773', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '../raw_documents/qna_temp/qna_044.txt', 'file_name': 'qna_044.txt', 'file_type': 'text/plain', 'file_size': 383, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='cbeb00c29c6130548466697a862fee43ab2be92d84158cc0b69c2f5c7bbe68b1'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='e772e623-cf91-41cd-a516-50acb894eb54', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='a7583b0fd46f98d0118c712632277d81f417b779f8bcc100ab2558dae6317cde')}, text='C1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3’s M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=295, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4126648577998099)], metadata={'536fef67-6a3f-4054-a94a-cc9143599510': {'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, '472000ae-a0aa-4464-a200-72fe67a3fbde': {'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}})"
299
+ ]
300
+ },
301
+ "execution_count": 20,
302
+ "metadata": {},
303
+ "output_type": "execute_result"
304
+ }
305
+ ],
306
  "source": [
307
  "response = vector_query_engine.query(\"Healthcare System in Singapore consists of?\")\n",
308
  "response"
309
  ]
310
  },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": null,
314
+ "id": "aa4b9906-5f75-4003-9f4c-5cfcc7ab1eaf",
315
+ "metadata": {},
316
+ "outputs": [],
317
+ "source": []
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 21,
322
+ "id": "1bb75b04-6a62-43a4-8728-d2e52e49f1c0",
323
+ "metadata": {},
324
+ "outputs": [],
325
+ "source": [
326
+ "if os.path.exists(\"../raw_documents/answers_temp\"):\n",
327
+ " shutil.rmtree(\"../raw_documents/answers_temp\")"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": 22,
333
+ "id": "0ed920fb-6456-49ac-8b63-08bd86b5b39c",
334
+ "metadata": {},
335
+ "outputs": [],
336
+ "source": [
337
+ "if os.path.exists(\"../raw_documents/qna_temp\"):\n",
338
+ " shutil.rmtree(\"../raw_documents/qna_temp\")"
339
+ ]
340
+ },
341
  {
342
  "cell_type": "code",
343
  "execution_count": null,
 
364
  },
365
  {
366
  "cell_type": "code",
367
+ "execution_count": 1,
368
  "id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5",
369
  "metadata": {},
370
  "outputs": [],
 
394
  },
395
  {
396
  "cell_type": "code",
397
+ "execution_count": 2,
398
  "id": "d38dc953-b923-4128-86a1-c8c6f69af0ed",
399
  "metadata": {},
400
  "outputs": [],
 
404
  },
405
  {
406
  "cell_type": "code",
407
+ "execution_count": 3,
408
  "id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e",
409
  "metadata": {},
410
  "outputs": [],
 
414
  },
415
  {
416
  "cell_type": "code",
417
+ "execution_count": 4,
418
  "id": "0583e9b0-d977-488c-8331-46dfa749924c",
419
  "metadata": {},
420
  "outputs": [],
 
433
  },
434
  {
435
  "cell_type": "code",
436
+ "execution_count": 5,
437
  "id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba",
438
  "metadata": {},
439
  "outputs": [],
 
443
  },
444
  {
445
  "cell_type": "code",
446
+ "execution_count": 6,
447
  "id": "1b385644-b46e-4d13-88fa-9f4af39db405",
448
  "metadata": {},
449
  "outputs": [],
 
453
  },
454
  {
455
  "cell_type": "code",
456
+ "execution_count": 7,
457
  "id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2",
458
  "metadata": {},
459
  "outputs": [],
 
465
  },
466
  {
467
  "cell_type": "code",
468
+ "execution_count": 8,
469
  "id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae",
470
  "metadata": {},
471
  "outputs": [],
 
487
  },
488
  {
489
  "cell_type": "code",
490
+ "execution_count": 9,
491
  "id": "1a506940-c2b4-4d14-ad93-fd451331c582",
492
  "metadata": {},
493
  "outputs": [],
 
500
  },
501
  {
502
  "cell_type": "code",
503
+ "execution_count": 10,
504
  "id": "3f592848-8536-4b4d-b34a-adc32d043432",
505
  "metadata": {},
506
  "outputs": [],
 
510
  },
511
  {
512
  "cell_type": "code",
513
+ "execution_count": 11,
514
  "id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252",
515
  "metadata": {},
516
  "outputs": [],
 
524
  },
525
  {
526
  "cell_type": "code",
527
+ "execution_count": 12,
528
+ "id": "c3106dff-dd6f-47a9-9454-1e61775e7539",
529
  "metadata": {},
530
  "outputs": [],
531
  "source": [
532
+ "hi_engine = index.as_query_engine(\n",
533
+ " memory=memory,\n",
534
+ " system_prompt=system_content,\n",
535
+ " similarity_top_k=10,\n",
536
+ " streaming=True\n",
537
+ ")"
 
538
  ]
539
  },
540
  {
541
  "cell_type": "code",
542
  "execution_count": null,
543
+ "id": "53a38081-4a79-44bc-bfa3-5d8653804328",
544
  "metadata": {},
545
  "outputs": [],
546
+ "source": []
 
 
 
547
  },
548
  {
549
  "cell_type": "code",
550
+ "execution_count": 24,
551
+ "id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612",
552
  "metadata": {},
553
  "outputs": [],
554
+ "source": [
555
+ "prompt = \"\"\"\n",
556
+ "Question: Which is not a government healthcare philosophy? \n",
557
+ "A. To nurture a healthy nation by promoting good health.\n",
558
+ "B. To rely on competition to improve service and raise efficiency\n",
559
+ "C. To intervene directly whenever necessary\n",
560
+ "D. To provide for the care of employees\n",
561
+ "\"\"\""
562
+ ]
563
  },
564
  {
565
  "cell_type": "code",
566
+ "execution_count": 26,
567
+ "id": "a1c83dff-50d1-47b1-b7e9-4fc5cd08e1e8",
568
  "metadata": {},
569
+ "outputs": [
570
+ {
571
+ "name": "stdout",
572
+ "output_type": "stream",
573
+ "text": [
574
+ "D. To provide for the care of employees\n"
575
+ ]
576
+ }
577
+ ],
578
  "source": [
579
+ "res = hi_engine.query(prompt)\n",
580
+ "print(res)"
 
 
 
 
581
  ]
582
  },
583
  {
584
  "cell_type": "code",
585
  "execution_count": null,
586
+ "id": "cedd3512-548d-4455-80fd-c6a8b2c0cd00",
587
  "metadata": {},
588
  "outputs": [],
589
  "source": []
 
591
  {
592
  "cell_type": "code",
593
  "execution_count": null,
594
+ "id": "ec53dfcf-d4c0-4d10-a24e-be2004a83656",
595
  "metadata": {},
596
  "outputs": [],
597
+ "source": []
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "execution_count": 14,
602
+ "id": "78abaf95-e52d-445c-9d8e-bc51efb20f06",
603
+ "metadata": {},
604
+ "outputs": [
605
+ {
606
+ "name": "stderr",
607
+ "output_type": "stream",
608
+ "text": [
609
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
610
+ "To disable this warning, you can either:\n",
611
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
612
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
613
+ ]
614
+ },
615
+ {
616
+ "name": "stdout",
617
+ "output_type": "stream",
618
+ "text": [
619
+ "The correct answer is \"Deductibles apply for all treatments\".\n"
620
+ ]
621
+ }
622
+ ],
623
  "source": [
624
+ "res = chat_engine.chat(prompt)\n",
625
+ "print(res.response)"
626
  ]
627
  },
628
  {
629
  "cell_type": "code",
630
  "execution_count": null,
631
+ "id": "1e62303c-3a00-448f-ad93-15cb6cee1f24",
632
  "metadata": {},
633
  "outputs": [],
634
  "source": []
preprocess_raw_documents.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from tqdm import tqdm
4
+
5
+
6
def split_content(filepath, separator, tmp_folder):
    """Split a text file into one file per chunk inside ``tmp_folder``.

    The file at ``filepath`` is read in full, cut on every occurrence of
    ``separator``, and each piece is written to
    ``<tmp_folder>/<stem>_<NNN><ext>``, where ``<stem>`` and ``<ext>`` come
    from the source file name and ``NNN`` is the zero-padded chunk index.

    Args:
        filepath: Path of the text file to split.
        separator: Delimiter string handed to ``str.split``.
        tmp_folder: Output directory; created if it does not exist. Files
            already present in it are left untouched.
    """
    os.makedirs(tmp_folder, exist_ok=True)

    # splitext() cuts on the last dot only, so names with several dots
    # ("summary.v1.3.txt") or with no extension at all no longer break the
    # two-value unpacking that a plain ``split(".")`` required.
    fname, fextn = os.path.splitext(os.path.basename(filepath))

    # Explicit encoding so the result does not depend on the platform locale.
    with open(filepath, "r", encoding="utf-8") as fp:
        content_chunk = fp.read().split(separator)

    # Wrap the list (not the enumerate iterator) so tqdm can see len() and
    # render real progress instead of a bare iteration counter.
    for index, chunk in enumerate(tqdm(content_chunk)):
        new_fpath = os.path.join(tmp_folder, f"{fname}_{index:03d}{fextn}")
        with open(new_fpath, "w", encoding="utf-8") as fp:
            fp.write(chunk)
qna_prompting.py CHANGED
@@ -25,10 +25,11 @@ qna_question_data_format = """
25
  Example 3: `Chapter_5` for fifth chapter
26
  """
27
  qna_answer_description = """
28
- Use this tool to trigger the evaluation of user's provided input with the
29
- correct answer of the Q&A question asked. When user provides answer to the
30
- question asked, they can reply in natural language or giving the alphabet
31
- letter of which selected choice they think it's the right answer.
 
32
 
33
  If user's answer is not a single alphabet letter, but is contextually
34
  closer to a particular answer choice, return the corresponding
@@ -122,7 +123,6 @@ def evaluate_qna_answer(user_selected_answer: str) -> str:
122
 
123
  ### convert to numeric type
124
  qna_answer = int(qna_answer)
125
-
126
  qna_answer_alphabet = num_mapping.get(qna_answer, "ERROR")
127
 
128
  con = sqlite3.connect(db_path)
@@ -138,13 +138,34 @@ def evaluate_qna_answer(user_selected_answer: str) -> str:
138
  con.commit()
139
  con.close()
140
 
141
- if qna_answer == user_answer_numeric:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  st.toast("🍯 yummy yummy, hooray!", icon="πŸŽ‰")
143
  time.sleep(2)
144
  st.toast("πŸ»πŸ’•πŸ― You got it right!", icon="🎊")
145
  time.sleep(2)
146
  st.toast("πŸ₯‡ You are amazing! πŸ’―πŸ’―", icon="πŸ’ͺ")
147
  st.balloons()
 
148
  else:
149
  st.toast("🐼 Something doesn't seem right.. πŸ”₯🏠πŸ”₯", icon="πŸ˜‚")
150
  time.sleep(2)
@@ -152,17 +173,16 @@ def evaluate_qna_answer(user_selected_answer: str) -> str:
152
  time.sleep(2)
153
  st.toast("πŸ€œπŸ€› Nevertheless, it was a good try!! πŸ‹οΈβ€β™‚οΈπŸ‹οΈβ€β™‚οΈ", icon="πŸ‘")
154
  st.snow()
 
 
 
 
 
155
 
156
- reasoning = "" if "textbook" in reasons else "Rationale is that: " + reasons
157
- qna_answer_response = (
158
- f"Your selected answer is `{user_selected_answer}`, "
159
- f"but the actual answer is `{qna_answer_alphabet}`. " + reasoning
160
- )
161
-
162
  except Exception as e:
163
  print(e)
164
 
165
- return qna_answer_response
166
 
167
  get_qna_question_tool = FunctionTool.from_defaults(
168
  fn=get_qna_question,
 
25
  Example 3: `Chapter_5` for fifth chapter
26
  """
27
  qna_answer_description = """
28
+ Not to trigger this when questions being asked, come directly from user.
29
+ Only use this tool to trigger the evaluation of user's provided input with the
30
+ correct answer of the Q&A question asked by Assistant. When user provides
31
+ answer to the question asked, they can reply in natural language or giving
32
+ the alphabet letter of which selected choice they think it's the right answer.
33
 
34
  If user's answer is not a single alphabet letter, but is contextually
35
  closer to a particular answer choice, return the corresponding
 
123
 
124
  ### convert to numeric type
125
  qna_answer = int(qna_answer)
 
126
  qna_answer_alphabet = num_mapping.get(qna_answer, "ERROR")
127
 
128
  con = sqlite3.connect(db_path)
 
138
  con.commit()
139
  con.close()
140
 
141
+ reasoning = "" if "textbook" in reasons else f"Rationale is that: {reasons}. "
142
+ qna_answer_response = (
143
+ f"Your selected answer is `{user_selected_answer}`, "
144
+ f"but the actual answer is `{qna_answer_alphabet}`. "
145
+ )
146
+ qna_not_knowing_response = (
147
+ f"No problem! The answer is `{qna_answer_alphabet}`. "
148
+ )
149
+ to_know_more = (
150
+ "Let me know if you want to know more, "
151
+ "I can give you an explanation πŸ»πŸ’•"
152
+ )
153
+
154
+ if user_answer_numeric == 0:
155
+ st.toast("πŸ―β“ couldn't find the honey? πŸ‘Œ no worries!", icon="🫠")
156
+ time.sleep(2)
157
+ st.toast("🐻 Let me bring it to you! πŸ―πŸ’•", icon="πŸ’Œ")
158
+ time.sleep(2)
159
+ st.toast("✨ You will do great next time! πŸ’†", icon="🎁")
160
+ final_response = qna_not_knowing_response + reasoning + to_know_more
161
+ elif qna_answer == user_answer_numeric:
162
  st.toast("🍯 yummy yummy, hooray!", icon="πŸŽ‰")
163
  time.sleep(2)
164
  st.toast("πŸ»πŸ’•πŸ― You got it right!", icon="🎊")
165
  time.sleep(2)
166
  st.toast("πŸ₯‡ You are amazing! πŸ’―πŸ’―", icon="πŸ’ͺ")
167
  st.balloons()
168
+ final_response = qna_answer_response + reasoning + to_know_more
169
  else:
170
  st.toast("🐼 Something doesn't seem right.. πŸ”₯🏠πŸ”₯", icon="πŸ˜‚")
171
  time.sleep(2)
 
173
  time.sleep(2)
174
  st.toast("πŸ€œπŸ€› Nevertheless, it was a good try!! πŸ‹οΈβ€β™‚οΈπŸ‹οΈβ€β™‚οΈ", icon="πŸ‘")
175
  st.snow()
176
+ final_response = qna_answer_response + reasoning + to_know_more
177
+
178
+ st.session_state.question_id = None
179
+ st.session_state.qna_answer = None
180
+ st.session_state.reasons = None
181
 
 
 
 
 
 
 
182
  except Exception as e:
183
  print(e)
184
 
185
+ return final_response
186
 
187
  get_qna_question_tool = FunctionTool.from_defaults(
188
  fn=get_qna_question,
streamlit_app.py CHANGED
@@ -40,7 +40,7 @@ nest_asyncio.apply()
40
  st.set_page_config(page_title="πŸ»πŸ“š Study Bear 🍯")
41
  openai_api = os.getenv("OPENAI_API_KEY")
42
 
43
- with open("./config/model_config.yml", "r") as file_reader:
44
  model_config = yaml.safe_load(file_reader)
45
 
46
  input_files = model_config["input_data"]["source"]
 
40
  st.set_page_config(page_title="πŸ»πŸ“š Study Bear 🍯")
41
  openai_api = os.getenv("OPENAI_API_KEY")
42
 
43
+ with open("./config/model_config_advanced.yml", "r") as file_reader:
44
  model_config = yaml.safe_load(file_reader)
45
 
46
  input_files = model_config["input_data"]["source"]