Spaces:
Sleeping
Sleeping
jeevan
commited on
Commit
•
416fc9c
1
Parent(s):
77e353e
using ft model
Browse files- .gitattributes +1 -1
- Tasks/Task 1/Task1.md +0 -2
- Tasks/Task 1/pre-processing.ipynb +550 -458
- app.py +2 -3
.gitattributes
CHANGED
@@ -32,4 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Tasks/Task 1/Task1.md
CHANGED
@@ -37,8 +37,6 @@ In addition to the default strategy, I would like to test out a **Section- and T
|
|
37 |
|
38 |
|
39 |
|
40 |
-
|
41 |
-
|
42 |
# Problem Statement
|
43 |
|
44 |
People are concerned about the implications of AI, and no one seems to understand the right way to think about building ethical and useful AI applications for enterprises.
|
|
|
37 |
|
38 |
|
39 |
|
|
|
|
|
40 |
# Problem Statement
|
41 |
|
42 |
People are concerned about the implications of AI, and no one seems to understand the right way to think about building ethical and useful AI applications for enterprises.
|
Tasks/Task 1/pre-processing.ipynb
CHANGED
@@ -24,7 +24,7 @@
|
|
24 |
},
|
25 |
{
|
26 |
"cell_type": "code",
|
27 |
-
"execution_count":
|
28 |
"metadata": {},
|
29 |
"outputs": [],
|
30 |
"source": [
|
@@ -40,14 +40,14 @@
|
|
40 |
},
|
41 |
{
|
42 |
"cell_type": "code",
|
43 |
-
"execution_count":
|
44 |
"metadata": {},
|
45 |
"outputs": [
|
46 |
{
|
47 |
"name": "stderr",
|
48 |
"output_type": "stream",
|
49 |
"text": [
|
50 |
-
"/Users/jeevan/Documents/Learnings/ai-engineering-bootcamp/AIE4/mid-term/
|
51 |
" from tqdm.autonotebook import tqdm, trange\n"
|
52 |
]
|
53 |
}
|
@@ -56,24 +56,66 @@
|
|
56 |
"# Embedding model - snowflake-arctic-embed-l\n",
|
57 |
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
58 |
"\n",
|
59 |
-
"model_name = \"
|
60 |
"embedding_model = HuggingFaceEmbeddings(model_name=model_name)"
|
61 |
]
|
62 |
},
|
63 |
{
|
64 |
"cell_type": "code",
|
65 |
-
"execution_count":
|
66 |
"metadata": {},
|
67 |
"outputs": [],
|
68 |
"source": [
|
69 |
-
"from
|
70 |
-
"from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
"\n",
|
|
|
|
|
|
|
72 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
"pdf_loader = PDFLoaderWrapper(\n",
|
74 |
" documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF\n",
|
75 |
")\n",
|
76 |
-
"documents = await pdf_loader.aload()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
"\n",
|
78 |
"text_splitter = SemanticChunker(embedding_model, buffer_size=5, breakpoint_threshold_type=\"percentile\",breakpoint_threshold_amount=90)\n",
|
79 |
"\n",
|
@@ -82,7 +124,7 @@
|
|
82 |
},
|
83 |
{
|
84 |
"cell_type": "code",
|
85 |
-
"execution_count":
|
86 |
"metadata": {},
|
87 |
"outputs": [],
|
88 |
"source": [
|
@@ -98,7 +140,7 @@
|
|
98 |
},
|
99 |
{
|
100 |
"cell_type": "code",
|
101 |
-
"execution_count":
|
102 |
"metadata": {},
|
103 |
"outputs": [],
|
104 |
"source": [
|
@@ -112,437 +154,481 @@
|
|
112 |
},
|
113 |
{
|
114 |
"cell_type": "code",
|
115 |
-
"execution_count":
|
116 |
"metadata": {},
|
117 |
"outputs": [
|
118 |
{
|
119 |
"data": {
|
120 |
"text/plain": [
|
121 |
-
"['
|
122 |
-
" '
|
123 |
-
" '
|
124 |
-
" '
|
125 |
-
" '
|
126 |
-
" '
|
127 |
-
" '
|
128 |
-
" '
|
129 |
-
" '
|
130 |
-
" '
|
131 |
-
" '
|
132 |
-
" '
|
133 |
-
" '
|
134 |
-
" '
|
135 |
-
" '
|
136 |
-
" '
|
137 |
-
" '
|
138 |
-
" '
|
139 |
-
" '
|
140 |
-
" '
|
141 |
-
" '
|
142 |
-
" '
|
143 |
-
" '
|
144 |
-
" '
|
145 |
-
" '
|
146 |
-
" '
|
147 |
-
" '
|
148 |
-
" '
|
149 |
-
" '
|
150 |
-
" '
|
151 |
-
" '
|
152 |
-
" '
|
153 |
-
" '
|
154 |
-
" '
|
155 |
-
" '
|
156 |
-
" '
|
157 |
-
" '
|
158 |
-
" '
|
159 |
-
" '
|
160 |
-
" '
|
161 |
-
" '
|
162 |
-
" '
|
163 |
-
" '
|
164 |
-
" '
|
165 |
-
" '
|
166 |
-
" '
|
167 |
-
" '
|
168 |
-
" '
|
169 |
-
" '
|
170 |
-
" '
|
171 |
-
" '
|
172 |
-
" '
|
173 |
-
" '
|
174 |
-
" '
|
175 |
-
" '
|
176 |
-
" '
|
177 |
-
" '
|
178 |
-
" '
|
179 |
-
" '
|
180 |
-
" '
|
181 |
-
" '
|
182 |
-
" '
|
183 |
-
" '
|
184 |
-
" '
|
185 |
-
" '
|
186 |
-
" '
|
187 |
-
" '
|
188 |
-
" '
|
189 |
-
" '
|
190 |
-
" '
|
191 |
-
" '
|
192 |
-
" '
|
193 |
-
" '
|
194 |
-
" '
|
195 |
-
" '
|
196 |
-
" '
|
197 |
-
" '
|
198 |
-
" '
|
199 |
-
" '
|
200 |
-
" '
|
201 |
-
" '
|
202 |
-
" '
|
203 |
-
" '
|
204 |
-
" '
|
205 |
-
" '
|
206 |
-
" '
|
207 |
-
" '
|
208 |
-
" '
|
209 |
-
" '
|
210 |
-
" '
|
211 |
-
" '
|
212 |
-
" '
|
213 |
-
" '
|
214 |
-
" '
|
215 |
-
" '
|
216 |
-
" '
|
217 |
-
" '
|
218 |
-
" '
|
219 |
-
" '
|
220 |
-
" '
|
221 |
-
" '
|
222 |
-
" '
|
223 |
-
" '
|
224 |
-
" '
|
225 |
-
" '
|
226 |
-
" '
|
227 |
-
" '
|
228 |
-
" '
|
229 |
-
" '
|
230 |
-
" '
|
231 |
-
" '
|
232 |
-
" '
|
233 |
-
" '
|
234 |
-
" '
|
235 |
-
" '
|
236 |
-
" '
|
237 |
-
" '
|
238 |
-
" '
|
239 |
-
" '
|
240 |
-
" '
|
241 |
-
" '
|
242 |
-
" '
|
243 |
-
" '
|
244 |
-
" '
|
245 |
-
" '
|
246 |
-
" '
|
247 |
-
" '
|
248 |
-
" '
|
249 |
-
" '
|
250 |
-
" '
|
251 |
-
" '
|
252 |
-
" '
|
253 |
-
" '
|
254 |
-
" '
|
255 |
-
" '
|
256 |
-
" '
|
257 |
-
" '
|
258 |
-
" '
|
259 |
-
" '
|
260 |
-
" '
|
261 |
-
" '
|
262 |
-
" '
|
263 |
-
" '
|
264 |
-
" '
|
265 |
-
" '
|
266 |
-
" '
|
267 |
-
" '
|
268 |
-
" '
|
269 |
-
" '
|
270 |
-
" '
|
271 |
-
" '
|
272 |
-
" '
|
273 |
-
" '
|
274 |
-
" '
|
275 |
-
" '
|
276 |
-
" '
|
277 |
-
" '
|
278 |
-
" '
|
279 |
-
" '
|
280 |
-
" '
|
281 |
-
" '
|
282 |
-
" '
|
283 |
-
" '
|
284 |
-
" '
|
285 |
-
" '
|
286 |
-
" '
|
287 |
-
" '
|
288 |
-
" '
|
289 |
-
" '
|
290 |
-
" '
|
291 |
-
" '
|
292 |
-
" '
|
293 |
-
" '
|
294 |
-
" '
|
295 |
-
" '
|
296 |
-
" '
|
297 |
-
" '
|
298 |
-
" '
|
299 |
-
" '
|
300 |
-
" '
|
301 |
-
" '
|
302 |
-
" '
|
303 |
-
" '
|
304 |
-
" '
|
305 |
-
" '
|
306 |
-
" '
|
307 |
-
" '
|
308 |
-
" '
|
309 |
-
" '
|
310 |
-
" '
|
311 |
-
" '
|
312 |
-
" '
|
313 |
-
" '
|
314 |
-
" '
|
315 |
-
" '
|
316 |
-
" '
|
317 |
-
" '
|
318 |
-
" '
|
319 |
-
" '
|
320 |
-
" '
|
321 |
-
" '
|
322 |
-
" '
|
323 |
-
" '
|
324 |
-
" '
|
325 |
-
" '
|
326 |
-
" '
|
327 |
-
" '
|
328 |
-
" '
|
329 |
-
" '
|
330 |
-
" '
|
331 |
-
" '
|
332 |
-
" '
|
333 |
-
" '
|
334 |
-
" '
|
335 |
-
" '
|
336 |
-
" '
|
337 |
-
" '
|
338 |
-
" '
|
339 |
-
" '
|
340 |
-
" '
|
341 |
-
" '
|
342 |
-
" '
|
343 |
-
" '
|
344 |
-
" '
|
345 |
-
" '
|
346 |
-
" '
|
347 |
-
" '
|
348 |
-
" '
|
349 |
-
" '
|
350 |
-
" '
|
351 |
-
" '
|
352 |
-
" '
|
353 |
-
" '
|
354 |
-
" '
|
355 |
-
" '
|
356 |
-
" '
|
357 |
-
" '
|
358 |
-
" '
|
359 |
-
" '
|
360 |
-
" '
|
361 |
-
" '
|
362 |
-
" '
|
363 |
-
" '
|
364 |
-
" '
|
365 |
-
" '
|
366 |
-
" '
|
367 |
-
" '
|
368 |
-
" '
|
369 |
-
" '
|
370 |
-
" '
|
371 |
-
" '
|
372 |
-
" '
|
373 |
-
" '
|
374 |
-
" '
|
375 |
-
" '
|
376 |
-
" '
|
377 |
-
" '
|
378 |
-
" '
|
379 |
-
" '
|
380 |
-
" '
|
381 |
-
" '
|
382 |
-
" '
|
383 |
-
" '
|
384 |
-
" '
|
385 |
-
" '
|
386 |
-
" '
|
387 |
-
" '
|
388 |
-
" '
|
389 |
-
" '
|
390 |
-
" '
|
391 |
-
" '
|
392 |
-
" '
|
393 |
-
" '
|
394 |
-
" '
|
395 |
-
" '
|
396 |
-
" '
|
397 |
-
" '
|
398 |
-
" '
|
399 |
-
" '
|
400 |
-
" '
|
401 |
-
" '
|
402 |
-
" '
|
403 |
-
" '
|
404 |
-
" '
|
405 |
-
" '
|
406 |
-
" '
|
407 |
-
" '
|
408 |
-
" '
|
409 |
-
" '
|
410 |
-
" '
|
411 |
-
" '
|
412 |
-
" '
|
413 |
-
" '
|
414 |
-
" '
|
415 |
-
" '
|
416 |
-
" '
|
417 |
-
" '
|
418 |
-
" '
|
419 |
-
" '
|
420 |
-
" '
|
421 |
-
" '
|
422 |
-
" '
|
423 |
-
" '
|
424 |
-
" '
|
425 |
-
" '
|
426 |
-
" '
|
427 |
-
" '
|
428 |
-
" '
|
429 |
-
" '
|
430 |
-
" '
|
431 |
-
" '
|
432 |
-
" '
|
433 |
-
" '
|
434 |
-
" '
|
435 |
-
" '
|
436 |
-
" '
|
437 |
-
" '
|
438 |
-
" '
|
439 |
-
" '
|
440 |
-
" '
|
441 |
-
" '
|
442 |
-
" '
|
443 |
-
" '
|
444 |
-
" '
|
445 |
-
" '
|
446 |
-
" '
|
447 |
-
" '
|
448 |
-
" '
|
449 |
-
" '
|
450 |
-
" '
|
451 |
-
" '
|
452 |
-
" '
|
453 |
-
" '
|
454 |
-
" '
|
455 |
-
" '
|
456 |
-
" '
|
457 |
-
" '
|
458 |
-
" '
|
459 |
-
" '
|
460 |
-
" '
|
461 |
-
" '
|
462 |
-
" '
|
463 |
-
" '
|
464 |
-
" '
|
465 |
-
" '
|
466 |
-
" '
|
467 |
-
" '
|
468 |
-
" '
|
469 |
-
" '
|
470 |
-
" '
|
471 |
-
" '
|
472 |
-
" '
|
473 |
-
" '
|
474 |
-
" '
|
475 |
-
" '
|
476 |
-
" '
|
477 |
-
" '
|
478 |
-
" '
|
479 |
-
" '
|
480 |
-
" '
|
481 |
-
" '
|
482 |
-
" '
|
483 |
-
" '
|
484 |
-
" '
|
485 |
-
" '
|
486 |
-
" '
|
487 |
-
" '
|
488 |
-
" '
|
489 |
-
" '
|
490 |
-
" '
|
491 |
-
" '
|
492 |
-
" '
|
493 |
-
" '
|
494 |
-
" '
|
495 |
-
" '
|
496 |
-
" '
|
497 |
-
" '
|
498 |
-
" '
|
499 |
-
" '
|
500 |
-
" '
|
501 |
-
" '
|
502 |
-
" '
|
503 |
-
" '
|
504 |
-
" '
|
505 |
-
" '
|
506 |
-
" '
|
507 |
-
" '
|
508 |
-
" '
|
509 |
-
" '
|
510 |
-
" '
|
511 |
-
" '
|
512 |
-
" '
|
513 |
-
" '
|
514 |
-
" '
|
515 |
-
" '
|
516 |
-
" '
|
517 |
-
" '
|
518 |
-
" '
|
519 |
-
" '
|
520 |
-
" '
|
521 |
-
" '
|
522 |
-
" '
|
523 |
-
" '
|
524 |
-
" '
|
525 |
-
" '
|
526 |
-
" '
|
527 |
-
" '
|
528 |
-
" '
|
529 |
-
" '
|
530 |
-
" '
|
531 |
-
" '
|
532 |
-
" '
|
533 |
-
" '
|
534 |
-
" '
|
535 |
-
" '
|
536 |
-
" '
|
537 |
-
" '
|
538 |
-
" '
|
539 |
-
" '
|
540 |
-
" '
|
541 |
-
" '
|
542 |
-
" '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
543 |
]
|
544 |
},
|
545 |
-
"execution_count":
|
546 |
"metadata": {},
|
547 |
"output_type": "execute_result"
|
548 |
}
|
@@ -554,13 +640,13 @@
|
|
554 |
"from qdrant_client.http.models import Distance, VectorParams\n",
|
555 |
"\n",
|
556 |
"dimension = 1024\n",
|
557 |
-
"collection_name = \"ai-safety-
|
558 |
"qdrant_server = os.environ[\"QDRANT_API_URL\"]\n",
|
559 |
"qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ[\"QDRANT_API_KEY\"])\n",
|
560 |
-
"qdrant_client.create_collection(\n",
|
561 |
-
"
|
562 |
-
"
|
563 |
-
")\n",
|
564 |
"\n",
|
565 |
"vector_store = QdrantVectorStore(\n",
|
566 |
" client=qdrant_client,\n",
|
@@ -568,13 +654,13 @@
|
|
568 |
" embedding=embedding_model,\n",
|
569 |
")\n",
|
570 |
"\n",
|
571 |
-
"vector_store.add_documents(
|
572 |
"\n"
|
573 |
]
|
574 |
},
|
575 |
{
|
576 |
"cell_type": "code",
|
577 |
-
"execution_count":
|
578 |
"metadata": {},
|
579 |
"outputs": [],
|
580 |
"source": [
|
@@ -584,25 +670,25 @@
|
|
584 |
},
|
585 |
{
|
586 |
"cell_type": "code",
|
587 |
-
"execution_count":
|
588 |
"metadata": {},
|
589 |
"outputs": [
|
590 |
{
|
591 |
"data": {
|
592 |
"text/plain": [
|
593 |
-
"[Document(metadata={'source': 'https://
|
594 |
-
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page':
|
595 |
-
" Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page':
|
596 |
-
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page':
|
597 |
-
" Document(metadata={'source': 'https://
|
598 |
-
" Document(metadata={'source': 'https://
|
599 |
-
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page':
|
600 |
-
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page':
|
601 |
-
" Document(metadata={'source': 'https://
|
602 |
-
" Document(metadata={'source': 'https://
|
603 |
]
|
604 |
},
|
605 |
-
"execution_count":
|
606 |
"metadata": {},
|
607 |
"output_type": "execute_result"
|
608 |
}
|
@@ -1094,14 +1180,20 @@
|
|
1094 |
],
|
1095 |
"source": [
|
1096 |
"# Vector Store with recursive chunked documents\n",
|
|
|
|
|
|
|
|
|
1097 |
"\n",
|
1098 |
-
"
|
|
|
|
|
1099 |
"\n",
|
1100 |
"recursive_qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ[\"QDRANT_API_KEY\"])\n",
|
1101 |
-
"
|
1102 |
-
"
|
1103 |
-
"
|
1104 |
-
"
|
1105 |
"\n",
|
1106 |
"recursive_vector_store = QdrantVectorStore(\n",
|
1107 |
" client=recursive_qdrant_client,\n",
|
|
|
24 |
},
|
25 |
{
|
26 |
"cell_type": "code",
|
27 |
+
"execution_count": 6,
|
28 |
"metadata": {},
|
29 |
"outputs": [],
|
30 |
"source": [
|
|
|
40 |
},
|
41 |
{
|
42 |
"cell_type": "code",
|
43 |
+
"execution_count": 1,
|
44 |
"metadata": {},
|
45 |
"outputs": [
|
46 |
{
|
47 |
"name": "stderr",
|
48 |
"output_type": "stream",
|
49 |
"text": [
|
50 |
+
"/Users/jeevan/Documents/Learnings/ai-engineering-bootcamp/AIE4/mid-term/SafeGuardAI/.venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
51 |
" from tqdm.autonotebook import tqdm, trange\n"
|
52 |
]
|
53 |
}
|
|
|
56 |
"# Embedding model - snowflake-arctic-embed-l\n",
|
57 |
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
58 |
"\n",
|
59 |
+
"model_name = \"jeevanions/finetuned_arctic-embedd-l\"\n",
|
60 |
"embedding_model = HuggingFaceEmbeddings(model_name=model_name)"
|
61 |
]
|
62 |
},
|
63 |
{
|
64 |
"cell_type": "code",
|
65 |
+
"execution_count": 4,
|
66 |
"metadata": {},
|
67 |
"outputs": [],
|
68 |
"source": [
|
69 |
+
"from enum import Enum\n",
|
70 |
+
"from typing import List\n",
|
71 |
+
"from langchain_community.document_loaders import PyMuPDFLoader\n",
|
72 |
+
"from langchain_core.documents import Document\n",
|
73 |
+
"import asyncio\n",
|
74 |
+
"\n",
|
75 |
+
"class PDFLoaderWrapper():\n",
|
76 |
+
" class LoaderType(str, Enum):\n",
|
77 |
+
" PYMUPDF = \"pymupdf\"\n",
|
78 |
"\n",
|
79 |
+
" def __init__(self, file_path: str | List[str] , loader_type: LoaderType = LoaderType.PYMUPDF):\n",
|
80 |
+
" self.file_path = file_path if isinstance(file_path, list) else [file_path]\n",
|
81 |
+
" self.loader_type = loader_type\n",
|
82 |
"\n",
|
83 |
+
" async def aload(self) -> List[Document]:\n",
|
84 |
+
" all_docs = []\n",
|
85 |
+
" for file_path in self.file_path:\n",
|
86 |
+
" if self.loader_type == self.LoaderType.PYMUPDF:\n",
|
87 |
+
" try:\n",
|
88 |
+
" loader = PyMuPDFLoader(file_path)\n",
|
89 |
+
" docs = await loader.aload()\n",
|
90 |
+
" all_docs.extend(docs)\n",
|
91 |
+
" except Exception as e:\n",
|
92 |
+
" print(f\"Error loading file {file_path}: {e}\")\n",
|
93 |
+
" continue\n",
|
94 |
+
" return all_docs\n",
|
95 |
+
"\n"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "code",
|
100 |
+
"execution_count": 7,
|
101 |
+
"metadata": {},
|
102 |
+
"outputs": [],
|
103 |
+
"source": [
|
104 |
"pdf_loader = PDFLoaderWrapper(\n",
|
105 |
" documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF\n",
|
106 |
")\n",
|
107 |
+
"documents = await pdf_loader.aload()"
|
108 |
+
]
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"cell_type": "code",
|
112 |
+
"execution_count": null,
|
113 |
+
"metadata": {},
|
114 |
+
"outputs": [],
|
115 |
+
"source": [
|
116 |
+
"from langchain_experimental.text_splitter import SemanticChunker\n",
|
117 |
+
"\n",
|
118 |
+
"\n",
|
119 |
"\n",
|
120 |
"text_splitter = SemanticChunker(embedding_model, buffer_size=5, breakpoint_threshold_type=\"percentile\",breakpoint_threshold_amount=90)\n",
|
121 |
"\n",
|
|
|
124 |
},
|
125 |
{
|
126 |
"cell_type": "code",
|
127 |
+
"execution_count": 8,
|
128 |
"metadata": {},
|
129 |
"outputs": [],
|
130 |
"source": [
|
|
|
140 |
},
|
141 |
{
|
142 |
"cell_type": "code",
|
143 |
+
"execution_count": 9,
|
144 |
"metadata": {},
|
145 |
"outputs": [],
|
146 |
"source": [
|
|
|
154 |
},
|
155 |
{
|
156 |
"cell_type": "code",
|
157 |
+
"execution_count": 11,
|
158 |
"metadata": {},
|
159 |
"outputs": [
|
160 |
{
|
161 |
"data": {
|
162 |
"text/plain": [
|
163 |
+
"['7e6a73422dd04376b5212e1c71275f5c',\n",
|
164 |
+
" '9fc355b29e534d4d9ea5d87d8c9bf77c',\n",
|
165 |
+
" '5e6a933b1bae4db7922b50a8a6bab44d',\n",
|
166 |
+
" '83ddb01ec3954f1dbdda744124a8c76d',\n",
|
167 |
+
" 'd6e795fad13242498f11b8bfd8216f7d',\n",
|
168 |
+
" 'ba26430bfb714249ac4ce151f6819ac2',\n",
|
169 |
+
" '06818b77aa704eeba5c7499d653c0433',\n",
|
170 |
+
" 'bfff4b85b5b440869b1fadcaf0ba0489',\n",
|
171 |
+
" '7810842f60234aa0bee8c81a6577fc73',\n",
|
172 |
+
" '85fbfbfdd52349dca312edcbd70af79e',\n",
|
173 |
+
" 'abddd6d787a9496c9c3f93d7386cd1c4',\n",
|
174 |
+
" 'da0779d479454c1ba82407a68d536997',\n",
|
175 |
+
" '39b0ea70a678462fbf305d1096338cc0',\n",
|
176 |
+
" 'aff239aa6f5c447d885b6f7b0346ed1f',\n",
|
177 |
+
" 'e486a65b3a414225aaee7cb21e44cd4a',\n",
|
178 |
+
" '16014bcfe609485d919b310f040ff735',\n",
|
179 |
+
" '163bee1b3971476d97bf3c92c2aafd32',\n",
|
180 |
+
" '10806d74287549288bc664e03e1528c8',\n",
|
181 |
+
" '1c8cb58feffc4bd38061f6bb47fc9974',\n",
|
182 |
+
" 'e48046904f4f44d8a5042385d618b84b',\n",
|
183 |
+
" 'adf374bab6264d75b4598e9e6b0c31f1',\n",
|
184 |
+
" 'dc30905487ec493398c89c5db30c8e41',\n",
|
185 |
+
" 'e571028cd6804e259d217f3936ab1ac5',\n",
|
186 |
+
" '5708400cf26f4f5ab83c151cab3f29b5',\n",
|
187 |
+
" '3d66be0e03ed4c6d8d6d4bd89c3885bc',\n",
|
188 |
+
" '6c1368d79ada4a87b0bcf0d5ecbb1d05',\n",
|
189 |
+
" '3364bb5cb27f4df9a84e671fc47f3299',\n",
|
190 |
+
" 'feea03d2dc68453fb545cf746ec9be62',\n",
|
191 |
+
" 'f1ba6b845006430798e4191b7bf63b7e',\n",
|
192 |
+
" '778f730ed4ce40bc872cac0b1320ee7d',\n",
|
193 |
+
" '322981b8743a4bd88c2366901593feae',\n",
|
194 |
+
" '48a6cebe5639442f87abdff4d4d7ad8f',\n",
|
195 |
+
" 'fcaab76461d04caf8cddaae701bc1ced',\n",
|
196 |
+
" '41d70fefc71a436892b6f724b4643bf8',\n",
|
197 |
+
" '01bdb6c2fa9b43879231c6aa88530bf6',\n",
|
198 |
+
" '6f4da3635bca41ecb6a3b5f6c53b842b',\n",
|
199 |
+
" 'c92f7b0ece1d4e12a116cb43ac7ab1d2',\n",
|
200 |
+
" '807f090f360c4a5b9a29387a6f73cb55',\n",
|
201 |
+
" '7f387826030348e691e24fb3a14cb470',\n",
|
202 |
+
" '8146a7c2a6784cc7bbbe698ea7c1676c',\n",
|
203 |
+
" '8f30db72c63b4b1b8cf598becd646e94',\n",
|
204 |
+
" 'ed405fa7b4f64e8db2944f5220a5c8fe',\n",
|
205 |
+
" '8f1aedce957141bea4dd5d532188567f',\n",
|
206 |
+
" '4e887209f4a64ee3b7b3f8232534a9d9',\n",
|
207 |
+
" '8ccb6b9f04614326ba3bf5f96e59a960',\n",
|
208 |
+
" '2a0f3e2b88da49efb934430eef0c447b',\n",
|
209 |
+
" '27c3630e2f4048b18454e1477885ef6d',\n",
|
210 |
+
" '522d9265216c448db3859c7ccf079bc0',\n",
|
211 |
+
" '29055cef02b14cb8adfb95db7359014d',\n",
|
212 |
+
" '458ae662b668416397a84995a6ef6e9c',\n",
|
213 |
+
" '3a2974f8cce8431ebb562d72fad43809',\n",
|
214 |
+
" '16c5eec5b4574d51bc35125af1624f57',\n",
|
215 |
+
" 'ce0060ece6724eac9999b0f5bd3e1dd9',\n",
|
216 |
+
" 'ae1d9dd5a3264480897857ec12bca123',\n",
|
217 |
+
" '69e6715305a6429db45a9d5db5ffde10',\n",
|
218 |
+
" 'f504045bcf334cc0b53f44fca8c6712d',\n",
|
219 |
+
" '3120dcf898d641ba99dfca96bb916662',\n",
|
220 |
+
" '792a483c0aa74dd48c0df8d92fcdd46d',\n",
|
221 |
+
" '192e82b275f84c60b06fc5bd18de1fbc',\n",
|
222 |
+
" '9b434b7acb85487883b924ac8191fbce',\n",
|
223 |
+
" '57e81edbe7604de4bc470a8f44feafbb',\n",
|
224 |
+
" 'a2267ad4ad2a4aad8b4cbcb36f8274ac',\n",
|
225 |
+
" '1d4aee78da4c43bdb16cccb4cc799b32',\n",
|
226 |
+
" '5b4987809648438d868140daf4d477e4',\n",
|
227 |
+
" '6218bded0a5a4d4e93cd8fca57072ff9',\n",
|
228 |
+
" '4e6d3c750f42485ea29cd176eec1cd62',\n",
|
229 |
+
" '01ca0d114b384e7fad435ac2338e0103',\n",
|
230 |
+
" '80dc9865efae432a952eec9f8c46f778',\n",
|
231 |
+
" 'e1e9da01efe54ddf8c98771b2d1135f1',\n",
|
232 |
+
" '3aae2eb905054cd7812127a4e78534dc',\n",
|
233 |
+
" '8edf2a7164614da7a283abb6ecbe9088',\n",
|
234 |
+
" 'cd82af66911c478e8d37d8cef1729de1',\n",
|
235 |
+
" 'be33bb91fb3d4208933839769fb98dda',\n",
|
236 |
+
" '0edbcc77fa6d4735a959dfe489a4204f',\n",
|
237 |
+
" '85abc246a02c4ab1aba4a39a7994c61f',\n",
|
238 |
+
" 'b9f2b7053e074e909faf2e691bf9c57e',\n",
|
239 |
+
" 'd6f1812813c547ac81c6c64d0bc28ef1',\n",
|
240 |
+
" 'c3a273cc36d7498c94255f47c2dcfa38',\n",
|
241 |
+
" '708872346a8d4731b77ac2b603c8b9a9',\n",
|
242 |
+
" '40577ac91f2f46b2be704852b3b7da73',\n",
|
243 |
+
" '012e24872e4843829d0a8c57897354f6',\n",
|
244 |
+
" '0aaeca11740c45ffab1ac96fec61d0b6',\n",
|
245 |
+
" '83c9873b978f49839a8b801c7c3456bf',\n",
|
246 |
+
" '6ea0896cf6244e9393323f48ceae351c',\n",
|
247 |
+
" 'cbbbc204feff4106ad07182a17663b65',\n",
|
248 |
+
" 'c96038d6efc6460587351b73ce06c477',\n",
|
249 |
+
" '65ff657993f7493a85bf7b8545822958',\n",
|
250 |
+
" 'a5f1605302dd446498e3d97d9a65ff5e',\n",
|
251 |
+
" 'e67ff0603d934f8daf490506e095654e',\n",
|
252 |
+
" 'fbb4ddc0fffa43c693b73f35e6b8b046',\n",
|
253 |
+
" '52b0f10d941b44ce91ab2d201c6abc39',\n",
|
254 |
+
" '8b87551f187741f3841131c45b1f98cf',\n",
|
255 |
+
" '8546fe434cce46a98183e85f9fbe508c',\n",
|
256 |
+
" '8b69c605429e4843871064329c727940',\n",
|
257 |
+
" '2c6215e1bf8148a39e1ee5044e7d25f6',\n",
|
258 |
+
" 'a800aa91c46a4ac6938205ccefbd8cc2',\n",
|
259 |
+
" '39c5beb614c9410f9db1006fe064c89d',\n",
|
260 |
+
" '61158196e2274bb6a6116040e306aa41',\n",
|
261 |
+
" '4ecdf2b619444dfeafdfa08c0d7a188b',\n",
|
262 |
+
" '2ef6c61937a346c4b7d45044d0ab9e3f',\n",
|
263 |
+
" '3aff173caadf422584f6d0d00b238250',\n",
|
264 |
+
" '199301aec5584b31bb173ef65af3c226',\n",
|
265 |
+
" '0329bbf676454d5cb83a754634a3b039',\n",
|
266 |
+
" '8892f081f8d74a519005c2aa0103343b',\n",
|
267 |
+
" 'f248adeb051a4f84b205f4d7f25c6bca',\n",
|
268 |
+
" '1e95159724754c3c827b4705357682a7',\n",
|
269 |
+
" '4578074dbac549799b4415cac01bb42a',\n",
|
270 |
+
" '6e0644408e9c44479f7a39aa2374cce7',\n",
|
271 |
+
" '5568071c77c14cbc810df23635d8a570',\n",
|
272 |
+
" '4866d8188ebd4a73a6b08c8952499880',\n",
|
273 |
+
" '317311af1cb24262a493af4dbd711502',\n",
|
274 |
+
" 'd8030df3c00141dab0a63ea4994b12ee',\n",
|
275 |
+
" '55019342d242454f81794ec6fc4ab672',\n",
|
276 |
+
" '5f6a23f645e94b8ba355cbbbc0edf86f',\n",
|
277 |
+
" '93f909dfc6884ea68be8f42f7222efeb',\n",
|
278 |
+
" '61ee9886027b46d8ac7feee9af1c135c',\n",
|
279 |
+
" '4328e2ca81584d3384e9dae389aa7a42',\n",
|
280 |
+
" 'fa9e59c2a3704dcc98bee24820a3b24e',\n",
|
281 |
+
" 'ca23ebefba4a4785b0c159b601b2684e',\n",
|
282 |
+
" '795ce21fd9ee41d2a307b17d1d2caaee',\n",
|
283 |
+
" '64d5b336913a48c5bca2b1a13950a84f',\n",
|
284 |
+
" '430128844f3a44239e1b8089cfa3cbc8',\n",
|
285 |
+
" '590ba6b4e9e347c9950c1e61eedf4a0a',\n",
|
286 |
+
" 'eb8292f67a2c491aab48a8721437801d',\n",
|
287 |
+
" '74fcf851178146beaf87dacf7e9880b3',\n",
|
288 |
+
" 'ba8b6e6dc7594131966758ee8cc1f281',\n",
|
289 |
+
" 'db5ff7f95ce84f799c44faa9174f9255',\n",
|
290 |
+
" 'f0da98b07de44206bbcce8bf8af5d03a',\n",
|
291 |
+
" '6aa11aa8b65e4c948e0e7902324e335c',\n",
|
292 |
+
" 'fa2a4dd8878a438c91bf462657c655e0',\n",
|
293 |
+
" '22b4dcec8583498b9f5c9272fa03fd03',\n",
|
294 |
+
" 'e89a54b5d0b24306a11f5ff4b34eea50',\n",
|
295 |
+
" 'f930c1ba25a1498d9964df40e039a2b6',\n",
|
296 |
+
" 'f3a5320dcd7741c7a7a7b5414a698bca',\n",
|
297 |
+
" '8aa395d5d9bc46eba9a4e6cb1424a5b0',\n",
|
298 |
+
" '315cb577037c4d468e6a109876406c2b',\n",
|
299 |
+
" '6ecdcd5051a1403da50545e71d6da209',\n",
|
300 |
+
" 'f440b367ddc64c3796d37c681e2f2611',\n",
|
301 |
+
" '22b4bcfc59f242e68724607b592a1570',\n",
|
302 |
+
" '35a5bacfa0724c47a3ef49dd707216a3',\n",
|
303 |
+
" '65dc165cf9604918870e1f8bc659edc9',\n",
|
304 |
+
" 'a39b7bb6d500424f89b0397cd58242a9',\n",
|
305 |
+
" 'b797378ce695439f9e33909cd9b59a19',\n",
|
306 |
+
" '5c807877b26447aca4cbd19a6d437706',\n",
|
307 |
+
" '7e47508998b54b01a4028350464b0651',\n",
|
308 |
+
" 'eb0ed3dcc27c413f86b97c6373a95b2d',\n",
|
309 |
+
" '36b590a2bce247b19c2493d77fc79f35',\n",
|
310 |
+
" '7eeb74d2a0a842f88a6cf54c22fdaf78',\n",
|
311 |
+
" '03aaa157c1b64b4583a22a8f68a041d5',\n",
|
312 |
+
" 'a52aa607634e42debe74aa23703ced9a',\n",
|
313 |
+
" 'fb61c2ab526a40d7973b277302c28884',\n",
|
314 |
+
" '133a7abc2b6744e1bb39936bfe4ab153',\n",
|
315 |
+
" '4819c2e044bb40baa0955d500726ae25',\n",
|
316 |
+
" '9a097681fac5463eab74d6b8d037c7f2',\n",
|
317 |
+
" 'eed0c6e24a7a4d60b50fd77560f09515',\n",
|
318 |
+
" '51c53cf71b1049108115f72111dd0628',\n",
|
319 |
+
" 'eb8eecc5375e4055b0f9dbb3ef9de2d5',\n",
|
320 |
+
" '35f84f29041243ef8c706f29c4291885',\n",
|
321 |
+
" '18b009a2f29349d3898cdf36945d8a79',\n",
|
322 |
+
" 'fc123881e9034da0822eb70738d6d32c',\n",
|
323 |
+
" 'b4f6c6f6f5a842b38d83984c338325a3',\n",
|
324 |
+
" 'f20d2030d1364928881c784f431e5bfc',\n",
|
325 |
+
" '493d5cf218ac4d4fbebfbc86cac36765',\n",
|
326 |
+
" 'b4126c1ea602422fa497d73be286c109',\n",
|
327 |
+
" 'a255ec54a9e944bf8e8ed4fb3750ea88',\n",
|
328 |
+
" '920aeb2be1694376992f641365e8bd43',\n",
|
329 |
+
" '8307d5c77e4346af954f9202f56420a3',\n",
|
330 |
+
" 'c91a1252fcd843fdaa67bf1716d37441',\n",
|
331 |
+
" '60492f40e6814fbca9475582b1f1b126',\n",
|
332 |
+
" '0a08b2e235394021a43c9e7b5ae9550d',\n",
|
333 |
+
" '8a2746def9a64df29cfc906498503baf',\n",
|
334 |
+
" 'd805a2f9efa549138d9e008beb51f92c',\n",
|
335 |
+
" 'af9273ba0f9d4124a83c4f8a1de7cf8f',\n",
|
336 |
+
" '6c58a810f69d4d7ca0d8de99fd52c776',\n",
|
337 |
+
" 'f407e14346b64292a81c666f5ab45ebe',\n",
|
338 |
+
" '9b0ba54ac74d4982b5bea390beb42fa5',\n",
|
339 |
+
" '54ea0f7eeb694034b7ca0a81b8a2439d',\n",
|
340 |
+
" '6b407ca21eea4650b52f1f06d5f50513',\n",
|
341 |
+
" '298b573bcc96424a8d20f2237dfafdf2',\n",
|
342 |
+
" 'e9ed96c3a795415e9fc7a6cfee48fdfb',\n",
|
343 |
+
" '1fe33d1cac824d38a7e7c935cf9d95e2',\n",
|
344 |
+
" '3dbf2b5152f243939ffd87fa068e469b',\n",
|
345 |
+
" 'ded73ec56ff94f5ca844b8f56a55c560',\n",
|
346 |
+
" 'cb3bf04b763249a7853cde719e0bfe37',\n",
|
347 |
+
" '7411abd2e00a46189a3341ff1a4077c7',\n",
|
348 |
+
" '49a9a48d88cb4b8ea9fb28d5300e6279',\n",
|
349 |
+
" '1aeb055eb2984cb1a67d1f8ba7b40975',\n",
|
350 |
+
" '4662c1887c504009bb5f3f6edfb8d8e0',\n",
|
351 |
+
" 'e1145b7317bc4695898567041071d13b',\n",
|
352 |
+
" '5124c1648cc1458c9747f3eea6a2a301',\n",
|
353 |
+
" 'e4a2f6f9d63047d1b768faa280c8686f',\n",
|
354 |
+
" '955259cbf75a427395cefe5bc5834d08',\n",
|
355 |
+
" 'af8c9f932d9a45d2956e32b6c2762dc9',\n",
|
356 |
+
" '8f03b01f28e6401f8779a8b98dd0e584',\n",
|
357 |
+
" '4c80fe955cce49c2866e565a7d2c1c58',\n",
|
358 |
+
" 'b35c3a592cc549bf8f37a4b470759ddb',\n",
|
359 |
+
" '81e348d53d7f47a1a8b63ce096bd5dc0',\n",
|
360 |
+
" '143d5eef78ab4561a97e1a54ad54bd6a',\n",
|
361 |
+
" 'f9e56fd50d79462bb649902940e78aa7',\n",
|
362 |
+
" '7ba07d3105ca4e30b3e292aaa315ab0e',\n",
|
363 |
+
" 'e55b0a37dfd34565a8b896fe1c30c5c0',\n",
|
364 |
+
" 'ff0bad0434074a789797a01fb93828fb',\n",
|
365 |
+
" '1263eee1196c4f52a87748bf1fc32faa',\n",
|
366 |
+
" '7e9f6b49537f4f7d86eb167c54ef53fd',\n",
|
367 |
+
" '517f9f9b9d664f05b12803bbade89cfe',\n",
|
368 |
+
" '3f7e8ca69eca44f2b02024de41a60747',\n",
|
369 |
+
" '36361f8ed6c84a1c88cbfa3c44620f94',\n",
|
370 |
+
" '3b1a1dd74fa44410a5f7b5923d194046',\n",
|
371 |
+
" '20c990f82aae4c27a65f427a61a4ff2e',\n",
|
372 |
+
" '6cf1b79772db47119861ae7368cdcfb9',\n",
|
373 |
+
" 'bd9c8cbb658540d5ba56054c0b9001e0',\n",
|
374 |
+
" 'c6d74fd0f00343f08dba101d12919539',\n",
|
375 |
+
" 'd69c356e507c4ec2a9894979c0dacec1',\n",
|
376 |
+
" 'ab981093667c4dfbb699a23631f0ece8',\n",
|
377 |
+
" '067cc55582b949b182315c7a782cb871',\n",
|
378 |
+
" '149f1899cdd5435693e03a6fe34faafd',\n",
|
379 |
+
" 'f11e46b3744d48359af1289b2c329670',\n",
|
380 |
+
" 'baacb9574ff04cdba608ab1213836623',\n",
|
381 |
+
" '91b3470ebdcf4105a0fd5b5ed8131588',\n",
|
382 |
+
" '202c5981c8874a0ba7516eb30361183c',\n",
|
383 |
+
" 'f12a70efb5534d4ba33731b2e02f3c16',\n",
|
384 |
+
" '9dd7f662df5346238845363b93374d67',\n",
|
385 |
+
" '68b06be60d17429b8bb72c3f48d9513e',\n",
|
386 |
+
" '2f4ea4436156470fa9c32c2b045100f7',\n",
|
387 |
+
" 'a3a5ffc9711b4feb9f2fa6a7b65017a1',\n",
|
388 |
+
" '461fd5b2359f4234af45e0c14dc754ee',\n",
|
389 |
+
" 'e9521091e9d54c469b426e451dcc549a',\n",
|
390 |
+
" '5f985651e7564c3abec885352e0e8c34',\n",
|
391 |
+
" '24cfff725b2b49e6aa0888777e6e6377',\n",
|
392 |
+
" 'c20f6353f47541a4bd6add6e2b49f5f7',\n",
|
393 |
+
" 'c17ad6f1db804af98021992177d05901',\n",
|
394 |
+
" '0c08d3080f1f403f8a25e9aadba4d515',\n",
|
395 |
+
" '2316394619e44d9eb3ec83fcadb03d71',\n",
|
396 |
+
" '109c1b954c114152800ab3dff35897b3',\n",
|
397 |
+
" '6f2da3f0de9e4821920927cd442df7f2',\n",
|
398 |
+
" 'cb610e59a9c54cfc8f6961cc1c84e9c6',\n",
|
399 |
+
" 'f35424710f1d401fa7934f6d5c418235',\n",
|
400 |
+
" '069ff4b21bc943bfa9dcdfa4fabc7d32',\n",
|
401 |
+
" '3567a864869841da9068c9646b6e35f3',\n",
|
402 |
+
" '4665091843c34983906f4dddb04908d8',\n",
|
403 |
+
" '94179992c73a4c4187ec6bbcefe79ce0',\n",
|
404 |
+
" '62d850b75fea48dabdf133627f8bac4c',\n",
|
405 |
+
" '3a27b1fd38d541d79cdf8834bde3e08b',\n",
|
406 |
+
" 'd37aac990eed4b4eaef154355283d51b',\n",
|
407 |
+
" 'a46cba7542234baaa29ac6349c1a8ff7',\n",
|
408 |
+
" 'dd54b984ac1048a693d12641e5d17fb0',\n",
|
409 |
+
" '967f7918604a4ee386c41ec4fdc78de4',\n",
|
410 |
+
" 'bbe13bb721054544b560c29eb101ec7a',\n",
|
411 |
+
" 'ee58b2e1ef584f73a16f947d4c2dac2b',\n",
|
412 |
+
" 'cb821f21e675460ca22a0feb58787658',\n",
|
413 |
+
" 'c87c4709fded48a6956776d3e2236040',\n",
|
414 |
+
" '981dde0ace23485e94aed52aa0ff9a7a',\n",
|
415 |
+
" '90f53d8039524f75ab132e7e7507c0c4',\n",
|
416 |
+
" 'ec166cb9fbac4947869a03b52e518401',\n",
|
417 |
+
" 'e9796c57fadf4226be635cacac670a7d',\n",
|
418 |
+
" '0d5b0f646568418ab53b0a1c7c3f649e',\n",
|
419 |
+
" 'a7477b8261624f5a9b79052649e5adde',\n",
|
420 |
+
" '1498599a5522410d9d72fd3f382d0093',\n",
|
421 |
+
" '7b32b94f550147ec93c49889c72ddab4',\n",
|
422 |
+
" 'dc38bf202eef471ea1abbdd6a44008b5',\n",
|
423 |
+
" '5789a475a4a6453eba7522b88536a56c',\n",
|
424 |
+
" 'ef17f685fb184e179587886fb3cf2fca',\n",
|
425 |
+
" '976d8b607ce941048d58376ce9cc86d3',\n",
|
426 |
+
" '07303a1defb046029ac61adc07dae7ec',\n",
|
427 |
+
" '9eb77576298742709ca790ef7114fbb7',\n",
|
428 |
+
" '618846b8c0ab4b3bb1a740e45d6c1f53',\n",
|
429 |
+
" 'dec492d1d29f4416957f1156d8c2982d',\n",
|
430 |
+
" 'd059a5a5d19c4490b17fca7d049bcabe',\n",
|
431 |
+
" 'ee4edd8f7f9b410bb60fd55bd6ac1010',\n",
|
432 |
+
" 'dedfda38a2a942558716d0c094b583a6',\n",
|
433 |
+
" '9499ddff0eef4d368e72a17a80df12b0',\n",
|
434 |
+
" '67c059163f6b49e1bd98ee30cd8913c1',\n",
|
435 |
+
" '1400b471ea774a51b67ed115ed3d2629',\n",
|
436 |
+
" 'eb168f728da042fb99b221e736cc58ba',\n",
|
437 |
+
" '2e13e8baa94c413fb36354751d6500a7',\n",
|
438 |
+
" '7a05c4e1ec244287bdc53434f385237e',\n",
|
439 |
+
" '77c3f0079d3442e59bf2a4292e3b9889',\n",
|
440 |
+
" 'b3ab4e9272914e1fa6ce549f615bb5af',\n",
|
441 |
+
" 'a219215029174816ad2ad730a419e4ab',\n",
|
442 |
+
" 'a97e6c5a132345cc9129513e8fd4f629',\n",
|
443 |
+
" '3c2f44dfb00f4a9cb83212190710d165',\n",
|
444 |
+
" '43717b33878246bd92a12a79c24eb6ad',\n",
|
445 |
+
" '00983beb4f4e4f0c85063db2412aa0a4',\n",
|
446 |
+
" '9ee6cebe64fe4c94aa8022db635a1834',\n",
|
447 |
+
" '1394c2cbe97b4561bf2871cab16ba969',\n",
|
448 |
+
" '70046c7fbe104d9cac9593156dfa3baa',\n",
|
449 |
+
" 'ccf1ade95ce24d33a915ea4723ee01e4',\n",
|
450 |
+
" 'cffd6cd2d3b34fbaa32d677294d2b811',\n",
|
451 |
+
" '8a7ebcd4521b432b803d0d6fa2c035c6',\n",
|
452 |
+
" 'a9d009255da74706825461a6c4c1aed8',\n",
|
453 |
+
" '377153d600df475f91971ea413ba1eb0',\n",
|
454 |
+
" '0f7c81ba23324798bbc25bf941f8e4b0',\n",
|
455 |
+
" 'f0a482198a1440628370e7bed1a85bc9',\n",
|
456 |
+
" '7da7c179070644f99e1b2fdab4b7fb75',\n",
|
457 |
+
" '25932ef3bd104c1aa25c7ba24c8225dd',\n",
|
458 |
+
" '62ed7a98dc964dc5b05838dc7fac652c',\n",
|
459 |
+
" '9a42744ce7ce40218341bfc98958244f',\n",
|
460 |
+
" '23b6cb8e8f1b403aa472a3b2138c75b1',\n",
|
461 |
+
" '3616f0b38d964fec980bd3d1353fbcae',\n",
|
462 |
+
" '0b23a4d006a0434d8a17a3c85675cc50',\n",
|
463 |
+
" 'c4dd0bc2654c48b88c4c645149145461',\n",
|
464 |
+
" '44bbccb8264b4831a95a7134fddb6986',\n",
|
465 |
+
" '9cef11cbe3c3455bbe304ca4b9f7a761',\n",
|
466 |
+
" '84c93c3fef4f48f1bd977dde14de25e7',\n",
|
467 |
+
" 'd410724332284b6491672429905d8841',\n",
|
468 |
+
" '58102d923dbb405fb8543001a39b2fa9',\n",
|
469 |
+
" '82a54c0a1a2d4297ab9667b1613e1a62',\n",
|
470 |
+
" '4dead17ef7144f4d97b447fda4aebe43',\n",
|
471 |
+
" '612f6cf6123e400e95e8ff7f0253d3fd',\n",
|
472 |
+
" '7a6f3cf51abc41a09254b70d01c3d7c0',\n",
|
473 |
+
" '9614ad1cda3f40b1942c6e2f1c6ac785',\n",
|
474 |
+
" 'ec99982b671b4bb799763c9b45b43af1',\n",
|
475 |
+
" '23cdc25d55214f5ab0728a9af99115a3',\n",
|
476 |
+
" '88c36979cd574d26a4abcec3d22c3dc6',\n",
|
477 |
+
" '00ddf8f6f08b415d97262cd9ce31037e',\n",
|
478 |
+
" '25eac9c21b8b497db1127b023daafac4',\n",
|
479 |
+
" 'd870e4c7e50f43e7b02258c5c74cf729',\n",
|
480 |
+
" 'ad27ad6c04864d9694ace786bc6453a7',\n",
|
481 |
+
" '568443576592490db5c3e54b196bf078',\n",
|
482 |
+
" '14db0f53ad8c471c8ea14c74f5d92f23',\n",
|
483 |
+
" 'aeacfbc1adda40ee81b7c8bf4e27ed81',\n",
|
484 |
+
" 'be506fd2d1ff4430af8fff44ab5d5dd4',\n",
|
485 |
+
" 'ad169d2df3e74117b8432cbb80b23682',\n",
|
486 |
+
" 'ca6e653042c54fbe8f66c715d0799e1b',\n",
|
487 |
+
" '34e46b2b561845b3bb58d8ae1497433c',\n",
|
488 |
+
" 'ef6470836d2f42fb96fa4d8a76029009',\n",
|
489 |
+
" 'e7f547eba08f46529116dcd17b971d44',\n",
|
490 |
+
" '0badc20385f74c7f8b21e6d8b4b77285',\n",
|
491 |
+
" '96e2d5a8b73941e299787afb22b56447',\n",
|
492 |
+
" '22bacdff64dd41899670c930d0bdc3a9',\n",
|
493 |
+
" 'fcc0f6618237436d98bdb9ee918f1689',\n",
|
494 |
+
" '840adc73a593400eb79ae99a972a8237',\n",
|
495 |
+
" '8a929883791a4584a2229f68d07d4e40',\n",
|
496 |
+
" 'f2bfce8fb0f845cfbfd0dd7c68489a0b',\n",
|
497 |
+
" '45c77fa7f00448cbad485da8fff9d59c',\n",
|
498 |
+
" '9ae29211013c49adab26fd76000de05f',\n",
|
499 |
+
" 'd9b4f6ff9eab4ab492262766d7dcc5f1',\n",
|
500 |
+
" 'edcce910e0e342f58b9aa60751b2f9e7',\n",
|
501 |
+
" '0b9ddfa4be1e466c92d6da22021263e7',\n",
|
502 |
+
" 'be8a0b13fe634a558a8f3787da07d3c6',\n",
|
503 |
+
" 'c7e8436b98534ab9992e7598d82f62a1',\n",
|
504 |
+
" 'a3d5e62deba04f64ae592da593863837',\n",
|
505 |
+
" 'a058d293c88842769184bd31cdd46157',\n",
|
506 |
+
" 'e134c211bbde4cee9c184481c861eb28',\n",
|
507 |
+
" 'f2853c4d3da9403ea054e1acbfb14c23',\n",
|
508 |
+
" '6095ecdf409e49e0894b8b409bfa61ef',\n",
|
509 |
+
" '9fcd650ff6e84d53b6c0d85cf51ee0e3',\n",
|
510 |
+
" 'eca00c45032a4b548ca0bbfca9123e52',\n",
|
511 |
+
" '20599a4f87034dba864a683a98aaa7c5',\n",
|
512 |
+
" 'af646536cdbc48f38149f5d37f768f6f',\n",
|
513 |
+
" '0849d591aace432e877b87405e8875f6',\n",
|
514 |
+
" '93aab2bc54894dad96b714936951540f',\n",
|
515 |
+
" '01f8251387af402fb3babd21db484b1f',\n",
|
516 |
+
" 'c8d5ec4643e9488284a72693ab903ee1',\n",
|
517 |
+
" 'f1ccf53b5cd34d12ab12e2bc1a54cc65',\n",
|
518 |
+
" 'ffcddc3efcf646449f996ca0c48b5dda',\n",
|
519 |
+
" '853dffcece54408db6d851745bbb1b6b',\n",
|
520 |
+
" '692f4bc7425f419ab5d716aae8553ebd',\n",
|
521 |
+
" '9260733ca60548f2a6385ab1ce923865',\n",
|
522 |
+
" '34705d1443714e7f9684d0d9bd67b2e1',\n",
|
523 |
+
" '4a9e6f60bd564c5c97f6babe95ea8fe0',\n",
|
524 |
+
" 'd5ab615c8ea64428b5dcb7214cada1fe',\n",
|
525 |
+
" '062c0c4daacf42da865484f4a30a588e',\n",
|
526 |
+
" '029dcc932f6b479aa20b3dde7e98423c',\n",
|
527 |
+
" '66542637c66842d9a0218f941ae9b5aa',\n",
|
528 |
+
" 'c2af2bc898b74a1eb9694292dd639f07',\n",
|
529 |
+
" 'a7e2168cb5644703aa95839b4ce030f5',\n",
|
530 |
+
" '18b4bb366fa84ade94295d9811a6bc03',\n",
|
531 |
+
" '4c405bded8094ac99314eca15a158636',\n",
|
532 |
+
" '555bcd57341047d49fbbf24bb643b85c',\n",
|
533 |
+
" 'fbf9a1f3639f4c6eb108b8fdc2f5f342',\n",
|
534 |
+
" '32d0da43076d40bb9d28bc852fd87d63',\n",
|
535 |
+
" '0535837a18714cba9768f823055518f5',\n",
|
536 |
+
" '0817188bfef4461f8f6d3df6d196c34f',\n",
|
537 |
+
" '6484425365594adab5a9f4bd7c36a83b',\n",
|
538 |
+
" '8ad7968e37a9408fb89fb78bb1c3005f',\n",
|
539 |
+
" 'b403a906399d45adbffcb10fecf0fecb',\n",
|
540 |
+
" 'ddf79f51d4544b989b0788987ee147c9',\n",
|
541 |
+
" 'ddaa17eb15674e29809dcf804449c645',\n",
|
542 |
+
" '0e876fb1cc00450097dfb79b2410395d',\n",
|
543 |
+
" '1397f3810eb64c69b8b049a1f10f8d13',\n",
|
544 |
+
" 'ac87d1446f1b44f28ca73d7e84145c57',\n",
|
545 |
+
" '69be5aedf644486f8a2fd88507532a56',\n",
|
546 |
+
" 'f2114973a4fe4d408ddf30b721b5ef10',\n",
|
547 |
+
" 'a2dca7cc375d4499848180bdb30c1445',\n",
|
548 |
+
" 'e09f62b7ccb24ec5b83c4c594b5d98d2',\n",
|
549 |
+
" '411b5054a2e4445dbfbca42783cff1c2',\n",
|
550 |
+
" '9c1e6c59375e4cc5b35a9cefa0cd2cbe',\n",
|
551 |
+
" '5a46f2388ff44c65aa7f64fb7c4323a0',\n",
|
552 |
+
" 'e1dbd1b6c3c3431a960fe2c21a6dde12',\n",
|
553 |
+
" '2374ce9cb0bf4e9eb0dff0e49bce6472',\n",
|
554 |
+
" '9a525105b44d4ca68e0adb98864f439c',\n",
|
555 |
+
" '050374e7aab94e7ba852d46393149296',\n",
|
556 |
+
" 'a9ee85e00ef94e1eaca35ac31a9f897a',\n",
|
557 |
+
" '72f950b08c12431f9e78310521956ee0',\n",
|
558 |
+
" '5b7fd2143f774014a47ccddc2f7b341b',\n",
|
559 |
+
" '7dad2a8ace574afe850c6ad1d94bca94',\n",
|
560 |
+
" 'd4c2c17550cb4a3f93e0baefd0736481',\n",
|
561 |
+
" 'd468ba975bd14da5ba7c12d62baea086',\n",
|
562 |
+
" '0a9b07ca34f3497a87736cedd9d3b717',\n",
|
563 |
+
" 'ee48a8101b8646048f22af19fbfcb15c',\n",
|
564 |
+
" '936aa920ab1e4639aac5adeadc4ddea8',\n",
|
565 |
+
" 'e0a43b13e02b443ab605f496d7883599',\n",
|
566 |
+
" '66ea7fb1c42e48f18cc8be7e19025fde',\n",
|
567 |
+
" 'db14e88e7a4b42b1b89c02c509436637',\n",
|
568 |
+
" '261c80c62a9d498c85e6015e8e7b6a00',\n",
|
569 |
+
" '380765573ec6450a8b9c88ca090e73dc',\n",
|
570 |
+
" '7ea2367162214d6b8568af4339fb566f',\n",
|
571 |
+
" '0b58820498a24afe9ebb0f1e732e61f6',\n",
|
572 |
+
" 'd14f20185d81474b8c693be9b25d819a',\n",
|
573 |
+
" 'b3c8a1723f35429eabca07670537dae9',\n",
|
574 |
+
" '30e9aaa2eea34b1aa721788c0936dcec',\n",
|
575 |
+
" 'edef675b0ae241bf908d9f33a2157b94',\n",
|
576 |
+
" 'bc9ffa0bef644901bd335eb6569becf5',\n",
|
577 |
+
" '710894a380aa4ab7b8a5f00998351fd7',\n",
|
578 |
+
" 'bca6ad1fc44747dcb3841ca028c933c1',\n",
|
579 |
+
" 'da0d232bd42d4143863bdc3662a3b0ba',\n",
|
580 |
+
" '3ca05f6fb84f400aa40647e909b99f86',\n",
|
581 |
+
" 'ac3d4115af5f4f9b84d9d91441476eea',\n",
|
582 |
+
" '8f95d44658fb467ebf078da25c5d3048',\n",
|
583 |
+
" '77522a3de4b64cba9cac3976ce486556',\n",
|
584 |
+
" '98b110db2b934b0092af19c5cb3d661d',\n",
|
585 |
+
" '92652663675848bd88952e6fa060510a',\n",
|
586 |
+
" '7e8b14b3189048698fa9dc991bd2e969',\n",
|
587 |
+
" 'a0f266fa20794f27b6884b84d61405df',\n",
|
588 |
+
" 'bef94726a254426ebb56b5e31cb84e6d',\n",
|
589 |
+
" '19aa6056079e43d59ec313be58b7d15c',\n",
|
590 |
+
" 'e9eaaf55533240acbe3e826243fcce5d',\n",
|
591 |
+
" '7daead857efd48d3a924121a75f1d554',\n",
|
592 |
+
" 'e5b478a30210476a80cadd87e6b08142',\n",
|
593 |
+
" 'df5d55b2532c4652be2d5931e1801b29',\n",
|
594 |
+
" 'b5835c1c3bae4a8daf54eba94e971f44',\n",
|
595 |
+
" 'a4683fd71d8e40bbabf4cef4456cd964',\n",
|
596 |
+
" 'a78e39b65ebf4c3c9b30aebd5ec79982',\n",
|
597 |
+
" 'b581cab62fb74ecfacd8f375873467d6',\n",
|
598 |
+
" 'dd441662fa93493b82bb1ee75062ab46',\n",
|
599 |
+
" '16247f37e8cf46ef801958417634dccb',\n",
|
600 |
+
" 'a58028cc6f8740849fb48649f2d962ad',\n",
|
601 |
+
" '4b38111bc5434823ac19c6b1a7f2f2d6',\n",
|
602 |
+
" '382e7f375c9443199ae54635cb98508c',\n",
|
603 |
+
" '5a249cd0ee6e467b8efe5ae1ed5a7ed3',\n",
|
604 |
+
" '3b2ec61acb7a403fa49b0fa7dd6cd781',\n",
|
605 |
+
" '768086e7a529403b9cf2ff8d2a7cbbb4',\n",
|
606 |
+
" '7ca4df9a94af468ca5ce2301fe4c6d62',\n",
|
607 |
+
" '98e495ddab774a0d979c936fe465f6e6',\n",
|
608 |
+
" '177c7f64f17c45ca9305ee781ca4262a',\n",
|
609 |
+
" '1e7ae0bf23214eefa127b74da16af4a9',\n",
|
610 |
+
" '4c21c9b290404bf8aa79158c05d14e05',\n",
|
611 |
+
" '2ef558347c014b4ca14aa4b3ec008922',\n",
|
612 |
+
" 'e8ca0578755a498a91f2d0e571315cd9',\n",
|
613 |
+
" 'a5354ab136384770bcff710559952c91',\n",
|
614 |
+
" '7843a15da9614bfab5b8ec924f06d3c4',\n",
|
615 |
+
" 'bc61624871844fa2881909e81482daec',\n",
|
616 |
+
" '9ab126657c5344e3b4a5fa8f434c6e8a',\n",
|
617 |
+
" 'cbfdf829144b42ec94c45b908e65b9cc',\n",
|
618 |
+
" '03181f2d16c649e6a71e1f34a9791c7d',\n",
|
619 |
+
" '0659af0b07884a948181548e4863ce05',\n",
|
620 |
+
" '01c1681b11ac4ed79ff841e4bbf04458',\n",
|
621 |
+
" 'bef030cd7fd54d2eaad06670fd10e5a4',\n",
|
622 |
+
" 'aa4247fba2564b339c8ad1cdad241ebf',\n",
|
623 |
+
" '96368c9f1df448298937970ab3e382dd',\n",
|
624 |
+
" 'f22b1a033e4549e7933131817d1f6dbb',\n",
|
625 |
+
" '2d7de339810a4d66ad1350385406a040',\n",
|
626 |
+
" 'b76569b1b16a45048e67630c823be3dd',\n",
|
627 |
+
" '291feba0e65240a39ab40b04898ed776',\n",
|
628 |
+
" 'dc42dc35613c4fbc8097d5bb3c60101a']"
|
629 |
]
|
630 |
},
|
631 |
+
"execution_count": 11,
|
632 |
"metadata": {},
|
633 |
"output_type": "execute_result"
|
634 |
}
|
|
|
640 |
"from qdrant_client.http.models import Distance, VectorParams\n",
|
641 |
"\n",
|
642 |
"dimension = 1024\n",
|
643 |
+
"collection_name = \"ai-safety-sf-arctic-embed-l-semantic\"\n",
|
644 |
"qdrant_server = os.environ[\"QDRANT_API_URL\"]\n",
|
645 |
"qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ[\"QDRANT_API_KEY\"])\n",
|
646 |
+
"# qdrant_client.create_collection(\n",
|
647 |
+
"# collection_name=collection_name,\n",
|
648 |
+
"# vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),\n",
|
649 |
+
"# )\n",
|
650 |
"\n",
|
651 |
"vector_store = QdrantVectorStore(\n",
|
652 |
" client=qdrant_client,\n",
|
|
|
654 |
" embedding=embedding_model,\n",
|
655 |
")\n",
|
656 |
"\n",
|
657 |
+
"vector_store.add_documents(recursive_chunked_docs)\n",
|
658 |
"\n"
|
659 |
]
|
660 |
},
|
661 |
{
|
662 |
"cell_type": "code",
|
663 |
+
"execution_count": 12,
|
664 |
"metadata": {},
|
665 |
"outputs": [],
|
666 |
"source": [
|
|
|
670 |
},
|
671 |
{
|
672 |
"cell_type": "code",
|
673 |
+
"execution_count": 13,
|
674 |
"metadata": {},
|
675 |
"outputs": [
|
676 |
{
|
677 |
"data": {
|
678 |
"text/plain": [
|
679 |
+
"[Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 11, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': 'ed405fa7-b4f6-4e8d-b294-4f5220a5c8fe', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='FROM \\nPRINCIPLES \\nTO PRACTICE \\nA TECHINCAL COMPANION TO\\nTHE Blueprint for an \\nAI BILL OF RIGHTS\\n12'),\n",
|
680 |
+
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 50, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '98b110db-2b93-4b00-92af-19c5cb3d661d', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='• Accessibility and reasonable \\naccommodations \\n• AI actor credentials and qualifications \\n• Alignment to organizational values \\n• Auditing and assessment \\n• Change-management controls \\n• Commercial use \\n• Data provenance'),\n",
|
681 |
+
" Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 19, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': 'e1e9da01-efe5-4ddf-8c98-771b2d1135f1', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='organization’s business processes or other activities, system goals, any human-run procedures that form a \\npart of the system, and specific performance expectations; a description of any data used to train machine \\nlearning models or for other purposes, including how data sources were processed and interpreted, a \\nsummary of what data might be missing, incomplete, or erroneous, and data relevancy justifications; the \\nresults of public consultation such as concerns raised and any decisions made due to these concerns; risk \\nidentification and management assessments and any steps taken to mitigate potential harms; the results of \\nperformance testing including, but not limited to, accuracy, differential demographic impact, resulting \\nerror rates (overall and per demographic group), and comparisons to previously deployed systems; \\nongoing monitoring procedures and regular performance testing reports, including monitoring frequency,'),\n",
|
682 |
+
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 51, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': 'bef94726-a254-426e-bb56-b5e31cb84e6d', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='lifecycle and informed by representative AI Actors (see Figure 3 of the AI RMF). Until new and rigorous'),\n",
|
683 |
+
" Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 25, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': '2c6215e1-bf81-48a3-9e1e-e5044e7d25f6', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='for any resulting algorithmic discrimination. \\n26\\nAlgorithmic \\nDiscrimination \\nProtections'),\n",
|
684 |
+
" Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 0, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': '7e6a7342-2dd0-4376-b521-2e1c71275f5c', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='BLUEPRINT FOR AN \\nAI BILL OF \\nRIGHTS \\nMAKING AUTOMATED \\nSYSTEMS WORK FOR \\nTHE AMERICAN PEOPLE \\nOCTOBER 2022'),\n",
|
685 |
+
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 38, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '411b5054-a2e4-445d-bfbc-a42783cff1c2', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='guide the design of provenance data-tracking techniques. \\nHuman-AI Configuration; \\nInformation Integrity \\nMS-2.10-003 Verify deduplication of GAI training data samples, particularly regarding synthetic \\ndata. \\nHarmful Bias and Homogenization \\nAI Actor Tasks: AI Deployment, AI Impact Assessment, Domain Experts, End-Users, Operation and Monitoring, TEVV'),\n",
|
686 |
+
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 59, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '9ab12665-7c53-44e3-b4a5-fa8f434c6e8a', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='https://www.bloomberg.com/graphics/2023-generative-ai-bias/. \\nNational Institute of Standards and Technology (2024) Adversarial Machine Learning: A Taxonomy and \\nTerminology of Attacks and Mitigations https://csrc.nist.gov/pubs/ai/100/2/e2023/final \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework. \\nhttps://www.nist.gov/itl/ai-risk-management-framework \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Chapter 3: AI \\nRisks and Trustworthiness. \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Foundational_Information/3-sec-characteristics \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Chapter 6: AI \\nRMF Profiles. https://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Core_And_Profiles/6-sec-profile \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Appendix A: \\nDescriptions of AI Actor Tasks.'),\n",
|
687 |
+
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 57, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '98e495dd-ab77-4a0d-979c-936fe465f6e6', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='54 \\nAppendix B. References \\nAcemoglu, D. (2024) The Simple Macroeconomics of AI https://www.nber.org/papers/w32487 \\nAI Incident Database. https://incidentdatabase.ai/ \\nAtherton, D. (2024) Deepfakes and Child Safety: A Survey and Analysis of 2023 Incidents and Responses. \\nAI Incident Database. https://incidentdatabase.ai/blog/deepfakes-and-child-safety/ \\nBadyal, N. et al. (2023) Intentional Biases in LLM Responses. arXiv. https://arxiv.org/pdf/2311.07611 \\nBing Chat: Data Exfiltration Exploit Explained. Embrace The Red. \\nhttps://embracethered.com/blog/posts/2023/bing-chat-data-exfiltration-poc-and-fix/ \\nBommasani, R. et al. (2022) Picking on the Same Person: Does Algorithmic Monoculture lead to Outcome \\nHomogenization? arXiv. https://arxiv.org/pdf/2211.13972 \\nBoyarskaya, M. et al. (2020) Overcoming Failures of Imagination in AI Infused System Development and \\nDeployment. arXiv. https://arxiv.org/pdf/2011.13416 \\nBrowne, D. et al. (2023) Securing the AI Pipeline. Mandiant.'),\n",
|
688 |
+
" Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 12, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '4dead17e-f714-4f4d-97b4-47fda4aebe43', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='Priorities Related to Information Integrity Research and Development.')]"
|
689 |
]
|
690 |
},
|
691 |
+
"execution_count": 13,
|
692 |
"metadata": {},
|
693 |
"output_type": "execute_result"
|
694 |
}
|
|
|
1180 |
],
|
1181 |
"source": [
|
1182 |
"# Vector Store with recursive chunked documents\n",
|
1183 |
+
"from langchain_qdrant import QdrantVectorStore\n",
|
1184 |
+
"from langchain_core.documents import Document\n",
|
1185 |
+
"from qdrant_client import QdrantClient\n",
|
1186 |
+
"from qdrant_client.http.models import Distance, VectorParams\n",
|
1187 |
"\n",
|
1188 |
+
"dimension = 1024\n",
|
1189 |
+
"qdrant_server = os.environ[\"QDRANT_API_URL\"]\n",
|
1190 |
+
"recursive_collection_name = \"ai-safety-ft-arctic-embed-l-recursive\"\n",
|
1191 |
"\n",
|
1192 |
"recursive_qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ[\"QDRANT_API_KEY\"])\n",
|
1193 |
+
"recursive_qdrant_client.create_collection(\n",
|
1194 |
+
" collection_name=recursive_collection_name,\n",
|
1195 |
+
" vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),\n",
|
1196 |
+
")\n",
|
1197 |
"\n",
|
1198 |
"recursive_vector_store = QdrantVectorStore(\n",
|
1199 |
" client=recursive_qdrant_client,\n",
|
app.py
CHANGED
@@ -46,7 +46,7 @@ Now preloading below documents:
|
|
46 |
Please wait for a moment to load the documents.
|
47 |
"""
|
48 |
chat_model_name = "gpt-4o"
|
49 |
-
embedding_model_name = "
|
50 |
chat_model = ChatOpenAI(model=chat_model_name, temperature=0)
|
51 |
|
52 |
async def connect_to_qdrant():
|
@@ -99,7 +99,6 @@ def get_text_splitter(strategy, embedding_model):
|
|
99 |
if strategy == "semantic":
|
100 |
return SemanticChunker(
|
101 |
embedding_model,
|
102 |
-
buffer_size=3,
|
103 |
breakpoint_threshold_type="percentile",
|
104 |
breakpoint_threshold_amount=90,
|
105 |
)
|
@@ -246,4 +245,4 @@ async def main(message: cl.Message):
|
|
246 |
if __name__ == "__main__":
|
247 |
from chainlit.cli import run_chainlit
|
248 |
|
249 |
-
run_chainlit(__file__)
|
|
|
46 |
Please wait for a moment to load the documents.
|
47 |
"""
|
48 |
chat_model_name = "gpt-4o"
|
49 |
+
embedding_model_name = "jeevanions/finetuned_arctic-embedd-l" # Fine tuned model used
|
50 |
chat_model = ChatOpenAI(model=chat_model_name, temperature=0)
|
51 |
|
52 |
async def connect_to_qdrant():
|
|
|
99 |
if strategy == "semantic":
|
100 |
return SemanticChunker(
|
101 |
embedding_model,
|
|
|
102 |
breakpoint_threshold_type="percentile",
|
103 |
breakpoint_threshold_amount=90,
|
104 |
)
|
|
|
245 |
if __name__ == "__main__":
|
246 |
from chainlit.cli import run_chainlit
|
247 |
|
248 |
+
run_chainlit(__file__)
|