File size: 3,685 Bytes
0c2a143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""langchain_vectara.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J
"""

# Install the project's pinned dependencies, then the extra packages needed
# for PDF parsing (unstructured + pdf2image/pdfminer.six), HEIF image support,
# and the Together AI / LangChain integrations.
# NOTE: lines starting with `!` are IPython/Colab shell magics — this file
# only runs inside a notebook environment, not as a plain Python script.
!pip install -r requirements.txt

!pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif

from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_text_splitters import CharacterTextSplitter

from google.colab import userdata

# Read API credentials from Colab's secret store (the notebook "Secrets" tab).
# Each get() returns the stored string, or raises if the secret is missing.
TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')
vectara_customer_id =  userdata.get('VECTARA_CUSTOMER_ID')
vectara_corpus_id = userdata.get('VECTARA_CORPUS_ID')
vectara_api_key = userdata.get('VECTARA_API_KEY')

# Vectara-hosted vector store, addressed by customer/corpus id + API key.
# NOTE(review): no embedding model is passed here (FakeEmbeddings is imported
# above but unused) — presumably Vectara embeds documents server-side; confirm.
vectorstore = Vectara(
                vectara_customer_id=vectara_customer_id,
                vectara_corpus_id=vectara_corpus_id,
                vectara_api_key=vectara_api_key
            )

from langchain_community.document_loaders import UnstructuredPDFLoader

# Create a working directory for uploaded files (Colab shell magic).
# NOTE(review): the loader below reads the PDF from the current directory,
# not from docs/ — confirm where the sample file is actually uploaded.
!mkdir docs
# upload sample file

# Parse the sample PDF. strategy='fast' selects unstructured's lightweight
# text-extraction path — TODO confirm this is acceptable for scanned pages.
loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast')
data = loader.load()

# Split the loaded document(s) into ~1000-character chunks with no overlap
# before downstream processing/indexing.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)

import json

from langchain_community.document_transformers import DoctranPropertyExtractor
from langchain_core.documents import Document

def _prop(name, description, type, required, **extra):
    # One schema entry for DoctranPropertyExtractor. Keys are emitted in the
    # order name/description/type/<extras>/required, matching the hand-written
    # literals this replaces.
    return {
        "name": name,
        "description": description,
        "type": type,
        **extra,
        "required": required,
    }


# Metadata fields the LLM-backed extractor should pull out of each document.
properties = [
    _prop("document_number",
          "Unique identifier for the document within its project.",
          "string", True),
    _prop("discipline",
          "The discipline associated with the document.",
          "string", True),
    _prop("title",
          "Title of the document.",
          "string", True),
    _prop("version",
          "Version number of the document.",
          "integer", True),
    _prop("date",
          "Creation date of the document.",
          "string", True, format="date"),
    _prop("author",
          "Author of the document.",
          "object", True, properties={
              "name": {"type": "string", "required": True},
              "email": {"type": "string", "format": "email", "required": False},
          }),
    _prop("related_documents",
          "List of related documents.",
          "array", False, items={"type": "string"}),
    _prop("status",
          "Current status of the document.",
          "string", True,
          enum=["draft", "under_review", "approved", "rejected"]),
    _prop("keywords",
          "Keywords associated with the document.",
          "array", False, items={"type": "string"}),
    _prop("summary",
          "Short summary of the document content.",
          "string", False),
]

from dotenv import load_dotenv

# BUGFIX: load the .env file BEFORE constructing the extractor. The original
# cell order called load_dotenv() after DoctranPropertyExtractor(...) was
# built, so any credentials kept in .env (e.g. the OpenAI key Doctran reads
# from the environment) were made available only after they were needed.
# NOTE(review): assumes the relevant key lives in a local .env — confirm how
# credentials are provided in the Colab runtime.
load_dotenv()

# LLM-backed transformer that extracts the fields declared in `properties`
# from each document's text.
property_extractor = DoctranPropertyExtractor(properties=properties)