# NOTE: the lines below are residue from the web file viewer this script was
# copied from; they are not part of the program (line-number gutter dropped).
#   Spaces: Running
#   File size: 3,685 Bytes
#   0c2a143
# -*- coding: utf-8 -*-
"""langchain_vectara.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J
"""
# Install the notebook's dependencies.
#
# The original lines used the IPython shell escape (`!pip install ...`), which
# only works inside a notebook cell and is a SyntaxError in a plain .py file.
# Invoking pip through the running interpreter works in both environments and
# guarantees the packages land in the interpreter that will import them.
import subprocess
import sys

# check=False keeps the shell-escape behaviour of not raising on a failed
# install; a missing package will surface at the import statements below.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
    check=False,
)
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "langchain_community",
        "langchain-text-splitters",
        # Passed as a single argv entry, so the brackets are never subject to
        # shell globbing.
        "unstructured[local-inference]",
        "pdf2image",
        "pdfminer.six",
        "langchain-together",
        "pillow_heif",
    ],
    check=False,
)
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_text_splitters import CharacterTextSplitter
from google.colab import userdata
# Read credentials through `userdata` (imported from google.colab above).
# The secrets are fetched in a fixed order: Together first, then the three
# Vectara values.
# NOTE(review): TOGETHER_API_KEY is not referenced again in the visible part
# of this script - confirm whether a later cell still needs it.
TOGETHER_API_KEY = userdata.get("TOGETHER_API_KEY")
vectara_customer_id = userdata.get("VECTARA_CUSTOMER_ID")
vectara_corpus_id = userdata.get("VECTARA_CORPUS_ID")
vectara_api_key = userdata.get("VECTARA_API_KEY")

# Vector store handle bound to the customer/corpus identified by the secrets
# read above.
vectorstore = Vectara(
    vectara_customer_id=vectara_customer_id,
    vectara_corpus_id=vectara_corpus_id,
    vectara_api_key=vectara_api_key,
)
from langchain_community.document_loaders import UnstructuredPDFLoader
# Create the working directory for documents.
#
# The original `!mkdir docs` is IPython-only syntax (a SyntaxError in a plain
# .py file) and reports an error on every re-run once the directory exists.
# Path.mkdir(exist_ok=True) is valid Python and idempotent.
from pathlib import Path

Path("docs").mkdir(exist_ok=True)

# upload sample file
# NOTE(review): the PDF is read from the current working directory, not from
# docs/ - confirm where the sample file is expected to be uploaded.
loader = UnstructuredPDFLoader(
    "ISB-020-U3-W-S-01-B18003-001-020.pdf",
    strategy="fast",
)
data = loader.load()

# Split the loaded document(s) into chunks of up to 1000 characters with no
# overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)
import json
from langchain_community.document_transformers import DoctranPropertyExtractor
from langchain_core.documents import Document
# Schema of the metadata fields to extract from each document. Every entry is
# a JSON-schema-like dict with a name, a human-readable description, a type,
# and a flag saying whether the field is required.
# NOTE(review): confirm that the extractor's property model accepts the
# "integer" type as well as the "format" and nested "properties" keys used
# below - verify against the doctran property schema.
properties = [
    # --- Identification -----------------------------------------------------
    {
        "name": "document_number",
        "description": "Unique identifier for the document within its project.",
        "type": "string",
        "required": True,
    },
    {
        "name": "discipline",
        "description": "The discipline associated with the document.",
        "type": "string",
        "required": True,
    },
    {
        "name": "title",
        "description": "Title of the document.",
        "type": "string",
        "required": True,
    },
    # --- Revision information -----------------------------------------------
    {
        "name": "version",
        "description": "Version number of the document.",
        "type": "integer",
        "required": True,
    },
    {
        "name": "date",
        "description": "Creation date of the document.",
        "type": "string",
        "format": "date",
        "required": True,
    },
    # --- Authorship (nested object: mandatory name, optional email) ---------
    {
        "name": "author",
        "description": "Author of the document.",
        "type": "object",
        "properties": {
            "name": {"type": "string", "required": True},
            "email": {"type": "string", "format": "email", "required": False},
        },
        "required": True,
    },
    # --- Cross references ---------------------------------------------------
    {
        "name": "related_documents",
        "description": "List of related documents.",
        "type": "array",
        "items": {"type": "string"},
        "required": False,
    },
    # --- Workflow state (closed set of values) ------------------------------
    {
        "name": "status",
        "description": "Current status of the document.",
        "type": "string",
        "enum": ["draft", "under_review", "approved", "rejected"],
        "required": True,
    },
    # --- Optional descriptive fields ----------------------------------------
    {
        "name": "keywords",
        "description": "Keywords associated with the document.",
        "type": "array",
        "items": {"type": "string"},
        "required": False,
    },
    {
        "name": "summary",
        "description": "Short summary of the document content.",
        "type": "string",
        "required": False,
    },
]
# Load variables from a local .env file into the process environment BEFORE
# building the extractor. DoctranPropertyExtractor resolves its OpenAI settings
# (e.g. OPENAI_API_KEY) from the environment when it is constructed, so the
# original order (construct first, load .env afterwards) made the .env values
# arrive too late. load_dotenv() does not override variables that are already
# set, so environments that export the key themselves are unaffected.
from dotenv import load_dotenv

load_dotenv()

# Extractor configured with the metadata schema defined in `properties`.
property_extractor = DoctranPropertyExtractor(properties=properties)