Spaces:
Running
Running
# -*- coding: utf-8 -*-
"""langchain_vectara.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J
"""
!pip install -r requirements.txt | |
!pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif | |
from langchain_community.document_loaders import TextLoader | |
from langchain_community.embeddings.fake import FakeEmbeddings | |
from langchain_community.vectorstores import Vectara | |
from langchain_text_splitters import CharacterTextSplitter | |
from google.colab import userdata | |
# Credentials are fetched through google.colab's `userdata.get`, one secret
# name per value. TOGETHER_API_KEY is not consumed in this part of the script.
TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')
vectara_customer_id, vectara_corpus_id, vectara_api_key = (
    userdata.get(secret_name)
    for secret_name in (
        'VECTARA_CUSTOMER_ID',
        'VECTARA_CORPUS_ID',
        'VECTARA_API_KEY',
    )
)

# Vectara vector store configured with the three Vectara credentials above.
vectorstore = Vectara(
    vectara_customer_id=vectara_customer_id,
    vectara_corpus_id=vectara_corpus_id,
    vectara_api_key=vectara_api_key,
)
from langchain_community.document_loaders import UnstructuredPDFLoader | |
# Ensure the local "docs" folder exists. exist_ok=True keeps this step
# idempotent, so re-running the cell does not fail when the folder is already
# there, and plain Python keeps the script runnable outside a notebook.
from pathlib import Path

Path("docs").mkdir(exist_ok=True)

# The sample PDF has to be uploaded to the working directory before this runs;
# the loader reads it by relative file name.
loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast')
data = loader.load()

# Split the loaded documents with chunk_size=1000 and no overlap between
# consecutive chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)
import json | |
from langchain_community.document_transformers import DoctranPropertyExtractor | |
from langchain_core.documents import Document | |
# Schema of the metadata fields passed to DoctranPropertyExtractor. Each entry
# names one property and gives its description, JSON-style type and whether it
# is required.
properties = [
    {
        "name": "document_number",
        "description": "Unique identifier for the document within its project.",
        "type": "string",
        "required": True,
    },
    {
        "name": "discipline",
        "description": "The discipline associated with the document.",
        "type": "string",
        "required": True,
    },
    {
        "name": "title",
        "description": "Title of the document.",
        "type": "string",
        "required": True,
    },
    {
        "name": "version",
        "description": "Version number of the document.",
        # NOTE(review): doctran's ExtractProperty model appears to accept only
        # "string", "number", "boolean", "array" and "object"; "integer" would
        # be rejected during validation, so "number" is used here. Confirm
        # against the installed doctran version.
        "type": "number",
        "required": True,
    },
    {
        "name": "date",
        "description": "Creation date of the document.",
        "type": "string",
        # NOTE(review): "format" is presumably not part of doctran's property
        # model and may be ignored; verify before relying on it.
        "format": "date",
        "required": True,
    },
    {
        "name": "author",
        "description": "Author of the document.",
        "type": "object",
        # NOTE(review): the nested "properties" sub-schema may not be forwarded
        # by doctran; confirm that name/email are actually extracted.
        "properties": {
            "name": {
                "type": "string",
                "required": True,
            },
            "email": {
                "type": "string",
                "format": "email",
                "required": False,
            },
        },
        "required": True,
    },
    {
        "name": "related_documents",
        "description": "List of related documents.",
        "type": "array",
        "items": {
            "type": "string",
        },
        "required": False,
    },
    {
        "name": "status",
        "description": "Current status of the document.",
        "type": "string",
        "enum": ["draft", "under_review", "approved", "rejected"],
        "required": True,
    },
    {
        "name": "keywords",
        "description": "Keywords associated with the document.",
        "type": "array",
        "items": {
            "type": "string",
        },
        "required": False,
    },
    {
        "name": "summary",
        "description": "Short summary of the document content.",
        "type": "string",
        "required": False,
    },
]
# Load variables from a local .env file into the process environment before
# the extractor is built.
# NOTE(review): DoctranPropertyExtractor is expected to look up its OpenAI
# settings (e.g. OPENAI_API_KEY) from the environment when it is constructed,
# so loading .env afterwards would be too late. Confirm against the installed
# langchain_community version.
from dotenv import load_dotenv

load_dotenv()

# Extractor configured with the property schema defined above.
property_extractor = DoctranPropertyExtractor(properties=properties)