File size: 3,685 Bytes
0c2a143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""langchain_vectara.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J
"""

# Install the project's pinned dependencies, then the extra packages needed
# for PDF parsing (unstructured + pdf2image/pdfminer.six), HEIF image support,
# and the Together AI / LangChain integrations.
# NOTE: lines starting with `!` are IPython/Colab shell magics — this file
# only runs inside a notebook environment, not as a plain Python script.
!pip install -r requirements.txt

!pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif

from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_text_splitters import CharacterTextSplitter

from google.colab import userdata

# Read API credentials from Colab's secret store (the notebook "Secrets" tab).
# Each get() returns the stored string, or raises if the secret is missing.
TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')
vectara_customer_id =  userdata.get('VECTARA_CUSTOMER_ID')
vectara_corpus_id = userdata.get('VECTARA_CORPUS_ID')
vectara_api_key = userdata.get('VECTARA_API_KEY')

# Vectara-hosted vector store, addressed by customer/corpus id + API key.
# NOTE(review): no embedding model is passed here (FakeEmbeddings is imported
# above but unused) — presumably Vectara embeds documents server-side; confirm.
vectorstore = Vectara(
                vectara_customer_id=vectara_customer_id,
                vectara_corpus_id=vectara_corpus_id,
                vectara_api_key=vectara_api_key
            )

from langchain_community.document_loaders import UnstructuredPDFLoader

# Create a working directory for uploaded files (Colab shell magic).
# NOTE(review): the loader below reads the PDF from the current directory,
# not from docs/ — confirm where the sample file is actually uploaded.
!mkdir docs
# upload sample file

# Parse the sample PDF. strategy='fast' selects unstructured's lightweight
# text-extraction path — TODO confirm this is acceptable for scanned pages.
loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast')
data = loader.load()

# Split the loaded document(s) into ~1000-character chunks with no overlap
# before downstream processing/indexing.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)

import json

from langchain_community.document_transformers import DoctranPropertyExtractor
from langchain_core.documents import Document

def _prop(name, description, type, required, **extra):
    # One schema entry for DoctranPropertyExtractor. Keys are emitted in the
    # order name/description/type/<extras>/required, matching the hand-written
    # literals this replaces.
    return {
        "name": name,
        "description": description,
        "type": type,
        **extra,
        "required": required,
    }


# Metadata fields the LLM-backed extractor should pull out of each document.
properties = [
    _prop("document_number",
          "Unique identifier for the document within its project.",
          "string", True),
    _prop("discipline",
          "The discipline associated with the document.",
          "string", True),
    _prop("title",
          "Title of the document.",
          "string", True),
    _prop("version",
          "Version number of the document.",
          "integer", True),
    _prop("date",
          "Creation date of the document.",
          "string", True, format="date"),
    _prop("author",
          "Author of the document.",
          "object", True, properties={
              "name": {"type": "string", "required": True},
              "email": {"type": "string", "format": "email", "required": False},
          }),
    _prop("related_documents",
          "List of related documents.",
          "array", False, items={"type": "string"}),
    _prop("status",
          "Current status of the document.",
          "string", True,
          enum=["draft", "under_review", "approved", "rejected"]),
    _prop("keywords",
          "Keywords associated with the document.",
          "array", False, items={"type": "string"}),
    _prop("summary",
          "Short summary of the document content.",
          "string", False),
]

from dotenv import load_dotenv

# BUGFIX: load the .env file BEFORE constructing the extractor. The original
# cell order called load_dotenv() after DoctranPropertyExtractor(...) was
# built, so any credentials kept in .env (e.g. the OpenAI key Doctran reads
# from the environment) were made available only after they were needed.
# NOTE(review): assumes the relevant key lives in a local .env — confirm how
# credentials are provided in the Colab runtime.
load_dotenv()

# LLM-backed transformer that extracts the fields declared in `properties`
# from each document's text.
property_extractor = DoctranPropertyExtractor(properties=properties)