import os
import io
import tempfile

import streamlit as st
from streamlit_chat import message

import torch
import torch.nn

import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

import pandas as pd
import numpy as np

from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain import PromptTemplate, HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA


def pdf_loader(file_path):
    '''Loads a PDF file into LangChain documents.

    Params:
        file_path: The path of the PDF file
    '''
    loader = PyPDFLoader(file_path)
    pdf_file_as_loaded_docs = loader.load()
    return pdf_file_as_loaded_docs
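

# Illustrative sketch, not part of the original code: Streamlit's
# st.file_uploader returns an in-memory buffer, while PyPDFLoader needs a
# filesystem path, so a temporary file can bridge the two. The function
# name below is hypothetical.
def _example_load_uploaded_pdf():
    uploaded_pdf = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_pdf is not None:
        # Persist the upload so PyPDFLoader can read it from disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(uploaded_pdf.read())
        return pdf_loader(tmp.name)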


def splitDoc(loaded_docs):
    '''Creates chunks of the loaded document.

    Params:
        loaded_docs: The loaded document from the pdf_loader function
    '''
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_docs = splitter.split_documents(loaded_docs)
    return chunked_docs
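

# Illustrative sketch, not part of the original code: chunk_size is measured
# in characters, and CharacterTextSplitter only splits on its default "\n\n"
# separator, so individual chunks can exceed 1000 characters when a single
# paragraph is longer than that.
def _example_inspect_chunks(loaded_docs):
    chunked_docs = splitDoc(loaded_docs)
    print(f"{len(chunked_docs)} chunks; first chunk is "
          f"{len(chunked_docs[0].page_content)} characters long")
    return chunked_docs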


def makeEmbeddings(chunked_docs):
    '''Makes the embeddings of the chunked document.

    Params:
        chunked_docs: The chunked docs
    '''
    embedder = HuggingFaceEmbeddings()
    vector_store = FAISS.from_documents(chunked_docs, embedder)
    return vector_store
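

# Illustrative sketch, not part of the original code: HuggingFaceEmbeddings
# defaults to the sentence-transformers/all-mpnet-base-v2 model, and the
# resulting FAISS store can be queried directly or exposed as a retriever
# for a LangChain chain.
def _example_query_vector_store(chunked_docs):
    vector_store = makeEmbeddings(chunked_docs)
    # Top-3 chunks most similar to the query, by embedding distance
    similar_chunks = vector_store.similarity_search(
        "What is this document about?", k=3
    )
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    return similar_chunks, retriever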


def create_flan_t5_base(load_in_8bit=False):
    '''Loads Flan-T5 base in the form of a pipeline.'''
    model = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model)
    return pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        # load_in_8bit quantizes the weights to save GPU memory (needs
        # bitsandbytes); the remaining kwargs are forwarded as config
        # overrides when the model is loaded
        model_kwargs={"load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.0},
    )
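

# Illustrative end-to-end sketch, not part of the original code: the raw
# transformers pipeline is wrapped in HuggingFacePipeline so that LangChain's
# RetrievalQA chain can drive Flan-T5 against the FAISS retriever. The
# "stuff" chain type simply stuffs the retrieved chunks into the prompt.
def _example_build_qa_chain(vector_store):
    llm = HuggingFacePipeline(pipeline=create_flan_t5_base(load_in_8bit=False))
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return qa_chain.run("What is this document about?")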