import os
import io
import tempfile

import streamlit as st
from streamlit_chat import message

import torch
import torch.nn

import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

import pandas as pd
import numpy as np

from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain import PromptTemplate, HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA


def pdf_loader(file_path):
    '''Loads a PDF file into LangChain documents.

    Params:
        file_path: The path of the PDF file
    '''
    loader = PyPDFLoader(file_path)
    pdf_file_as_loaded_docs = loader.load()
    return pdf_file_as_loaded_docs
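

# Illustrative sketch, not part of the original code: Streamlit's
# st.file_uploader returns an in-memory buffer, while PyPDFLoader needs a
# filesystem path, so a temporary file can bridge the two. The function
# name below is hypothetical.
def _example_load_uploaded_pdf():
    uploaded_pdf = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_pdf is not None:
        # Persist the upload so PyPDFLoader can read it from disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(uploaded_pdf.read())
        return pdf_loader(tmp.name)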


def splitDoc(loaded_docs):
    '''Creates chunks of the loaded document.

    Params:
        loaded_docs: The loaded document from the pdf_loader function
    '''
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_docs = splitter.split_documents(loaded_docs)
    return chunked_docs
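

# Illustrative sketch, not part of the original code: chunk_size is measured
# in characters, and CharacterTextSplitter only splits on its default "\n\n"
# separator, so individual chunks can exceed 1000 characters when a single
# paragraph is longer than that.
def _example_inspect_chunks(loaded_docs):
    chunked_docs = splitDoc(loaded_docs)
    print(f"{len(chunked_docs)} chunks; first chunk is "
          f"{len(chunked_docs[0].page_content)} characters long")
    return chunked_docs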


def makeEmbeddings(chunked_docs):
    '''Makes the embeddings of the chunked document.

    Params:
        chunked_docs: The chunked docs
    '''
    embedder = HuggingFaceEmbeddings()
    vector_store = FAISS.from_documents(chunked_docs, embedder)
    return vector_store
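

# Illustrative sketch, not part of the original code: HuggingFaceEmbeddings
# defaults to the sentence-transformers/all-mpnet-base-v2 model, and the
# resulting FAISS store can be queried directly or exposed as a retriever
# for a LangChain chain.
def _example_query_vector_store(chunked_docs):
    vector_store = makeEmbeddings(chunked_docs)
    # Top-3 chunks most similar to the query, by embedding distance
    similar_chunks = vector_store.similarity_search(
        "What is this document about?", k=3
    )
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    return similar_chunks, retriever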


def create_flan_t5_base(load_in_8bit=False):
    '''Loads Flan-T5 base in the form of a pipeline.'''
    model = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model)
    return pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        # load_in_8bit quantizes the weights to save GPU memory (needs
        # bitsandbytes); the remaining kwargs are forwarded as config
        # overrides when the model is loaded
        model_kwargs={"load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.0},
    )
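

# Illustrative end-to-end sketch, not part of the original code: the raw
# transformers pipeline is wrapped in HuggingFacePipeline so that LangChain's
# RetrievalQA chain can drive Flan-T5 against the FAISS retriever. The
# "stuff" chain type simply stuffs the retrieved chunks into the prompt.
def _example_build_qa_chain(vector_store):
    llm = HuggingFacePipeline(pipeline=create_flan_t5_base(load_in_8bit=False))
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return qa_chain.run("What is this document about?")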