from dotenv import load_dotenv
import os
import uuid
from PyPDF2 import PdfReader
from docx import Document
from docx.text.paragraph import Paragraph
from docx.table import Table
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
import streamlit as st
from textwrap import dedent
from Prompts_and_Chains import LLMChains


def extract_text_from_file(file):
    """Read an uploaded plain-text file and decode it as UTF-8."""
    text = file.read().decode("utf-8")
    return text


def process_paragraph(paragraph):
    # Process the content of the paragraph as needed
    return paragraph.text


def process_table(table):
    # Flatten the table into text, joining cells with spaces and rows with
    # newlines so adjacent cell contents don't run together
    rows = []
    for row in table.rows:
        rows.append(" ".join(cell.text for cell in row.cells))
    return "\n".join(rows)


def read_docx(file_path):
    """Extract paragraphs and tables from a .docx file in document order.

    Requires python-docx >= 1.1, which added Document.iter_inner_content().
    """
    doc = Document(file_path)
    data = []
    for element in doc.iter_inner_content():
        if isinstance(element, Paragraph):
            data.append(process_paragraph(element))
        elif isinstance(element, Table):
            data.append(process_table(element))
    return "\n".join(data)


def get_pdf_text(pdf):
    """Extract the text from every page of the PDF file."""
    text = []
    pdf_reader = PdfReader(pdf)
    for page in pdf_reader.pages:
        text.append(page.extract_text())
    return "\n".join(text)


class RFPProcessor:
    def __init__(self):
        load_dotenv()
        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        self.chains_obj = LLMChains()

    def genrate_legal_adviser_bot_result(self):
        """Answer a general legal question; retrieval is currently disabled."""
        if len(st.session_state["input"]) > 0:
            query = st.session_state["input"]
            # Retrieval context is commented out for now; the chain runs on
            # the raw query alone.
            # vector_store = st.session_state["legal_adviser_vectorstore"]
            # if vector_store:
            #     results = vector_store.similarity_search(query, 3)
            #     # get the text from the results
            #     source_knowledge = "\n".join([x.page_content for x in results])
            inputs = {
                "input": query,
            }
            output = self.chains_obj.legal_adviser_bot_chain.run(inputs)
            st.session_state.messages.append(
                {"role": "assistant", "content": output})
            st.session_state["input"] = ""

    def process_case_data(self, case_name, files):
        """Extract, chunk, embed, and summarize the uploaded case files."""
        if case_name and files:
            # Generate a unique identifier for the case data set
            case_id = str(uuid.uuid4())
            extracted_data = []
            all_texts = []
            # One splitter for all files: ~1000-character chunks, 150 overlap
            text_splitter = CharacterTextSplitter(
                separator="\n", chunk_size=1000, chunk_overlap=150,
                length_function=len,
            )
            for file in files:
                if file.name.endswith(".docx"):
                    file_text = read_docx(file)
                elif file.name.endswith(".pdf"):
                    file_text = get_pdf_text(file)
                else:
                    file_text = extract_text_from_file(file)
                # file_text is already a single string, so split it directly
                all_texts.extend(text_splitter.split_text(file_text))
                extracted_data.append(file_text)
            project_dir = os.path.dirname(os.path.abspath(__file__))
            vectorstore = Chroma(
                persist_directory=os.path.join(
                    project_dir, "vector_stores", case_name),
                embedding_function=OpenAIEmbeddings(
                    openai_api_key=self.openai_api_key),
            )
            vectorstore.add_texts(all_texts)
            st.session_state[case_id] = {
                "vectorstore": vectorstore,
                "extracted_data": extracted_data,
            }
            all_text = " ".join(extracted_data)
            st.session_state["case_summary"] = self.chains_obj.summary_chain.run(
                {
                    "case_name": case_name,
                    "case_details": dedent(all_text),
                }
            )
            st.session_state["is_data_processed"] = True
            st.session_state["case_name"] = case_name
            st.session_state["case_details"] = dedent(all_text)
            # Store the current case_id in the session state
            st.session_state["current_case_id"] = case_id
            st.success("Data processed successfully")

    def genrate_legal_bot_result(self):
        """Answer a question about the current case using retrieved context."""
        if len(st.session_state["bot_input"]) > 0:
            case_id = st.session_state.get("current_case_id")
            if case_id:
                vector_store = st.session_state[case_id]["vectorstore"]
                query = st.session_state["bot_input"]
                # Pull the three most similar chunks as grounding context
                results = vector_store.similarity_search(query, 3)
                # get the text from the results
                source_knowledge = "\n".join([x.page_content for x in results])
                inputs = {
                    "context": source_knowledge,
                    "input": query,
                }
                output = self.chains_obj.bot_chain.run(inputs)
                st.session_state.past.append(query)
                st.session_state.generated.append(output)
                st.session_state["bot_input"] = ""
            else:
                st.warning("No vector store found for the current case ID")