Spaces:
Running
Running
regraded01
committed on
Commit
·
5d4bf7d
1
Parent(s):
d3ce4e0
feat: store model_id as a config variable
Browse files- app_langchain.py +22 -20
- config/model_config.yml +1 -1
- src/pdfParser.py +11 -10
app_langchain.py
CHANGED
@@ -2,23 +2,13 @@ from langchain_core.prompts import ChatPromptTemplate
|
|
2 |
from langchain_core.runnables import RunnablePassthrough
|
3 |
|
4 |
import streamlit as st
|
5 |
-
|
|
|
6 |
|
7 |
from src.utils import load_config_values
|
8 |
from src.dev_llm import FakeLLM
|
|
|
9 |
|
10 |
-
def extract_text_from_pdf(pdf_file):
|
11 |
-
document = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
12 |
-
text = ""
|
13 |
-
for page_num in range(len(document)):
|
14 |
-
page = document.load_page(page_num)
|
15 |
-
text += page.get_text()
|
16 |
-
return text
|
17 |
-
|
18 |
-
# TODO: Change this to reflect prod model rather than dev models
|
19 |
-
# Initialise fake values and a fake LLM to test out the full pipeline
|
20 |
-
tmp_llm = FakeLLM()
|
21 |
-
tmp_pdf_text = "This patient is due for an appointment on 1st June 2024" # replace with Runner to a file uploader
|
22 |
|
23 |
# Load in model and pipeline configuration values
|
24 |
system_message, context_message, model_id = load_config_values(
|
@@ -29,15 +19,15 @@ system_message, context_message, model_id = load_config_values(
|
|
29 |
]
|
30 |
)
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
prompt = ChatPromptTemplate.from_template(
|
33 |
template=context_message,
|
34 |
)
|
35 |
-
uploaded_doc = st.file_uploader(
|
36 |
-
label="Upload PDF:\n",
|
37 |
-
type=".pdf"
|
38 |
-
)
|
39 |
-
if uploaded_doc is not None:
|
40 |
-
uploaded_text = extract_text_from_pdf(uploaded_doc)
|
41 |
|
42 |
chain = (
|
43 |
{
|
@@ -51,4 +41,16 @@ chain = (
|
|
51 |
|
52 |
user_input_1 = st.text_input(label="input")
|
53 |
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from langchain_core.runnables import RunnablePassthrough
|
3 |
|
4 |
import streamlit as st
|
5 |
+
|
6 |
+
import tiktoken
|
7 |
|
8 |
from src.utils import load_config_values
|
9 |
from src.dev_llm import FakeLLM
|
10 |
+
from src.pdfParser import extract_text_from_pdf
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
# Load in model and pipeline configuration values
|
14 |
system_message, context_message, model_id = load_config_values(
|
|
|
19 |
]
|
20 |
)
|
21 |
|
22 |
+
openai_tokeniser = tiktoken.encoding_for_model(model_id)
|
23 |
+
|
24 |
+
# TODO: Change this to reflect prod model rather than dev models
|
25 |
+
# Initialise fake values and a fake LLM to test out the full pipeline
|
26 |
+
tmp_llm = FakeLLM()
|
27 |
+
|
28 |
prompt = ChatPromptTemplate.from_template(
|
29 |
template=context_message,
|
30 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
chain = (
|
33 |
{
|
|
|
41 |
|
42 |
user_input_1 = st.text_input(label="input")
|
43 |
|
44 |
+
|
45 |
+
uploaded_doc = st.file_uploader(
|
46 |
+
label="Upload PDF:\n",
|
47 |
+
type=".pdf"
|
48 |
+
)
|
49 |
+
if uploaded_doc is not None:
|
50 |
+
uploaded_text = extract_text_from_pdf(uploaded_doc)
|
51 |
+
|
52 |
+
st.text(chain.invoke(user_input_1))
|
53 |
+
text_length = len(openai_tokeniser.encode(uploaded_text))
|
54 |
+
system_message_length = len(openai_tokeniser.encode(system_message))
|
55 |
+
context_message_length = len(openai_tokeniser.encode(context_message))
|
56 |
+
st.text(text_length + system_message_length + context_message_length)
|
config/model_config.yml
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
system_message: "Your role is to take PDF documents and extract their raw text into a JSON format that can be uploaded into a database. Return the JSON only. \nFor example if you need to extract information about a report written on 2nd February 2011 with an author called Jane Mary then return this only: {'report_written_date': '02/02/2011', 'author_name': 'Jane Mary'}\nAnother example would be a clinical exam passed by a student on the 3rd of July 2022 would return this only: {'result' : 'pass', 'date_of_exam' : '03/07/2022'}"
|
2 |
context_message: "{system_message}\n\nUse the text provided and denoted by 3 backticks ```{pdf_text}```. \nExtract the following values in JSON format.\n{data_to_extract}"
|
3 |
-
model_id: "
|
|
|
1 |
system_message: "Your role is to take PDF documents and extract their raw text into a JSON format that can be uploaded into a database. Return the JSON only. \nFor example if you need to extract information about a report written on 2nd February 2011 with an author called Jane Mary then return this only: {'report_written_date': '02/02/2011', 'author_name': 'Jane Mary'}\nAnother example would be a clinical exam passed by a student on the 3rd of July 2022 would return this only: {'result' : 'pass', 'date_of_exam' : '03/07/2022'}"
|
2 |
context_message: "{system_message}\n\nUse the text provided and denoted by 3 backticks ```{pdf_text}```. \nExtract the following values in JSON format.\n{data_to_extract}"
|
3 |
+
model_id: "gpt-3.5-turbo"
|
src/pdfParser.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
-
import
|
2 |
-
import streamlit as st
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
1 |
+
import fitz
|
|
|
2 |
|
3 |
+
def extract_text_from_pdf(pdf_file):
|
4 |
+
try:
|
5 |
+
document = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
6 |
+
text = ""
|
7 |
+
for page_num in range(len(document)):
|
8 |
+
page = document.load_page(page_num)
|
9 |
+
text += page.get_text()
|
10 |
+
return text
|
11 |
+
except Exception as e:
|
12 |
+
return f"Failed to load in text: {str(e)}"
|