Spaces:
Running
Running
regraded01
committed on
Commit
·
5d4bf7d
1
Parent(s):
d3ce4e0
feat: store model_id as a config variable
Browse files- app_langchain.py +22 -20
- config/model_config.yml +1 -1
- src/pdfParser.py +11 -10
app_langchain.py
CHANGED
@@ -2,23 +2,13 @@ from langchain_core.prompts import ChatPromptTemplate
|
|
2 |
from langchain_core.runnables import RunnablePassthrough
|
3 |
|
4 |
import streamlit as st
|
5 |
-
|
|
|
6 |
|
7 |
from src.utils import load_config_values
|
8 |
from src.dev_llm import FakeLLM
|
|
|
9 |
|
10 |
-
def extract_text_from_pdf(pdf_file):
|
11 |
-
document = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
12 |
-
text = ""
|
13 |
-
for page_num in range(len(document)):
|
14 |
-
page = document.load_page(page_num)
|
15 |
-
text += page.get_text()
|
16 |
-
return text
|
17 |
-
|
18 |
-
# TODO: Change this to reflect prod model rather than dev models
|
19 |
-
# Initialise fake values and a fake LLM to test out the full pipeline
|
20 |
-
tmp_llm = FakeLLM()
|
21 |
-
tmp_pdf_text = "This patient is due for an appointment on 1st June 2024" # replace with Runner to a file uploader
|
22 |
|
23 |
# Load in model and pipeline configuration values
|
24 |
system_message, context_message, model_id = load_config_values(
|
@@ -29,15 +19,15 @@ system_message, context_message, model_id = load_config_values(
|
|
29 |
]
|
30 |
)
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
prompt = ChatPromptTemplate.from_template(
|
33 |
template=context_message,
|
34 |
)
|
35 |
-
uploaded_doc = st.file_uploader(
|
36 |
-
label="Upload PDF:\n",
|
37 |
-
type=".pdf"
|
38 |
-
)
|
39 |
-
if uploaded_doc is not None:
|
40 |
-
uploaded_text = extract_text_from_pdf(uploaded_doc)
|
41 |
|
42 |
chain = (
|
43 |
{
|
@@ -51,4 +41,16 @@ chain = (
|
|
51 |
|
52 |
user_input_1 = st.text_input(label="input")
|
53 |
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from langchain_core.runnables import RunnablePassthrough
|
3 |
|
4 |
import streamlit as st
|
5 |
+
|
6 |
+
import tiktoken
|
7 |
|
8 |
from src.utils import load_config_values
|
9 |
from src.dev_llm import FakeLLM
|
10 |
+
from src.pdfParser import extract_text_from_pdf
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
# Load in model and pipeline configuration values
|
14 |
system_message, context_message, model_id = load_config_values(
|
|
|
19 |
]
|
20 |
)
|
21 |
|
22 |
+
openai_tokeniser = tiktoken.encoding_for_model(model_id)
|
23 |
+
|
24 |
+
# TODO: Change this to reflect prod model rather than dev models
|
25 |
+
# Initialise fake values and a fake LLM to test out the full pipeline
|
26 |
+
tmp_llm = FakeLLM()
|
27 |
+
|
28 |
prompt = ChatPromptTemplate.from_template(
|
29 |
template=context_message,
|
30 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
chain = (
|
33 |
{
|
|
|
41 |
|
42 |
user_input_1 = st.text_input(label="input")
|
43 |
|
44 |
+
|
45 |
+
uploaded_doc = st.file_uploader(
|
46 |
+
label="Upload PDF:\n",
|
47 |
+
type=".pdf"
|
48 |
+
)
|
49 |
+
if uploaded_doc is not None:
|
50 |
+
uploaded_text = extract_text_from_pdf(uploaded_doc)
|
51 |
+
|
52 |
+
st.text(chain.invoke(user_input_1))
|
53 |
+
text_length = len(openai_tokeniser.encode(uploaded_text))
|
54 |
+
system_message_length = len(openai_tokeniser.encode(system_message))
|
55 |
+
context_message_length = len(openai_tokeniser.encode(context_message))
|
56 |
+
st.text(text_length + system_message_length + context_message_length)
|
config/model_config.yml
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
system_message: "Your role is to take PDF documents and extract their raw text into a JSON format that can be uploaded into a database. Return the JSON only. \nFor example if you need to extract information about a report written on 2nd February 2011 with an author called Jane Mary then return this only: {'report_written_date': '02/02/2011', 'author_name': 'Jane Mary'}\nAnother example would be a clinical exam passed by a student on the 3rd of July 2022 would return this only: {'result' : 'pass', 'date_of_exam' : '03/07/2022'}"
|
2 |
context_message: "{system_message}\n\nUse the text provided and denoted by 3 backticks ```{pdf_text}```. \nExtract the following values in JSON format.\n{data_to_extract}"
|
3 |
-
model_id: "
|
|
|
1 |
system_message: "Your role is to take PDF documents and extract their raw text into a JSON format that can be uploaded into a database. Return the JSON only. \nFor example if you need to extract information about a report written on 2nd February 2011 with an author called Jane Mary then return this only: {'report_written_date': '02/02/2011', 'author_name': 'Jane Mary'}\nAnother example would be a clinical exam passed by a student on the 3rd of July 2022 would return this only: {'result' : 'pass', 'date_of_exam' : '03/07/2022'}"
|
2 |
context_message: "{system_message}\n\nUse the text provided and denoted by 3 backticks ```{pdf_text}```. \nExtract the following values in JSON format.\n{data_to_extract}"
|
3 |
+
model_id: "gpt-3.5-turbo"
|
src/pdfParser.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
-
import
|
2 |
-
import streamlit as st
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
1 |
+
import fitz
|
|
|
2 |
|
3 |
+
def extract_text_from_pdf(pdf_file):
|
4 |
+
try:
|
5 |
+
document = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
6 |
+
text = ""
|
7 |
+
for page_num in range(len(document)):
|
8 |
+
page = document.load_page(page_num)
|
9 |
+
text += page.get_text()
|
10 |
+
return text
|
11 |
+
except Exception as e:
|
12 |
+
return f"Failed to load in text: {str(e)}"
|