regraded01 committed on
Commit
5d4bf7d
·
1 Parent(s): d3ce4e0

feat: store model_id as a config variable

Browse files
Files changed (3) hide show
  1. app_langchain.py +22 -20
  2. config/model_config.yml +1 -1
  3. src/pdfParser.py +11 -10
app_langchain.py CHANGED
@@ -2,23 +2,13 @@ from langchain_core.prompts import ChatPromptTemplate
2
  from langchain_core.runnables import RunnablePassthrough
3
 
4
  import streamlit as st
5
- import fitz
 
6
 
7
  from src.utils import load_config_values
8
  from src.dev_llm import FakeLLM
 
9
 
10
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page in *pdf_file*.

    *pdf_file* is an open binary file-like object (e.g. a Streamlit
    UploadedFile) whose bytes form a PDF document.
    """
    document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    extracted = []
    for index in range(len(document)):
        extracted.append(document.load_page(index).get_text())
    return "".join(extracted)
17
-
18
- # TODO: Change this to reflect prod model rather than dev models
19
- # Initialise fake values and a fake LLM to test out the full pipeline
20
- tmp_llm = FakeLLM()
21
- tmp_pdf_text = "This patient is due for an appointment on 1st June 2024" # replace with Runner to a file uploader
22
 
23
  # Load in model and pipeline configuration values
24
  system_message, context_message, model_id = load_config_values(
@@ -29,15 +19,15 @@ system_message, context_message, model_id = load_config_values(
29
  ]
30
  )
31
 
 
 
 
 
 
 
32
  prompt = ChatPromptTemplate.from_template(
33
  template=context_message,
34
  )
35
- uploaded_doc = st.file_uploader(
36
- label="Upload PDF:\n",
37
- type=".pdf"
38
- )
39
- if uploaded_doc is not None:
40
- uploaded_text = extract_text_from_pdf(uploaded_doc)
41
 
42
  chain = (
43
  {
@@ -51,4 +41,16 @@ chain = (
51
 
52
  user_input_1 = st.text_input(label="input")
53
 
54
- st.text(chain.invoke(user_input_1))
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from langchain_core.runnables import RunnablePassthrough
3
 
4
  import streamlit as st
5
+
6
+ import tiktoken
7
 
8
  from src.utils import load_config_values
9
  from src.dev_llm import FakeLLM
10
+ from src.pdfParser import extract_text_from_pdf
11
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Load in model and pipeline configuration values
14
  system_message, context_message, model_id = load_config_values(
 
19
  ]
20
  )
21
 
22
+ openai_tokeniser = tiktoken.encoding_for_model(model_id)
23
+
24
+ # TODO: Change this to reflect prod model rather than dev models
25
+ # Initialise fake values and a fake LLM to test out the full pipeline
26
+ tmp_llm = FakeLLM()
27
+
28
  prompt = ChatPromptTemplate.from_template(
29
  template=context_message,
30
  )
 
 
 
 
 
 
31
 
32
  chain = (
33
  {
 
41
 
42
  user_input_1 = st.text_input(label="input")
43
 
44
+
45
+ uploaded_doc = st.file_uploader(
46
+ label="Upload PDF:\n",
47
+ type=".pdf"
48
+ )
49
+ if uploaded_doc is not None:
50
+ uploaded_text = extract_text_from_pdf(uploaded_doc)
51
+
52
+ st.text(chain.invoke(user_input_1))
53
+ text_length = len(openai_tokeniser.encode(uploaded_text))
54
+ system_message_length = len(openai_tokeniser.encode(system_message))
55
+ context_message_length = len(openai_tokeniser.encode(context_message))
56
+ st.text(text_length + system_message_length + context_message_length)
config/model_config.yml CHANGED
@@ -1,3 +1,3 @@
1
  system_message: "Your role is to take PDF documents and extract their raw text into a JSON format that can be uploaded into a database. Return the JSON only. \nFor example if you need to extract information about a report written on 2nd February 2011 with an author called Jane Mary then return this only: {'report_written_date': '02/02/2011', 'author_name': 'Jane Mary'}\nAnother example would be a clinical exam passed by a student on the 3rd of July 2022 would return this only: {'result' : 'pass', 'date_of_exam' : '03/07/2022'}"
2
  context_message: "{system_message}\n\nUse the text provided and denoted by 3 backticks ```{pdf_text}```. \nExtract the following values in JSON format.\n{data_to_extract}"
3
- model_id: "meta-llama/Llama-2-70b-chat-hf"
 
1
  system_message: "Your role is to take PDF documents and extract their raw text into a JSON format that can be uploaded into a database. Return the JSON only. \nFor example if you need to extract information about a report written on 2nd February 2011 with an author called Jane Mary then return this only: {'report_written_date': '02/02/2011', 'author_name': 'Jane Mary'}\nAnother example would be a clinical exam passed by a student on the 3rd of July 2022 would return this only: {'result' : 'pass', 'date_of_exam' : '03/07/2022'}"
2
  context_message: "{system_message}\n\nUse the text provided and denoted by 3 backticks ```{pdf_text}```. \nExtract the following values in JSON format.\n{data_to_extract}"
3
+ model_id: "gpt-3.5-turbo"
src/pdfParser.py CHANGED
@@ -1,11 +1,12 @@
1
- import PyPDF2
2
- import streamlit as st
3
 
4
-
5
@st.cache_resource
def get_pdf_text(filepath):
    """Parse the PDF at *filepath* and return its pages' text joined by spaces."""
    # PdfReader accepts a path or an open binary file object directly.
    reader = PyPDF2.PdfReader(filepath)
    collected = []
    for page in reader.pages:
        collected.append(page.extract_text())
    return " ".join(collected)
 
 
 
1
+ import fitz
 
2
 
3
def extract_text_from_pdf(pdf_file):
    """Extract the full plain text of an uploaded PDF.

    Parameters
    ----------
    pdf_file : file-like object
        An open binary file-like object (e.g. a Streamlit UploadedFile)
        whose bytes form a PDF document.

    Returns
    -------
    str
        The concatenated text of every page, or — if parsing fails — an
        error message of the form ``"Failed to load in text: ..."``.
        (Returning a message instead of raising preserves the original
        caller-facing contract.)
    """
    try:
        # Open from an in-memory stream; the context manager guarantees the
        # document handle is closed on every path (the original leaked it).
        with fitz.open(stream=pdf_file.read(), filetype="pdf") as document:
            return "".join(page.get_text() for page in document)
    except Exception as e:
        # Broad by design: any read/parse failure is mapped to a message
        # rather than propagating into the Streamlit app.
        return f"Failed to load in text: {str(e)}"