regraded01 committed on
Commit
2267014
·
1 Parent(s): 01296c9
Files changed (3) hide show
  1. app.py +95 -0
  2. pdfParser.py +11 -0
  3. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ import re
4
+ from pdfParser import get_pdf_text
5
+
6
# Hugging Face API token, read from the ``hf_credentials`` section of the
# Streamlit secrets store.
api_key = st.secrets.hf_credentials.hf_api

# Model queried through the hosted inference endpoint.
model_id = "meta-llama/Llama-2-13b-chat-hf"

# System prompt placed ahead of every user request. It is written as
# adjacent literals so the exact newlines sent to the model are visible;
# the last three pieces form the example table, one row per line.
system_message = (
    "\n"
    "Your role is to take PDF documents and extract their raw text into a "
    "table format that can be uploaded into a database.\n"
    "Return the table only. For example if you need to extract information "
    "about a report written on 2nd February 2011 with an author called "
    "Jane Mary then return this only:\n"
    "| report_written_date | author_name | \n"
    " | --- | --- | \n"
    " | 02/02/2011 | Jane Mary |\n"
)
14
+
15
+
16
def query(payload, model_id, timeout=120):
    """POST ``payload`` to the hosted inference endpoint of ``model_id``.

    Args:
        payload: JSON-serialisable request body; sent as the request's JSON.
        model_id: Model identifier interpolated into the endpoint URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits forever, which would leave the app
            stuck on a stalled connection.

    Returns:
        The response body decoded with ``response.json()``. The HTTP status
        is not checked here, so an error body is returned to the caller
        as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
        requests.exceptions.RequestException: On other transport failures.
    """
    headers = {"Authorization": f"Bearer {api_key}"}
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    response = requests.post(
        api_url, headers=headers, json=payload, timeout=timeout
    )
    return response.json()
21
+
22
+
23
def prompt_generator(system_message, user_message):
    """Build the chat prompt sent to the model.

    The system message goes between the ``<<SYS>>`` / ``<</SYS>>`` markers,
    and the user message follows it, terminated by ``[/INST]``. The prompt
    starts and ends with a newline.

    Args:
        system_message: Instructions placed in the system section.
        user_message: The request placed before the closing ``[/INST]``.

    Returns:
        The assembled prompt string.
    """
    prompt_lines = [
        "",  # leading newline before the opening marker
        "<s>[INST] <<SYS>>",
        f"{system_message}",
        "<</SYS>>",
        f"{user_message} [/INST]",
        "",  # trailing newline after the closing marker
    ]
    return "\n".join(prompt_lines)
30
+
31
+
32
# Pattern to clean up text response from API: capture everything that
# follows the last "[/INST]" marker in the generated text.
pattern = r".*\[/INST\]([\s\S]*)$"


def _generated_text_from(response):
    """Return the first item's ``generated_text``, or None if it is absent."""
    if isinstance(response, list) and response and isinstance(response[0], dict):
        return response[0].get("generated_text")
    return None


# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Include PDF upload ability
pdf_upload = st.file_uploader(
    "Upload a .PDF here",
    type=".pdf",
)

# Always bind pdf_text so the extraction step can detect a missing upload
# instead of failing with a NameError.
pdf_text = get_pdf_text(pdf_upload) if pdf_upload is not None else None

if "key_inputs" not in st.session_state:
    st.session_state.key_inputs = {}

col1, col2, col3 = st.columns([3, 3, 2])

with col1:
    key_name = st.text_input("Key/Column Name (e.g. patient_name)", key="key_name")

with col2:
    key_description = st.text_area(
        "*(Optional) Description of key/column", key="key_description"
    )

with col3:
    if st.button("Extract this column"):
        if not key_name.strip():
            # A blank name would add a nameless column to the request.
            st.warning("Please enter a key/column name first.")
        else:
            st.session_state.key_inputs[key_name] = (
                key_description or "No further description provided"
            )

if st.session_state.key_inputs:
    st.write("\nKeys/Columns for extraction:")
    st.write(st.session_state.key_inputs)

# NOTE(review): the button is rendered unconditionally and guarded below;
# confirm this matches the intended layout.
if st.button("Extract data!"):
    if pdf_text is None:
        st.warning("Please upload a PDF before extracting data.")
    elif not st.session_state.key_inputs:
        st.warning("Please add at least one key/column to extract.")
    else:
        # One "name: description" entry per requested column.
        columns_spec = "; ".join(
            f"{name}: {description}"
            for name, description in st.session_state.key_inputs.items()
        )
        user_message = (
            "\n"
            f"Use the text provided and denoted by 3 backticks ```{pdf_text}```.\n"
            "Extract the following columns and return a table that could be "
            "uploaded to an SQL database.\n"
            f"{columns_spec}\n"
        )
        the_prompt = prompt_generator(
            system_message=system_message, user_message=user_message
        )
        try:
            response = query(
                {
                    "inputs": the_prompt,
                    "parameters": {"max_new_tokens": 500, "temperature": 0.1},
                },
                model_id,
            )
        except (requests.exceptions.RequestException, ValueError) as err:
            # Transport failure, or a body that could not be decoded as JSON.
            st.error(f"Request to the model API failed: {err}")
        else:
            generated_text = _generated_text_from(response)
            if generated_text is None:
                # Anything other than a list of generations (for example an
                # error object) is shown instead of being indexed blindly.
                st.error(f"The model API returned no generated text: {response}")
            else:
                match = re.search(
                    pattern, generated_text, re.MULTILINE | re.DOTALL
                )
                # Fall back to the full text when the marker is not present.
                extracted = match.group(1).strip() if match else generated_text
                st.markdown(f"Data Extracted!\n{extracted}")
pdfParser.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import streamlit as st
3
+
4
+
5
@st.cache_resource
def get_pdf_text(filepath):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        filepath: Whatever ``PyPDF2.PdfReader`` accepts as its source; the
            app passes the object returned by ``st.file_uploader``.

    Returns:
        One string holding each page's ``extract_text()`` result in page
        order, separated by a space.
    """
    # NOTE(review): st.cache_resource caches the returned string; confirm
    # that a resource cache (rather than a data cache) is intended here.
    reader = PyPDF2.PdfReader(filepath)
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(page_texts)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ PyPDF2
2
+ streamlit