firqaaa committed on
Commit
eb88b82
1 Parent(s): 5da956e

Upload 6 files

Files changed (6)
  1. Dockerfile +43 -0
  2. app.py +1107 -0
  3. requirements.txt +16 -0
  4. schema.py +87 -0
  5. summ.py +68 -0
  6. utils.py +116 -0
Dockerfile ADDED
@@ -0,0 +1,43 @@
+ FROM python:3.9
+
+ RUN apt update && \
+     apt install -y bash \
+     poppler-utils \
+     tesseract-ocr \
+     libtesseract-dev \
+     build-essential \
+     git \
+     curl \
+     ca-certificates \
+     python3 \
+     python3-pip && \
+     rm -rf /var/lib/apt/lists
+
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+
+ # Switch to the "user" user
+ USER user
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN [ "python", "-c", "import nltk; nltk.download('punkt')" ]
+
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ COPY . .
+
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]
app.py ADDED
@@ -0,0 +1,1107 @@
+ # Author: Firqa Aqila Noor Arasyi
+ # Date: 2023-12-04
+
+
+ import os
+ import io
+ import json
+ import pandas as pd
+ import streamlit as st
+ from stqdm import stqdm
+ from ast import literal_eval
+ from tempfile import NamedTemporaryFile
+
+ import PyPDF2
+ import pdf2image
+ import pytesseract
+ from utils import *
+ from schema import *
+ from summ import get_summ
+ from datetime import datetime
+ import time
+ import base64
+ import string
+ import random
+ import numpy as np
+ import nltk
+
+ from langchain.llms import OpenAI
+ from langchain.chains import RetrievalQA
+ from langchain.vectorstores import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.document_loaders import TextLoader
+ from chromadb.utils import embedding_functions
+ from unstructured.partition.pdf import partition_pdf
+ from unstructured.staging.base import elements_to_json
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.chains import create_extraction_chain
+
+ from Bio import Entrez
+ nltk.download("punkt")
+
+ os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")
+ Entrez.email = os.getenv("ENTREZ_EMAIL")
+ Entrez.api_key = os.getenv("ENTREZ_API_KEY")
+
+ fold = -1
+ buffer = io.BytesIO()
+
+ @st.cache_data()
+ def convert_df(df):
+     return df.to_csv().encode("utf-8")
+
+ # Function to create a download link for an Excel file
+ # def create_excel_download_link(df, file_name):
+ #     output = io.BytesIO()
+ #     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
+ #         df.to_excel(writer, sheet_name='Sheet1', index=False)
+ #     excel_data = output.getvalue()
+ #     st.download_button(label="Download Excel File", data=excel_data, key=file_name, file_name=f"{file_name}.xlsx")
+
+ class Journal:
+
+     def __init__(self, name, bytes):
+         self.name = name
+         self.bytes = bytes
+
+     def __repr__(self):
+         return f"Journal(name='{self.name}', bytes='{self.bytes}')"
+
+ llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")
+
+ textex_chain = create_extraction_chain(textex_schema, llm)
+ tablex_chain = create_extraction_chain(tablex_schema, llm)
+
+ st.set_page_config(page_title="NutriGenMe Paper Extractor")
+ st.title("NutriGenMe - Paper Extraction")
+ st.markdown("<div style='text-align: left; color: white; font-size: 16px'>In its latest version, the app is equipped to extract essential information from papers, including tables in both horizontal and vertical orientations, images, and plain text.</div><br>", unsafe_allow_html=True)
+
+ uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
+
+ if uploaded_files:
+     st.warning("""
+     Warning! Prior to proceeding, please take a moment to review the following: \n
+     Certain guidelines apply when utilizing this application, particularly if you intend to extract information from tables, whether they are oriented horizontally or vertically.
+     - If you intend to perform multiple PDF processes using Horizontal Table Extraction, ensure that all your PDF files adhere to a horizontal table format
+     - If you plan to undertake multiple PDF processes with Vertical Table Extraction, ensure that all your PDF files conform to a vertical table format
+     """, icon="⚠️")
+
+ col1, col2, col3 = st.columns(3)
+
+ if uploaded_files:
+     journals = []
+     strategy = "hi_res"
+     model_name = "yolox"
+     on_h, on_v, on_t = None, None, None
+     parseButtonH, parseButtonV, parseButtonT = None, None, None
+     # if uploaded_files:
+     with col1:
+         if on_v or on_t:
+             on_h = st.toggle("Horizontal Table Extraction", disabled=True)
+         else:
+             on_h = st.toggle("Horizontal Table Extraction")
+         if on_h:
+             chunk_size_h = st.selectbox(
+                 'Number of tokens per process :',
+                 (16000, 12000, 10000, 8000, 5000), key='table_h'
+             )
+             parseButtonH = st.button("Get Result", key='table_H')
+
+     with col2:
+         if on_h or on_t:
+             on_v = st.toggle("Vertical Table Extraction", disabled=True)
+         else:
+             on_v = st.toggle("Vertical Table Extraction")
+         if on_v:
+             chunk_size_v = st.selectbox(
+                 'Number of tokens per process :',
+                 (16000, 12000, 10000, 8000, 5000), key='table_v'
+             )
+             parseButtonV = st.button("Get Result", key='table_V')
+     with col3:
+         if on_h or on_v:
+             on_t = st.toggle("Text Extraction ", disabled=True)
+         else:
+             on_t = st.toggle("Text Extraction ")
+         if on_t:
+             chunk_size_t = st.selectbox(
+                 'Number of tokens per process :',
+                 (16000, 12000, 10000, 8000, 5000), key='no_table'
+             )
+             parseButtonT = st.button("Get Result", key="no_Table")
+
133
+ if on_h:
134
+ if parseButtonH:
135
+ with st.status("Extraction in progress ...", expanded=True) as status:
136
+ st.write("Getting Result ...")
137
+ csv = pd.DataFrame()
138
+ for uploaded_file in stqdm(uploaded_files):
139
+ with NamedTemporaryFile(dir='.', suffix=".pdf") as pdf:
140
+ pdf.write(uploaded_file.getbuffer())
141
+ # st.write(pdf.name)
142
+ L = []
143
+ # Entity Extraction
144
+ st.write("☑ Extracting Entities ...")
145
+ bytes_data = uploaded_file.read()
146
+ journal = Journal(uploaded_file.name, bytes_data)
147
+
148
+ images = pdf2image.convert_from_bytes(journal.bytes)
149
+ extracted_text = ""
150
+ for image in images[:-1]:
151
+ text = pytesseract.image_to_string(image)
152
+ text = clean_text(text)
153
+ extracted_text += text + " "
154
+ text = replace_quotes(extracted_text)
155
+ text_chunk = split_text(text, chunk_size_h)
156
+
157
+ chunkdf = []
158
+ for i, chunk in enumerate(text_chunk):
159
+ inp = chunk
160
+ df = pd.DataFrame(literal_eval(str(json.dumps(tablex_chain.run(inp)[0])).replace("\'", "\"")), index=[0]).fillna('')
161
+ chunkdf.append(df)
162
+
163
+ concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
164
+ st.write("☑ Entities Extraction Done ..")
165
+ time.sleep(0.1)
166
+ st.write("☑ Generating Summary ...")
167
+ summary = get_summ(pdf.name)
168
+ st.write("☑ Generating Summary Done ..")
169
+ time.sleep(0.1)
170
+ st.write("☑ Table Extraction in progress ...")
171
+ # Table Extraction
172
+ # L = []
173
+ output_list = []
174
+
175
+ elements = partition_pdf(filename=pdf.name, strategy=strategy, infer_table_structure=True, model_name=model_name)
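+ # partition_pdf with the "hi_res" strategy and the "yolox" layout model detects table regions and
+ # exposes each table's HTML representation in the element metadata, which is collected below.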
176
+ with NamedTemporaryFile(dir=".", suffix=".json") as f:
177
+ elements_to_json(elements, filename=f"{f.name.split('/')[-1]}")
178
+ json_file_path = os.path.abspath(f.name) # Get the absolute file path
179
+ with open(json_file_path, "r", encoding="utf-8") as jsonfile:
180
+ data = json.load(jsonfile)
181
+ extracted_elements = []
182
+ for entry in data:
183
+ if entry["type"] == "Table":
184
+ extracted_elements.append(entry["metadata"]["text_as_html"])
185
+
186
+ with NamedTemporaryFile(dir='.' , suffix='.txt') as txt_file:
187
+ text_file_path = os.path.abspath(txt_file.name)
188
+ with open(text_file_path, "w", encoding="utf-8") as txtfile:
189
+ for element in extracted_elements:
190
+ txtfile.write(element + "\n\n")
191
+ loader = TextLoader(text_file_path)
192
+ documents = loader.load()
193
+ # split it into chunks
194
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
195
+ docs = text_splitter.split_documents(documents)
196
+ embeddings = OpenAIEmbeddings()
197
+
198
+ db = Chroma.from_documents(docs, embeddings)
199
+ llm_table = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)
200
+ qa_chain = RetrievalQA.from_chain_type(llm_table, retriever=db.as_retriever())
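+ # The table HTML fragments are embedded into a temporary Chroma vector store and queried through a
+ # RetrievalQA chain, so each question below only sees the table chunks most relevant to it.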
201
+
202
+ # List of questions
203
+ questions = [
+ """Mention all genes / locus names with their respective rsID / SNP and potential diseases in curly brackets like this:
+ Example 1 : {"Genes" : "FTO", "SNPs" : "rs9939609", "Diseases" : "Obesity"}
+ """,
+ """Mention all genes / locus names with their respective potential diseases in curly brackets like this:
+ Example 2 : {"Genes" : "FTO", "SNPs" : "" (if not available), "Diseases" : "Obesity"}
+ """,
+ """Mention all rsIDs / SNPs / Variants with their respective potential diseases / traits in curly brackets like this:
+ Example 3 : {"Genes" : "", "SNPs" : "rs9939609", "Diseases" : "Obesity"}
+ """
+ ]
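+ # Three retrieval passes over the extracted tables: the first asks for gene + SNP + disease triplets,
+ # the second for gene + disease pairs when no SNP is reported, and the third for SNP + disease pairs
+ # when no gene symbol is given.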
214
+ try:
215
+ for query in questions:
216
+ response = qa_chain({"query" : query})
217
+ output_list.append(response)
218
+ except Exception as e:
219
+ pass
220
+ db.delete_collection()
221
+
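+ # The three responses are parsed below: each result string is split on newlines, non-empty lines are
+ # parsed with literal_eval, and each row is merged with the entity fields already collected in `concat`.
+ # The KeyError / SyntaxError / ValueError branches handle lines the model did not format as expected.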
222
+ # 1
223
+ for i in range(len(output_list[0]['result'].split('\n'))):
224
+ if output_list[0]['result'].split('\n')[i] != "":
225
+ try:
226
+ row = literal_eval(output_list[0]['result'].split('\n')[i])[0]
227
+ row = {**row, **{
228
+ 'Title' : concat['title'][0],
229
+ 'Authors' : concat['authors'][0],
230
+ 'Publisher Name' : concat['publisher_name'][0],
231
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
232
+ # 'Population' : concat['population_race'][0],
233
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
234
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
235
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
236
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
237
+ 'Recommendation' : summary,
238
+ # 'Sample Size' : concat['sample_size'][0]
239
+ }}
240
+ if len(row['Genes'].strip().split(',')) > 1:
241
+ for g in row['Genes'].strip().split(','):
242
+ L.append({
243
+ 'Title' : concat['title'][0],
244
+ 'Authors' : concat['authors'][0],
245
+ 'Publisher Name' : concat['publisher_name'][0],
246
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
247
+ # 'Population' : concat['population_race'][0],
248
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
249
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
250
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
251
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
252
+ 'Recommendation' : summary,
253
+ # 'Sample Size' : concat['sample_size'][0],
254
+ 'Genes' : g.strip().upper().replace('Unknown', ''),
255
+ 'SNPs' : row['SNPs'].replace('Unknown', ''),
256
+ "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', '').replace('Unknown', '')
257
+ })
258
+ else:
259
+ L.append(row)
260
+
261
+ except KeyError:
262
+ row = literal_eval(output_list[0]['result'].split('\n')[i])
263
+ row = {**row, **{
264
+ 'Title' : concat['title'][0],
265
+ 'Authors' : concat['authors'][0],
266
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
267
+ "Publisher Name" : concat['publisher_name'][0],
268
+ # 'Population' : concat['population_race'][0],
269
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
270
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
271
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
272
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
273
+ 'Recommendation' : summary,
274
+ # 'Sample Size' : concat['sample_size'][0]
275
+ }
276
+ }
277
+ if len(row['Genes'].strip().split(',')) > 1:
278
+ for g in row['Genes'].strip().split(','):
279
+ L.append({
280
+ 'Title' : concat['title'][0],
281
+ 'Authors' : concat['authors'][0],
282
+ 'Publisher Name' : concat['publisher_name'][0],
283
+ 'Publication Year' :get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
284
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
285
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
286
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
287
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
288
+ 'Recommendation' : summary,
289
+ 'Genes' : g.strip().upper().replace('Unknown', ''),
290
+ 'SNPs' : row['SNPs'].replace('Unknown', ''),
291
+ "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', '').replace('Unknown', '')
292
+ })
293
+ else:
294
+ L.append(row)
295
+ except SyntaxError:
296
+ row = literal_eval(output_list[0]['result'].split('\n')[i])
297
+ row = f"""{row}"""
298
+ row = {**row, **{
299
+ 'Title' : concat['title'][0],
300
+ 'Authors' : concat['authors'][0],
301
+ 'Publisher Name' : concat['publisher_name'][0],
302
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
303
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
304
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
305
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
306
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
307
+ 'Recommendation' : summary,
308
+ # 'Population' : concat['population_race'][0],
309
+ # 'Sample Size' : concat['sample_size'][0]
310
+ }
311
+ }
312
+ if not row['SNPs'].startswith("rs"):
313
+ row.update({
314
+ 'SNPs' : "-"
315
+ })
316
+ else:
317
+ L.append(row)
318
+ except ValueError:
319
+ if type(output_list[0]['result'].split('\n')[i]) is dict:
320
+ row = output_list[0]['result'].split('\n')[i]
321
+ row = {**row, **{
322
+ 'Title' : concat['title'][0],
323
+ 'Authors' : concat['authors'][0],
324
+ 'Publisher Name' : concat['publisher_name'][0],
325
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
326
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
327
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
328
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
329
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
330
+ 'Recommendation' : summary,
331
+ }
332
+ }
333
+ if not row['SNPs'].startswith("rs"):
334
+ row.update({
335
+ 'SNPs' : "-"
336
+ })
337
+ else:
338
+ L.append(row)
339
+ # 2
340
+ for i in range(len(output_list[1]['result'].split('\n'))):
341
+ if output_list[1]['result'].split('\n')[i] != "":
342
+ try:
343
+ row = literal_eval(output_list[1]['result'].split('\n')[i])[0]
344
+ row = {**row, **{
345
+ 'Title' : concat['title'][0],
346
+ 'Authors' : concat['authors'][0],
347
+ 'Publisher Name' : concat['publisher_name'][0],
348
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
349
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
350
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
351
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
352
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
353
+ 'Recommendation' : summary,
354
+ }
355
+ }
356
+ if row['SNPs'] != "Not available":
357
+ row.update({
358
+ 'SNPs' : "Not available"
359
+ })
360
+ if len(row['Genes'].strip().split(',')) > 1:
361
+ for g in row['Genes'].strip().split(','):
362
+ L.append({
363
+ 'Title' : concat['title'][0],
364
+ 'Authors' : concat['authors'][0],
365
+ 'Publisher Name' : concat['publisher_name'][0],
366
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
367
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
368
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
369
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
370
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
371
+ 'Recommendation' : summary,
372
+ 'Genes' : g.strip().upper().replace('Unknown', ''),
373
+ "SNPs" : "Not available",
374
+ "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', '').replace('Unknown', '')
375
+ })
376
+ else:
377
+ L.append(row)
378
+ except KeyError:
379
+ row = literal_eval(output_list[1]['result'].split('\n')[i])
380
+ row = {**row, **{
381
+ 'Title' : concat['title'][0],
382
+ 'Authors' : concat['authors'][0],
383
+ 'Publisher Name' : concat['publisher_name'][0],
384
+ 'Publication Year' :get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
385
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
386
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
387
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
388
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
389
+ 'Recommendation' : summary,
390
+ }
391
+ }
392
+ if row['SNPs'] != "Not available":
393
+ row.update({
394
+ 'SNPs' : "Not available"
395
+ })
396
+ if len(row['Genes'].strip().split(',')) > 1:
397
+ for g in row['Genes'].strip().split(','):
398
+ L.append({
399
+ 'Title' : concat['title'][0],
400
+ 'Authors' : concat['authors'][0],
401
+ 'Publisher Name' : concat['publisher_name'][0],
402
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
403
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
404
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
405
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
406
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
407
+ 'Recommendation' : summary,
408
+ 'Genes' : g.strip().upper().replace('Unknown', ''),
409
+ "SNPs" : "Not available",
410
+ "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', '').replace('Unknown', '')
411
+ })
412
+ else:
413
+ L.append(row)
414
+ except SyntaxError:
415
+ row = f"""{row}"""
416
+ row = {**row, **{
417
+ 'Title' : concat['title'][0],
418
+ 'Authors' : concat['authors'][0],
419
+ 'Publisher Name' : concat['publisher_name'][0],
420
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
421
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
422
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
423
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
424
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
425
+ 'Recommendation' : summary,
426
+ }
427
+ }
428
+ if not row['SNPs'].startswith("rs"):
429
+ row.update({
430
+ 'SNPs' : "-"
431
+ })
432
+ else:
433
+ L.append(row)
434
+ except ValueError:
435
+ if type(output_list[1]['result'].split('\n')[i]) is dict:
436
+ row = output_list[1]['result'].split('\n')[i]
437
+ row = {**row, **{
438
+ 'Title' : concat['title'][0],
439
+ 'Authors' : concat['authors'][0],
440
+ 'Publisher Name' : concat['publisher_name'][0],
441
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
442
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
443
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
444
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
445
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
446
+ 'Recommendation' : summary,
447
+ }
448
+ }
449
+ if not row['SNPs'].startswith("rs"):
450
+ row.update({
451
+ 'SNPs' : "-"
452
+ })
453
+ else:
454
+ L.append(row)
455
+ # 3
456
+ for i in range(len(output_list[2]['result'].split('\n'))):
457
+ if output_list[2]['result'].split('\n')[i] != "":
458
+ try:
459
+ row = literal_eval(output_list[2]['result'].split('\n')[i])[0]
460
+ row = {**row, **{
461
+ 'Title' : concat['title'][0],
462
+ 'Authors' : concat['authors'][0],
463
+ 'Publisher Name' : concat['publisher_name'][0],
464
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
465
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
466
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
467
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
468
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
469
+ 'Recommendation' : summary,
470
+ }
471
+ }
472
+ if not row['SNPs'].startswith("rs"):
473
+ row.update({
474
+ 'SNPs' : "-"
475
+ })
476
+ else:
477
+ L.append(row)
478
+ except KeyError:
479
+ row = literal_eval(output_list[2]['result'].split('\n')[i])
480
+ row = {**row, **{
481
+ 'Title' : concat['title'][0],
482
+ 'Authors' : concat['authors'][0],
483
+ 'Publisher Name' : concat['publisher_name'][0],
484
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
485
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
486
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
487
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
488
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
489
+ 'Recommendation' : summary,
490
+ }
491
+ }
492
+ if not row['SNPs'].startswith("rs"):
493
+ row.update({
494
+ 'SNPs' : "-"
495
+ })
496
+ else:
497
+ L.append(row)
498
+ except SyntaxError:
499
+ row = f"""{row}"""
500
+ row = {**row, **{
501
+ 'Title' : concat['title'][0],
502
+ 'Authors' : concat['authors'][0],
503
+ 'Publisher Name' : concat['publisher_name'][0],
504
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
505
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
506
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
507
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
508
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
509
+ 'Recommendation' : summary,
510
+ }
511
+ }
512
+ if not row['SNPs'].startswith("rs"):
513
+ row.update({
514
+ 'SNPs' : "-"
515
+ })
516
+ else:
517
+ L.append(row)
518
+ except ValueError:
519
+ if type(output_list[2]['result'].split('\n')[i]) is dict:
520
+ row = output_list[2]['result'].split('\n')[i]
521
+ row = {**row, **{
522
+ 'Title' : concat['title'][0],
523
+ 'Authors' : concat['authors'][0],
524
+ 'Publisher Name' : concat['publisher_name'][0],
525
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
526
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
527
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
528
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
529
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
530
+ 'Recommendation' : summary,
531
+ }
532
+ }
533
+ if not row['SNPs'].startswith("rs"):
534
+ row.update({
535
+ 'SNPs' : "-"
536
+ })
537
+ else:
538
+ L.append(row)
539
+
540
+ st.write(output_list[2]['result'].split('\n'))
541
+ st.write("☑ Table Extraction Done ...")
542
+ status.update(label="Genes and SNPs successfully collected.")
+ csv = pd.DataFrame(L).replace('', 'Not available').replace('Unknown', '')
545
+ st.dataframe(csv)
546
+
547
+ generated_key = ''.join(random.choice(string.ascii_letters + string.digits) for i in range(16))
548
+ # if st.button("Download Excel File", key=generated_key):
549
+ # excel_link = create_excel_download_link(csv, uploaded_file.name.replace('.pdf', ''))
550
+ # st.markdown(excel_link, unsafe_allow_html=True)
551
+ with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
552
+ # Write each dataframe to a different worksheet
553
+ csv.to_excel(writer, sheet_name='Result')
554
+ writer.close()
555
+
556
+ # time_now = datetime.now()
557
+ # current_time = time_now.strftime("%H:%M:%S")
558
+
559
+ csv = convert_df(csv)
560
+ st.download_button(
561
+ label="Save Result",
562
+ data=buffer,
563
+ file_name=f'{uploaded_file.name}'.replace('.pdf', '') + '.xlsx',
564
+ mime='application/vnd.ms-excel',
565
+ key=generated_key
566
+ )
567
+
568
+ if on_v:
569
+ if parseButtonV:
570
+ with st.status("Extraction in progress ...", expanded=True) as status:
571
+ st.write("Getting Result ...")
572
+ csv = pd.DataFrame()
573
+ for uploaded_file in stqdm(uploaded_files):
574
+ L = []
575
+ with NamedTemporaryFile(dir='.', suffix=".pdf") as pdf:
576
+ pdf.write(uploaded_file.getbuffer())
577
+ # Open the PDF file in read-binary mode
578
+ with open(pdf.name, 'rb') as pdf_file:
579
+ # Create a PDF reader object
580
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
581
+ # Create a PDF writer object to write the rotated pages to a new PDF
582
+ pdf_writer = PyPDF2.PdfWriter()
583
+ # Iterate through each page in the original PDF
584
+ for page_num in range(len(pdf_reader.pages)):
585
+ # Get the page object
586
+ page = pdf_reader.pages[page_num]
587
+ # Rotate the page 90 degrees clockwise (use -90 for counterclockwise)
588
+ page.rotate(90)
589
+ # Add the rotated page to the PDF writer
590
+ pdf_writer.add_page(page)
591
+
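+ # Pages are rotated 90 degrees before layout analysis so that tables laid out vertically in the
+ # original paper reach the table-detection model in a horizontal orientation.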
592
+ with NamedTemporaryFile(dir='.', suffix=".pdf") as rotated_pdf:
593
+ pdf_writer.write(rotated_pdf.name)
594
+ # Entity Extraction
595
+ st.write("☑ Extracting Entities ...")
596
+ bytes_data = uploaded_file.read()
597
+ journal = Journal(uploaded_file.name, bytes_data)
598
+
599
+ images = pdf2image.convert_from_bytes(journal.bytes)
600
+ extracted_text = ""
601
+ for image in images[:-1]:
602
+ text = pytesseract.image_to_string(image)
603
+ text = clean_text(text)
604
+ extracted_text += text + " "
605
+ text = replace_quotes(extracted_text)
606
+ text_chunk = split_text(text, chunk_size_v)
607
+
608
+ chunkdf = []
609
+ for i, chunk in enumerate(text_chunk):
610
+ inp = chunk
611
+ df = pd.DataFrame(literal_eval(str(json.dumps(tablex_chain.run(inp)[0])).replace("\'", "\"")), index=[0]).fillna('')
612
+ chunkdf.append(df)
613
+
614
+ concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
615
+ st.write("☑ Entities Extraction Done ..")
616
+ time.sleep(0.1)
617
+ st.write("☑ Generating Summary ...")
618
+ summary = get_summ(pdf.name)
619
+ st.write("☑ Generating Summary Done ..")
620
+ time.sleep(0.1)
621
+ st.write("☑ Table Extraction in progress ...")
622
+
623
+ # Table Extraction
624
+ output_list = []
625
+
626
+ elements = partition_pdf(filename=rotated_pdf.name, strategy=strategy, infer_table_structure=True, model_name=model_name)
627
+ with NamedTemporaryFile(dir=".", suffix=".json") as f:
628
+ elements_to_json(elements, filename=f"{f.name.split('/')[-1]}")
629
+ json_file_path = os.path.abspath(f.name) # Get the absolute file path
630
+ with open(json_file_path, "r", encoding="utf-8") as jsonfile:
631
+ data = json.load(jsonfile)
632
+ extracted_elements = []
633
+ for entry in data:
634
+ if entry["type"] == "Table":
635
+ extracted_elements.append(entry["metadata"]["text_as_html"])
636
+
637
+ with NamedTemporaryFile(dir='.' , suffix='.txt') as txt_file:
638
+ text_file_path = os.path.abspath(txt_file.name)
639
+ with open(text_file_path, "w", encoding="utf-8") as txtfile:
640
+ for element in extracted_elements:
641
+ txtfile.write(element + "\n\n")
642
+ loader = TextLoader(text_file_path)
643
+ documents = loader.load()
644
+ # split it into chunks
645
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
646
+ docs = text_splitter.split_documents(documents)
647
+ embeddings = OpenAIEmbeddings()
648
+
649
+ db = Chroma.from_documents(docs, embeddings)
650
+ llm_table = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)
651
+ qa_chain = RetrievalQA.from_chain_type(llm_table, retriever=db.as_retriever())
652
+
653
+ # List of questions
654
+ questions = [
+ """Mention all genes / locus names with their respective rsID / SNP and potential diseases in curly brackets like this:
+ Example 1 : {"Genes" : "FTO", "SNPs" : "rs9939609", "Diseases" : "Obesity"}
+ """,
+ """Mention all genes / locus names with their respective potential diseases in curly brackets like this:
+ Example 2 : {"Genes" : "FTO", "SNPs" : "" (if not available), "Diseases" : "Obesity"}
+ """,
+ """Mention all rsIDs / SNPs / Variants with their respective potential diseases / traits in curly brackets like this:
+ Example 3 : {"Genes" : "", "SNPs" : "rs9939609", "Diseases" : "Obesity"}
+ """
+ ]
665
+ try:
666
+ for query in questions:
667
+ response = qa_chain({"query" : query})
668
+ output_list.append(response)
669
+ except Exception as e:
670
+ pass
671
+ db.delete_collection()
672
+ # 1
673
+ for i in range(len(output_list[0]['result'].split('\n'))):
674
+ if output_list[0]['result'].split('\n')[i] != "":
675
+ try:
676
+ row = literal_eval(output_list[0]['result'].split('\n')[i])[0]
677
+ row = {**row, **{
678
+ 'Title' : concat['title'][0],
679
+ 'Authors' : concat['authors'][0],
680
+ 'Publisher Name' : concat['publisher_name'][0],
681
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
682
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
683
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
684
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
685
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
686
+ 'Recommendation' : summary,
687
+ }}
688
+ if len(row['Genes'].strip().split(',')) > 1:
689
+ for g in row['Genes'].strip().split(','):
690
+ L.append({
691
+ 'Genes' : g.strip().upper(),
692
+ 'SNPs' : row['SNPs'],
693
+ "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', ''),
694
+ 'Title' : concat['title'][0],
695
+ 'Authors' : concat['authors'][0],
696
+ 'Publisher Name' : concat['publisher_name'][0],
697
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
698
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
699
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
700
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
701
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
702
+ 'Recommendation' : summary,
703
+ })
704
+ else:
705
+ L.append(row)
706
+ except KeyError:
707
+ row = literal_eval(output_list[0]['result'].split('\n')[i])
708
+ row = {**row, **{
709
+ 'Title' : concat['title'][0],
710
+ 'Authors' : concat['authors'][0],
711
+ 'Publisher Name' : concat['publisher_name'][0],
712
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
713
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
714
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
715
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
716
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
717
+ 'Recommendation' : summary,
718
+ }}
719
+ if len(row['Genes'].strip().split(',')) > 1:
720
+ for g in row['Genes'].strip().split(','):
721
+ L.append({
722
+ 'Genes' : g.strip().upper(),
723
+ 'SNPs' : row['SNPs'],
724
+ "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', ''),
725
+ 'Title' : concat['title'][0],
726
+ 'Authors' : concat['authors'][0],
727
+ 'Publisher Name' : concat['publisher_name'][0],
728
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
729
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
730
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
731
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
732
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
733
+ 'Recommendation' : summary,
734
+ })
735
+ else:
736
+ L.append(row)
737
+ except ValueError:
738
+ if type(output_list[0]['result'].split('\n')[i]) is dict:
739
+ row = output_list[0]['result'].split('\n')[i]
740
+ row = {**row, **{
741
+ 'Title' : concat['title'][0],
742
+ 'Authors' : concat['authors'][0],
743
+ 'Publisher Name' : concat['publisher_name'][0],
744
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
745
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
746
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
747
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
748
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
749
+ 'Recommendation' : summary,
750
+ }
751
+ }
752
+ if not row['SNPs'].startswith("rs"):
753
+ row.update({
754
+ 'SNPs' : "-"
755
+ })
756
+ else:
757
+ L.append(row)
758
+ except SyntaxError:
759
+ row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
760
+ row = {**row, **{
761
+ 'Title' : concat['title'][0],
762
+ 'Authors' : concat['authors'][0],
763
+ 'Publisher Name' : concat['publisher_name'][0],
764
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
765
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
766
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
767
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
768
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
769
+ 'Recommendation' : summary,
770
+ }
771
+ }
772
+ if not row['SNPs'].startswith("rs"):
773
+ row.update({
774
+ 'SNPs' : "-"
775
+ })
776
+ else:
777
+ L.append(row)
778
+ # 2
779
+ for i in range(len(output_list[1]['result'].split('\n'))):
780
+ if output_list[1]['result'].split('\n')[i] != "":
781
+ try:
782
+ row = literal_eval(output_list[1]['result'].split('\n')[i])[0]
783
+ row = {**row, **{
784
+ 'Title' : concat['title'][0],
785
+ 'Authors' : concat['authors'][0],
786
+ 'Publisher Name' : concat['publisher_name'][0],
787
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
788
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
789
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
790
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
791
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
792
+ 'Recommendation' : summary,
793
+ }}
794
+ if row['SNPs'] != "Not available":
795
+ row.update({
796
+ 'SNPs' : "Not available"
797
+ })
798
+ if len(row['Genes'].strip().split(',')) > 1:
799
+ for g in row['Genes'].strip().split(','):
800
+ L.append({
801
+ 'Genes' : g.strip().upper(),
802
+ "SNPs" : "Not available",
803
+ "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', ''),
804
+ 'Title' : concat['title'][0],
805
+ 'Authors' : concat['authors'][0],
806
+ 'Publisher Name' : concat['publisher_name'][0],
807
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
808
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
809
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
810
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
811
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
812
+ 'Recommendation' : summary,
813
+ })
814
+ else:
815
+ L.append(row)
816
+ except KeyError:
817
+ row = literal_eval(output_list[1]['result'].split('\n')[i])
818
+ row = {**row, **{
819
+ 'Title' : concat['title'][0],
820
+ 'Authors' : concat['authors'][0],
821
+ 'Publisher Name' : concat['publisher_name'][0],
822
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
823
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
824
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
825
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
826
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
827
+ 'Recommendation' : summary,
828
+ }}
829
+ if row['SNPs'] != "Not available":
830
+ row.update({
831
+ 'SNPs' : "Not available"
832
+ })
833
+ if len(row['Genes'].strip().split(',')) > 1:
834
+ for g in row['Genes'].strip().split(','):
835
+ L.append({
836
+ 'Genes' : g.strip().upper(),
837
+ "SNPs" : "Not available",
838
+ "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', ''),
839
+ 'Title' : concat['title'][0],
840
+ 'Authors' : concat['authors'][0],
841
+ 'Publisher Name' : concat['publisher_name'][0],
842
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
843
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
844
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
845
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
846
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
847
+ 'Recommendation' : summary,
848
+ })
849
+ else:
850
+ L.append(row)
851
+ except ValueError:
852
+ if type(output_list[1]['result'].split('\n')[i]) is dict:
853
+ row = output_list[1]['result'].split('\n')[i]
854
+ row = {**row, **{
855
+ 'Title' : concat['title'][0],
856
+ 'Authors' : concat['authors'][0],
857
+ 'Publisher Name' : concat['publisher_name'][0],
858
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
859
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
860
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
861
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
862
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
863
+ 'Recommendation' : summary,
864
+ }
865
+ }
866
+ if not row['SNPs'].startswith("rs"):
867
+ row.update({
868
+ 'SNPs' : "-"
869
+ })
870
+ else:
871
+ L.append(row)
872
+ except SyntaxError:
873
+ row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
874
+ row = {**row, **{
875
+ 'Title' : concat['title'][0],
876
+ 'Authors' : concat['authors'][0],
877
+ 'Publisher Name' : concat['publisher_name'][0],
878
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
879
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
880
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
881
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
882
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
883
+ 'Recommendation' : summary,
884
+ }
885
+ }
886
+ if not row['SNPs'].startswith("rs"):
887
+ row.update({
888
+ 'SNPs' : "-"
889
+ })
890
+ else:
891
+ L.append(row)
892
+ # 3
893
+ for i in range(len(output_list[2]['result'].split('\n'))):
894
+ if output_list[2]['result'].split('\n')[i] != "":
895
+ try:
896
+ row = literal_eval(output_list[2]['result'].split('\n')[i])[0]
897
+ row = {**row, **{
898
+ 'Title' : concat['title'][0],
899
+ 'Authors' : concat['authors'][0],
900
+ 'Publisher Name' : concat['publisher_name'][0],
901
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
902
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
903
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
904
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
905
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
906
+ 'Recommendation' : summary,
907
+ }
908
+ }
909
+ if not row['SNPs'].startswith("rs"):
910
+ row.update({
911
+ 'SNPs' : "-"
912
+ })
913
+ else:
914
+ L.append(row)
915
+ except KeyError:
916
+ row = literal_eval(output_list[2]['result'].split('\n')[i])
917
+ row = {**row, **{
918
+ 'Title' : concat['title'][0],
919
+ 'Authors' : concat['authors'][0],
920
+ 'Publisher Name' : concat['publisher_name'][0],
921
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
922
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
923
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
924
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
925
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
926
+ 'Recommendation' : summary,
927
+ }
928
+ }
929
+ if not row['SNPs'].startswith("rs"):
930
+ row.update({
931
+ 'SNPs' : "-"
932
+ })
933
+ else:
934
+ L.append(row)
935
+ except ValueError:
936
+ if type(output_list[2]['result'].split('\n')[i]) is dict:
937
+ row = output_list[2]['result'].split('\n')[i]
938
+ row = {**row, **{
939
+ 'Title' : concat['title'][0],
940
+ 'Authors' : concat['authors'][0],
941
+ 'Publisher Name' : concat['publisher_name'][0],
942
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
943
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
944
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
945
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
946
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
947
+ 'Recommendation' : summary,
948
+ }
949
+ }
950
+ if not row['SNPs'].startswith("rs"):
951
+ row.update({
952
+ 'SNPs' : "-"
953
+ })
954
+ else:
955
+ L.append(row)
956
+ except SyntaxError:
957
+ row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
958
+ row = {**row, **{
959
+ 'Title' : concat['title'][0],
960
+ 'Authors' : concat['authors'][0],
961
+ 'Publisher Name' : concat['publisher_name'][0],
962
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
963
+ 'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
964
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
965
+ 'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
966
+ 'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
967
+ 'Recommendation' : summary,
968
+ }
969
+ }
970
+ if not row['SNPs'].startswith("rs"):
971
+ row.update({
972
+ 'SNPs' : "-"
973
+ })
974
+ else:
975
+ L.append(row)
976
+ st.write("☑ Table Extraction Done")
977
+ status.update(label="Genes and SNPs successfully collected.")
+ csv = pd.DataFrame(L).replace('', 'Not available').replace('Unknown', '')
980
+ st.dataframe(csv)
981
+
982
+ with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
983
+ # Write each dataframe to a different worksheet
984
+ csv.to_excel(writer, sheet_name='Result')
985
+ writer.close()
986
+
987
+ time_now = datetime.now()
988
+ current_time = time_now.strftime("%H:%M:%S")
989
+
990
+ csv = convert_df(csv)
991
+ st.download_button(
992
+ label="Save Result",
993
+ data=buffer,
994
+ file_name=f'{uploaded_file.name}'.replace('.pdf', '') + '.xlsx',
995
+ mime='application/vnd.ms-excel'
996
+ )
997
+
998
+ if on_t:
999
+ if parseButtonT:
1000
+ with st.status("Extraction in progress ...", expanded=True) as status:
1001
+ st.write("Getting Result ...")
1002
+ csv = pd.DataFrame()
1003
+ for uploaded_file in stqdm(uploaded_files):
1004
+ L = []
1005
+ with NamedTemporaryFile(dir='.', suffix=".pdf") as pdf:
1006
+ pdf.write(uploaded_file.getbuffer())
1007
+
1008
+ # Entity Extraction
1009
+ st.write("☑ Extracting Entities ...")
1010
+ bytes_data = uploaded_file.read()
1011
+ journal = Journal(uploaded_file.name, bytes_data)
1012
+
1013
+ images = pdf2image.convert_from_bytes(journal.bytes)
1014
+ extracted_text = ""
1015
+ for image in images[:-1]:
1016
+ text = pytesseract.image_to_string(image)
1017
+ text = clean_text(text)
1018
+ extracted_text += text + " "
1019
+ text = replace_quotes(extracted_text)
1020
+ text_chunk = split_text(text, chunk_size_t)
1021
+
1022
+ chunkdf = []
1023
+ for i, chunk in enumerate(text_chunk):
1024
+ inp = chunk
1025
+ df = pd.DataFrame(literal_eval(str(json.dumps(textex_chain.run(inp)[0])).replace("\'", "\"")), index=[0]).fillna('')
1026
+ chunkdf.append(df)
1027
+
1028
+ concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
1029
+ st.write("☑ Entities Extraction Done ..")
1030
+ time.sleep(0.1)
1031
+ st.write("☑ Generating Summary ...")
1032
+
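+ # Post-process the extracted entities: keep only SNP values that look like rsIDs, and blank out
+ # placeholder strings such as 'N/A', 'not mentioned', or 'Unknown' so they do not pollute the final table.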
1033
+ concat['SNPs'] = concat['SNPs'].apply(lambda x: x if x.startswith('rs') else '')
1034
+ for col in list(concat.columns):
1035
+ concat[col] = concat[col].apply(lambda x: x if x not in ['N/A', 'not mentioned', 'Not mentioned', 'Unknown'] else '')
1036
+
1037
+ summary = get_summ(pdf.name)
1038
+ time.sleep(0.1)
1039
+ st.write("☑ Generating Summary Done...")
1040
+ for i in range(len(concat)):
1041
+ if (len(concat['genes_locus'][i].split(',')) >= 1) and concat['SNPs'][i] == '':
1042
+ for g in concat['genes_locus'][i].split(','):
1043
+ L.append({
1044
+ 'Title' : concat['title'][0],
1045
+ 'Author' : concat['authors'][0],
1046
+ 'Publisher Name' : concat['publisher'][0],
1047
+ 'Publication Year' : get_valid_year(' '.join(concat['publication_year'].values.tolist())),
1048
+ 'Genes' : g.upper(),
1049
+ 'Population' : upper_abbreviation(' '.join(np.unique(concat['population_race'].values.tolist())).title()),
1050
+ 'Diseases' : upper_abbreviation(' '.join(concat['diseases'].values.tolist()).title()),
1051
+ 'Sample Size' : sample_size_postproc(upper_abbreviation(' '.join(concat['sample_size'].values.tolist()).title())),
1052
+ 'SNPs' : concat['SNPs'][i],
1053
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).title()),
1054
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).title()),
1055
+ 'Recommendation' : summary,
1056
+ })
1057
+ elif (len(concat['SNPs'][i].split(',')) >= 1):
1058
+ for s in concat['SNPs'][i].split(','):
1059
+ try:
1060
+ L.append({
1061
+ 'Title' : concat['title'][0],
1062
+ 'Author' : concat['authors'][0],
1063
+ 'Publisher Name' : concat['publisher'][0],
1064
+ 'Publication Year' : get_valid_year(' '.join(concat['publication_year'].values.tolist())),
1065
+ 'Genes' : get_geneName(s.strip()).upper(),
1066
+ 'Population' : upper_abbreviation(' '.join(np.unique(concat['population_race'].values.tolist())).title()),
1067
+ 'Diseases' : upper_abbreviation(' '.join(concat['diseases'].values.tolist()).title()),
1068
+ 'Sample Size' : sample_size_postproc(upper_abbreviation(' '.join(concat['sample_size'].values.tolist()).title())),
1069
+ 'SNPs' : s,
1070
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).title()),
1071
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).title()),
1072
+ 'Recommendation' : summary,
1073
+ })
1074
+ except Exception as e:
1075
+ L.append({
1076
+ 'Title' : concat['title'][0],
1077
+ 'Author' : concat['authors'][0],
1078
+ 'Publisher Name' : concat['publisher'][0],
1079
+ 'Publication Year' : get_valid_year(' '.join(concat['publication_year'].values.tolist())),
1080
+ 'Genes' : '',
1081
+ 'Population' : upper_abbreviation(' '.join(np.unique(concat['population_race'].values.tolist())).title()),
1082
+ 'Diseases' : upper_abbreviation(' '.join(concat['diseases'].values.tolist()).title()),
1083
+ 'Sample Size' : sample_size_postproc(upper_abbreviation(' '.join(concat['sample_size'].values.tolist()).title())),
1084
+ 'SNPs' : s,
1085
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).title()),
1086
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).title()),
1087
+ 'Recommendation' : summary,
1088
+ })
1089
+
1090
+ csv = pd.concat([csv, pd.DataFrame(L)], ignore_index=True)
1091
+ status.update(label="Gene and SNPs successfully collected.")
1092
+ st.dataframe(csv)
1093
+ with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
1094
+ # Write the result dataframe to a single worksheet; the context manager
+ # closes the writer on exit, so no explicit writer.close() is needed.
1095
+ csv.to_excel(writer, sheet_name='Result')
1097
+
1098
+ time_now = datetime.now()
1099
+ current_time = time_now.strftime("%H:%M:%S")
1100
+
1101
+ csv = convert_df(csv)
1102
+ st.download_button(
1103
+ label="Save Result",
1104
+ data=buffer,
1105
+ file_name=f'{uploaded_file.name}'.replace('.pdf', '') + '.xlsx',
1106
+ mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
1107
+ )
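The export step above follows the usual pandas/Streamlit pattern: write the result table into an in-memory buffer with pd.ExcelWriter, then hand that buffer to st.download_button. A minimal, self-contained sketch of the same pattern (the DataFrame contents and file name are illustrative, not taken from app.py):

import io
import pandas as pd
import streamlit as st

result = pd.DataFrame({"Genes": ["APOE"], "SNPs": ["rs429358"]})  # placeholder rows

buffer = io.BytesIO()
with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
    result.to_excel(writer, sheet_name="Result", index=False)

st.download_button(
    label="Save Result",
    data=buffer,
    file_name="result.xlsx",
    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
)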
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ pikepdf
2
+ stqdm
3
+ pdf2image
4
+ PyPDF2
5
+ pytesseract
6
+ unstructured
7
+ chromadb==0.3.29
8
+ nltk
9
+ pandas
10
+ streamlit
11
+ xlsxwriter
12
+ openai
13
+ biopython
14
+ langchain
15
+ unstructured-pytesseract
16
+ unstructured-inference
schema.py ADDED
@@ -0,0 +1,87 @@
1
+ textex_schema = {
2
+ "properties" : {
3
+ "title" : {
4
+ "type" : "string",
5
+ "description" : "Title of the given text. Often located in the top of the first page."
6
+ },
7
+ "authors" : {
8
+ "type" : "string",
9
+ "description" : "Authors / writers of the given text. Some of the names of the people."
10
+ },
11
+ "publisher" : {
12
+ "type" : "string",
13
+ "description" : "Publisher name of the given text."
14
+ },
15
+ "publication_year" : {
16
+ "type" : "string",
17
+ "description" : "The year when the given text publised."
18
+ },
19
+ "genes_locus" : {
20
+ "type" : "string",
21
+ "description" : "The gene or locus names mentioned in the text."
22
+ },
23
+ "diseases" : {
24
+ "type" : "string",
25
+ "description" : "Diseases / Phenotypes / Traits corresponding to the Gene / Locus / SNP mentioned in the text."
26
+ },
27
+ "SNPs" : {
28
+ "type" : "string",
29
+ "description" : "SNPs (Single Nucleotide Polymorphism) / rsID mentioned in the text. Usually startwith `rs` followed by some numbers."
30
+ },
31
+ "population_race" : {
32
+ "type" : "string",
33
+ "description" : "Population / race used by the author in the given text."
34
+ },
35
+ "sample_size" : {
36
+ "type" : "string",
37
+ "description" : "Sample size of the population used in the research that mentioned in the paper."
38
+ },
39
+ "study_methodology" : {
40
+ "type" : "string",
41
+ "description" : "Study methodoly mentioned in the text."
42
+ },
43
+ "study_level" : {
44
+ "type" : "string",
45
+ "description" : "Study level mentioned in the text."
46
+ }
47
+ },
48
+ "required" : ["title"]
49
+ }
50
+
51
+ tablex_schema = {
52
+ "properties" : {
53
+ "title" : {
54
+ "type" : "string",
55
+ "description" : "Title of the given text. Often located in the top of the first page. Usually at the top of authors name."
56
+ },
57
+ "authors" : {
58
+ "type" : "string",
59
+ "description" : "Authors / writers of the given text. Some of the names of the people."
60
+ },
61
+ "publisher_name" : {
62
+ "type" : "string",
63
+ "description" : "Publisher name of the given text."
64
+ },
65
+ "year_of_publication" : {
66
+ "type" : "string",
67
+ "description" : "The year when the given text publised."
68
+ },
69
+ "population_race" : {
70
+ "type" : "string",
71
+ "description" : "Population / race used by the author in the given text."
72
+ },
73
+ "sample_size" : {
74
+ "type" : "string",
75
+ "description" : "Sample size of the population used in the research that mentioned in the paper."
76
+ },
77
+ "study_methodology" : {
78
+ "type" : "string",
79
+ "description" : "Study methodoly mentioned in the text."
80
+ },
81
+ "study_level" : {
82
+ "type" : "string",
83
+ "description" : "Study level mentioned in the text."
84
+ }
85
+ },
86
+ "required" : ["title"]
87
+ }
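These schemas list the fields to be pulled out of each paper. They are presumably passed to LangChain's extraction chain to build the `textex_chain` used in app.py; a minimal sketch of that wiring, assuming the same langchain/OpenAI stack that summ.py uses:

from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain

from schema import textex_schema

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
textex_chain = create_extraction_chain(textex_schema, llm)

# run() returns a list of extracted records (dicts); app.py keeps the first one per chunk.
records = textex_chain.run("The APOE variant rs429358 was associated with Alzheimer's disease ...")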
summ.py ADDED
@@ -0,0 +1,68 @@
1
+ import os
2
+ from langchain.chains.llm import LLMChain
3
+ from langchain.chat_models import ChatOpenAI
4
+ from langchain.prompts import PromptTemplate
5
+ from langchain.document_loaders import PDFPlumberLoader
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
8
+ from langchain.chains.combine_documents.stuff import StuffDocumentsChain
9
+
10
+ # OPENAI_API_KEY is expected to be set in the environment (e.g. as a Space
+ # secret); committing a real key to source control would leak it.
11
+
12
+ llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
13
+
14
+ def get_summ(path):
15
+
16
+ loader = PDFPlumberLoader(path)
17
+ docs = loader.load()
18
+ # Map
19
+ map_template = """The following is a set of documents
20
+ {docs}
21
+ Based on this list of docs, please identify the main themes
22
+ Helpful Answer:"""
23
+ map_prompt = PromptTemplate.from_template(map_template)
24
+ map_chain = LLMChain(llm=llm, prompt=map_prompt)
25
+
26
+ # Reduce
27
+ reduce_template = """The following is a set of summaries:
28
+ {doc_summaries}
29
+ Take these and distill them into a final, consolidated summary of the main themes.
30
+ Helpful Answer:"""
31
+ reduce_prompt = PromptTemplate.from_template(reduce_template)
32
+
33
+ # Run chain
34
+ reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
35
+
36
+ # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
37
+ combine_documents_chain = StuffDocumentsChain(
38
+ llm_chain=reduce_chain, document_variable_name="doc_summaries"
39
+ )
40
+
41
+ # Combines and iteratively reduces the mapped documents
42
+ reduce_documents_chain = ReduceDocumentsChain(
43
+ # This is the final chain that is called.
44
+ combine_documents_chain=combine_documents_chain,
45
+ # If documents exceed context for `StuffDocumentsChain`
46
+ collapse_documents_chain=combine_documents_chain,
47
+ # The maximum number of tokens to group documents into.
48
+ token_max=12000,
49
+ )
50
+
51
+ # Combining documents by mapping a chain over them, then combining results
52
+ map_reduce_chain = MapReduceDocumentsChain(
53
+ # Map chain
54
+ llm_chain=map_chain,
55
+ # Reduce chain
56
+ reduce_documents_chain=reduce_documents_chain,
57
+ # The variable name in the llm_chain to put the documents in
58
+ document_variable_name="docs",
59
+ # Return the results of the map steps in the output
60
+ return_intermediate_steps=False,
61
+ )
62
+
63
+ text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
64
+ chunk_size=12000, chunk_overlap=0
65
+ )
66
+ split_docs = text_splitter.split_documents(docs)
67
+
68
+ return map_reduce_chain.run(split_docs)
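Usage is a single call per paper. A minimal sketch (the PDF path is illustrative, and the API key is expected to come from the environment rather than from source code):

import os
from summ import get_summ

assert os.environ.get("OPENAI_API_KEY"), "set OPENAI_API_KEY before running"

summary = get_summ("example_paper.pdf")  # hypothetical local PDF path
print(summary)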
utils.py ADDED
@@ -0,0 +1,116 @@
1
+ import os
2
+ import shutil
3
+ import textwrap
4
+
5
+ import nltk
6
+ import re
7
+ from Bio import Entrez
8
+
9
+
10
+ def replace_quotes(text):
11
+ pattern = r'(?<=")[^"]*(?=")'
12
+ return re.sub(pattern, lambda match: match.group(0).replace('"', "'"), text)
13
+
14
+
15
+ def clean_text(text):
16
+ """Remove section titles and figure descriptions from text"""
17
+ pattern = r'[^\w\s]'
18
+ clean = "\n".join([row for row in text.split("\n") if (len(row.split(" "))) > 3 and not (row.startswith("(a)")) and not row.startswith("Figure")])
19
+ return re.sub(pattern, '', clean)
20
+
21
+
22
+ def truncate_text(text, max_tokens):
23
+ wrapper = textwrap.TextWrapper(width=max_tokens)
24
+ truncated_text = wrapper.wrap(text)
25
+ if len(truncated_text) > 0:
26
+ return truncated_text[0]
27
+ else:
28
+ return ""
29
+
30
+
31
+ def split_text(text, chunk_size):
32
+ chunks = []
33
+ start = 0
34
+ end = chunk_size
35
+ while start < len(text):
36
+ chunks.append(text[start:end])
37
+ start = end
38
+ end += chunk_size
39
+ return chunks
40
+
41
+
42
+ def extract_gene_name(text):
43
+
44
+ text_str = text.decode("utf-8")
45
+ text_str = text_str.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
46
+ pattern = r"<NAME>(.*?)</NAME>"
47
+ match = re.search(pattern, text_str)
48
+ if match:
49
+ gene_name = match.group(1)
50
+ return gene_name
51
+ else:
52
+ return None
53
+
54
+
55
+ def get_geneName(rsid):
56
+
57
+ text = Entrez.efetch(db="snp", id=rsid, retmode='xml').read()
58
+ text = extract_gene_name(text)
59
+ return text
60
+
61
+
62
+ def split_text_into_sentences(text, num_sentences):
63
+
64
+ sentences = nltk.sent_tokenize(text)
65
+ grouped_sentences = [sentences[i:i+num_sentences] for i in range(0, len(sentences), num_sentences)]
66
+ return grouped_sentences
67
+
68
+
69
+ def flatten_list(nested_list):
70
+
71
+ flattened_list = []
72
+ for item in nested_list:
73
+ if isinstance(item, list):
74
+ flattened_list.extend(flatten_list(item))
75
+ else:
76
+ flattened_list.append(item)
77
+ return flattened_list
78
+
79
+
80
+ def move_file(source_path, destination_path):
81
+
82
+ if not os.path.exists(destination_path):
83
+ os.makedirs(destination_path)
84
+
85
+ try:
86
+ shutil.move(source_path, destination_path)
87
+ print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
88
+ except Exception as e:
89
+ print(f"Error: {e}")
90
+
91
+
92
+ def upper_abbreviation(text):
93
+ pattern1 = r'\b(?:[A-Z][a-z.]*\.?\s*)+\b'
94
+ pattern2 = re.compile(r'unknown', re.IGNORECASE)
95
+ def convert_to_upper(match):
96
+ return match.group(0).replace('.', '').upper()
97
+ text = re.sub(pattern2, '', text)
98
+ output_string = re.sub(pattern1, convert_to_upper, text)
99
+ return output_string
100
+
101
+
102
+ def get_valid_year(input_text):
103
+ four_letter_words = re.findall(r'\b\w{4}\b', input_text)
104
+ result_text = ' '.join(four_letter_words)
105
+ if len(result_text.split(' ')) > 1:
106
+ return ''.join(result_text.split(' ')[0])
107
+ return result_text
108
+
109
+
110
+ def sample_size_postproc(text):
111
+ words = text.split()
112
+ pattern = r'\b[A-Za-z]+\d+\b'
113
+ cleaned_words = [word for word in words if not re.match(r'.*\d.*[A-Za-z].*$', word)]
114
+ cleaned_text = ' '.join(cleaned_words)
115
+ cleaned_text = re.sub(pattern, '', cleaned_text)
116
+ return cleaned_text
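A short usage sketch for the Entrez-based helpers. NCBI asks callers to identify themselves via Entrez.email before using the E-utilities; the email address and rsID below are illustrative:

from Bio import Entrez
from utils import get_geneName, get_valid_year, split_text

Entrez.email = "you@example.com"             # required etiquette for NCBI E-utilities

print(get_geneName("rs429358"))              # gene symbol reported by dbSNP for this rsID
print(get_valid_year("Published 2021 ..."))  # -> "2021"
print(len(split_text("a" * 25, 10)))         # -> 3 chunks of at most 10 characters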