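# Scraped from the Hugging Face file viewer; the page header is preserved below as a comment.
# Author: fadliaulawi | Commit: fb4710e ("Initial commit") | Size: 5.8 kB | Scan: no virus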
import io
import os
import pandas as pd
import streamlit as st
from datetime import datetime
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_core.documents.base import Document
from langchain_text_splitters import TokenTextSplitter
from process import get_entity, get_entity_one, get_table, validate
from tempfile import NamedTemporaryFile
from stqdm import stqdm
from threading import Thread
class CustomThread(Thread):
    """Thread that calls ``func(chunk)`` and remembers what it returned.

    The return value is stored on ``result`` (``''`` until ``run`` has
    finished), so callers read it after ``join()``.
    """

    def __init__(self, func, chunk):
        """Store the callable and the single argument it will be called with."""
        super().__init__()
        self.result = ''
        self.func, self.chunk = func, chunk

    def run(self):
        """Call the stored callable with the stored argument and keep the result."""
        callback, payload = self.func, self.chunk
        self.result = callback(payload)
# In-memory byte buffer that receives the exported Excel workbook.
buffer = io.BytesIO()

# Page title, header and introductory blurb.
st.set_page_config(page_title="NutriGenMe Paper Extractor")
st.title("NutriGenMe - Paper Extraction")
st.markdown(
    "<div style='text-align: left; color: white; font-size: 16px'>In its latest version, the app is equipped to extract essential information from papers, including tables in both horizontal and vertical orientations, images, and text exclusively.</div><br>",
    unsafe_allow_html=True,
)

# One or more PDFs; the uploader returns an empty (falsy) value until a file is chosen.
uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)

# Token budget per chunk. 0 is falsy, so the processing below keeps the
# per-page documents instead of merging the pages and re-chunking them.
chunk_option = st.selectbox(
    'Tokens amounts per process :',
    (32000, 16000, 8000, 0), key='table_hv'
)
chunk_overlap = 0
def _env_flag(name, default=True):
    """Read environment variable *name* as a boolean without using ``eval``.

    Returns *default* when the variable is unset. Otherwise the value is true
    only for ``1``, ``true``, ``yes`` or ``on`` (case-insensitive), so the
    previously supported ``'True'`` / ``'False'`` settings keep their meaning.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {'1', 'true', 'yes', 'on'}


# Column order of the exported table.
RESULT_COLUMNS = ['Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name', 'Publication Year', 'Population', 'Sample Size', 'Study Methodology', 'Study Level', 'Conclusion']

if uploaded_files:
    # NOTE(review): `journals` (and `csv` below) are not read in this section;
    # kept in case code outside this view relies on them.
    journals = []
    parseButtonHV = st.button("Get Result", key='table_HV')

    if parseButtonHV:
        with st.status("Extraction in progress ...", expanded=True) as status:
            start_time = datetime.now()
            csv = pd.DataFrame()
            delete_temp_pdf = _env_flag('DELETE_TEMP_PDF')

            for file_index, uploaded_file in enumerate(stqdm(uploaded_files)):
                with NamedTemporaryFile(dir='.', suffix=".pdf", delete=delete_temp_pdf) as pdf:
                    pdf.write(uploaded_file.getbuffer())
                    # The loader and get_table reopen the file by path, so
                    # buffered bytes must reach the disk before they run.
                    pdf.flush()

                    loader = PyPDFLoader(pdf.name)
                    pages = loader.load()

                    # Default: split the per-page documents with no overlap.
                    chunk_size = 120000
                    chunk_overlap = 0
                    docs = pages
                    if chunk_option:
                        # Merge all pages into one document, then re-chunk it
                        # with a 25% overlap between consecutive chunks.
                        docs = [Document(
                            page_content='\n'.join(page.page_content for page in pages),
                            metadata={'source': pages[0].metadata['source']},
                        )]
                        chunk_size = chunk_option
                        chunk_overlap = int(0.25 * chunk_size)

                    text_splitter = TokenTextSplitter.from_tiktoken_encoder(
                        chunk_size=chunk_size, chunk_overlap=chunk_overlap
                    )
                    chunks = text_splitter.split_documents(docs)

                    # Run the five extractors concurrently; results are read by
                    # position below, so the order of this list matters.
                    threads = [
                        CustomThread(get_entity, (chunks, 'gsd')),
                        CustomThread(get_entity, (chunks, 'summ')),
                        CustomThread(get_entity, (chunks, 'all')),
                        CustomThread(get_entity_one, [c.page_content for c in chunks[:1]]),
                        CustomThread(get_table, pdf.name),
                    ]
                    for worker in threads:
                        worker.start()
                    for worker in threads:
                        worker.join()

                result_gsd = threads[0].result
                result_summ = threads[1].result
                result = threads[2].result
                result_one = threads[3].result
                res_gene, res_snp, res_dis = threads[4].result

                # Combine: table findings first, then the 'gsd' entities.
                result['Genes'] = res_gene + result_gsd['Genes']
                result['SNPs'] = res_snp + result_gsd['SNPs']
                result['Diseases'] = res_dis + result_gsd['Diseases']
                result['Conclusion'] = result_summ
                result.update(result_one)

                # Guarantee at least one row so the scalar fields are exported.
                if not result['Genes']:
                    result['Genes'] = ['']

                num_rows = max(len(result['Genes']), len(result['SNPs']), len(result['Diseases']))

                # Pad Genes, SNPs and Diseases with '' to a common length.
                for k in ('Genes', 'SNPs', 'Diseases'):
                    result[k].extend([''] * (num_rows - len(result[k])))

                # Repeat every non-list value once per row.
                result = {key: value if isinstance(value, list) else [value] * num_rows for key, value in result.items()}

                dataframe = pd.DataFrame(result)
                dataframe = (
                    dataframe[RESULT_COLUMNS]
                    .drop_duplicates(['Genes', 'SNPs'])
                    .reset_index(drop=True)
                )
                cleaned_dataframe = validate(dataframe)

                # Elapsed time is measured from the button press, so it is
                # cumulative across the uploaded files.
                end_time = datetime.now()
                st.write("Success in ", round((end_time.timestamp() - start_time.timestamp()) / 60, 2), "minutes")
                st.dataframe(cleaned_dataframe)

                # Fresh buffer per file: reusing one buffer would write each
                # workbook on top of the previous file's bytes.
                buffer = io.BytesIO()
                # The context manager saves and closes the workbook on exit.
                with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                    cleaned_dataframe.to_excel(writer, sheet_name='Result')
                    dataframe.to_excel(writer, sheet_name='Original')

                st.download_button(
                    label="Save Result",
                    data=buffer,
                    file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}.xlsx",
                    mime='application/vnd.ms-excel',
                    # Explicit key so uploads that share a file name do not
                    # produce identical widgets.
                    key=f"download_{file_index}",
                )