fadliaulawi committed
Commit fb4710e
0 Parent(s):

Initial commit

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/main.yml ADDED
@@ -0,0 +1,26 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: LFS Install
+         run: git lfs install
+       - name: LFS Track
+         run: git lfs track *.pdf
+       - name: Checkout LFS objects
+         run: git lfs checkout
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push --force https://fadliaulawi:$HF_TOKEN@huggingface.co/spaces/KalbeDigitalLab/nutrigenme-paper-extractor main
.gitignore ADDED
@@ -0,0 +1,6 @@
+ __pycache__
+ .env
+ .vscode
+ resources/images/
+ resources/papers/
+ result/
Dockerfile ADDED
@@ -0,0 +1,44 @@
+ FROM python:3.9
+
+ RUN apt update && \
+     apt install -y bash \
+     poppler-utils \
+     tesseract-ocr \
+     libtesseract-dev \
+     build-essential \
+     git \
+     curl \
+     ca-certificates \
+     python3 \
+     python3-pip && \
+     rm -rf /var/lib/apt/lists
+
+ RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+
+ # Switch to the "user" user
+ USER user
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN [ "python", "-c", "import nltk; nltk.download('punkt')" ]
+
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ COPY . .
+
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]
README.md ADDED
@@ -0,0 +1,62 @@
+ ---
+ title: NutriGenMe PaperExtractor
+ emoji: 📄
+ colorFrom: green
+ colorTo: blue
+ sdk: docker
+ pinned: false
+ license: apache-2.0
+ app_port: 8501
+ ---
+
+ # NutriGenMe Paper Extractor
+
+ ## Overview
+ The NutriGenMe Paper Extractor is a tool designed to extract relevant information from genomic papers related to the NutriGenMe project. It uses natural language processing techniques to parse documents and extract key data points, enabling researchers and practitioners to efficiently gather insights from a large corpus of literature.
+
+ ## Features
+ - **Automated Extraction**: Automatically extracts entities such as the title, authors, and conclusion of a study from academic papers.
+ - **Fast Extraction**: Capable of extracting information from complex papers in under 10 minutes.
+ - **Table Extraction**: Extracts values from tables, particularly focusing on gene names, SNPs, and associated diseases.
+ - **Export to Excel**: Exports extraction results to Excel format for easy integration and further analysis.
+
+ ## Usage
+ 1. Clone this repository:
+ ```bash
+ git clone https://github.com/KalbeDigitalLab/nutrigenme-paper-extractor
+ ```
+
+ 2. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. Prepare environment keys (place them in a `.env` file at the project root; it is loaded with `python-dotenv`):
+ ```
+ # Credentials for GPT-4 Model
+ OPENAI_API_KEY=<openai_api_key>
+
+ # (Optional) Track your extraction process with LangSmith
+ LANGCHAIN_TRACING_V2='true'
+ LANGCHAIN_API_KEY=<langchain_api_key>
+ LANGCHAIN_ENDPOINT='https://api.smith.langchain.com'
+ LANGCHAIN_PROJECT=<project_name>
+ ```
+ 4. Run the application with `streamlit`:
+ ```bash
+ streamlit run app.py
+ ```
+
+ This program is also deployed as a 🤗 Hugging Face [Space](https://huggingface.co/spaces/KalbeDigitalLab/nutrigenme-paper-extractor/).
+
+ ## Documentation
+ **app.py**: Builds the user interface and drives the application flow, calling the other scripts for specific tasks.
+
+ **process.py**: Orchestrates the information extraction by delegating tasks to the other scripts and handling the overall workflow.
+
+ **prompt.py**: Stores the prompts crafted for Large Language Models (LLMs) to target specific information during extraction.
+
+ **table_detector.py**: Detects tables in page images and extracts their contents with Optical Character Recognition (OCR).
+
+ ## Contributing
+ Contributions are welcome! If you'd like to contribute to this project, feel free to create pull requests.
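
To make the Documentation section above concrete, here is a minimal, hypothetical sketch of running the same extraction pipeline without the Streamlit UI. It assumes a local `paper.pdf` and the environment keys from step 3; the chunk sizes mirror the defaults offered in `app.py`, and the call signatures come from `process.py`.

```python
# Hedged sketch only: drives process.py directly, bypassing the Streamlit front end.
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import TokenTextSplitter
from process import get_entity, get_table

pages = PyPDFLoader("paper.pdf").load()  # hypothetical input file
splitter = TokenTextSplitter.from_tiktoken_encoder(chunk_size=32000, chunk_overlap=8000)
chunks = splitter.split_documents(pages)

# Text-based entities: get_entity takes a (chunks, prompt_type) tuple, as in app.py.
text_entities = get_entity((chunks, 'gsd'))  # {'Genes': [...], 'SNPs': [...], 'Diseases': [...]}

# Table-based entities: pages are rendered to images, tables detected and OCR'd.
genes, snps, diseases = get_table("paper.pdf")

print(text_entities)
print(genes, snps, diseases)
```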
app.py ADDED
@@ -0,0 +1,131 @@
+ import io
+ import os
+ import pandas as pd
+ import streamlit as st
+
+ from datetime import datetime
+ from langchain_community.document_loaders.pdf import PyPDFLoader
+ from langchain_core.documents.base import Document
+ from langchain_text_splitters import TokenTextSplitter
+ from process import get_entity, get_entity_one, get_table, validate
+ from tempfile import NamedTemporaryFile
+ from stqdm import stqdm
+ from threading import Thread
+
+ class CustomThread(Thread):
+     def __init__(self, func, chunk):
+         super().__init__()
+         self.func = func
+         self.chunk = chunk
+         self.result = ''
+
+     def run(self):
+         self.result = self.func(self.chunk)
+
+ buffer = io.BytesIO()
+
+ st.cache_data()
+ st.set_page_config(page_title="NutriGenMe Paper Extractor")
+ st.title("NutriGenMe - Paper Extraction")
+ st.markdown("<div style='text-align: left; color: white; font-size: 16px'>In its latest version, the app is equipped to extract essential information from papers, including tables in both horizontal and vertical orientations, images, and text exclusively.</div><br>", unsafe_allow_html=True)
+
+ uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
+
+ chunk_option = st.selectbox(
+     'Tokens amounts per process :',
+     (32000, 16000, 8000, 0), key='table_hv'
+ )
+ chunk_overlap = 0
+
+ if uploaded_files:
+     journals = []
+     parseButtonHV = st.button("Get Result", key='table_HV')
+
+     if parseButtonHV:
+         with st.status("Extraction in progress ...", expanded=True) as status:
+             start_time = datetime.now()
+
+             csv = pd.DataFrame()
+             for uploaded_file in stqdm(uploaded_files):
+                 with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
+                     pdf.write(uploaded_file.getbuffer())
+                     loader = PyPDFLoader(pdf.name)
+                     pages = loader.load()
+
+                     chunk_size = 120000
+                     chunk_overlap = 0
+                     docs = pages
+
+                     if chunk_option:
+                         docs = [Document('\n'.join([page.page_content for page in pages]))]
+                         docs[0].metadata = {'source': pages[0].metadata['source']}
+
+                         chunk_size = chunk_option
+                         chunk_overlap = int(0.25 * chunk_size)
+
+                     text_splitter = TokenTextSplitter.from_tiktoken_encoder(
+                         chunk_size=chunk_size, chunk_overlap=chunk_overlap
+                     )
+                     chunks = text_splitter.split_documents(docs)
+
+                     threads = []
+                     threads.append(CustomThread(get_entity, (chunks, 'gsd')))
+                     threads.append(CustomThread(get_entity, (chunks, 'summ')))
+                     threads.append(CustomThread(get_entity, (chunks, 'all')))
+                     threads.append(CustomThread(get_entity_one, [c.page_content for c in chunks[:1]]))
+                     threads.append(CustomThread(get_table, pdf.name))
+
+                     [t.start() for t in threads]
+                     [t.join() for t in threads]
+
+                     result_gsd = threads[0].result
+                     result_summ = threads[1].result
+                     result = threads[2].result
+                     result_one = threads[3].result
+                     res_gene, res_snp, res_dis = threads[4].result
+
+                     # Combine
+                     result['Genes'] = res_gene + result_gsd['Genes']
+                     result['SNPs'] = res_snp + result_gsd['SNPs']
+                     result['Diseases'] = res_dis + result_gsd['Diseases']
+                     result['Conclusion'] = result_summ
+                     for k in result_one.keys():
+                         result[k] = result_one[k]
+
+                     if len(result['Genes']) == 0:
+                         result['Genes'] = ['']
+
+                     num_rows = max(max(len(result['Genes']), len(result['SNPs'])), len(result['Diseases']))
+
+                     # Adjust Genes, SNPs, Diseases
+                     for k in ['Genes', 'SNPs', 'Diseases']:
+                         while len(result[k]) < num_rows:
+                             result[k].append('')
+
+                         # Temporary handling
+                         result[k] = result[k][:num_rows]
+
+                     # Key Column
+                     result = {key: value if isinstance(value, list) else [value] * num_rows for key, value in result.items()}
+
+                     dataframe = pd.DataFrame(result)
+                     dataframe = dataframe[['Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name', 'Publication Year', 'Population', 'Sample Size', 'Study Methodology', 'Study Level', 'Conclusion']]
+                     dataframe.drop_duplicates(['Genes', 'SNPs'], inplace=True)
+                     dataframe.reset_index(drop=True, inplace=True)
+                     cleaned_dataframe = validate(dataframe)
+
+                     end_time = datetime.now()
+                     st.write("Success in ", round((end_time.timestamp() - start_time.timestamp()) / 60, 2), "minutes")
+
+                     st.dataframe(cleaned_dataframe)
+                     with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
+                         cleaned_dataframe.to_excel(writer, sheet_name='Result')
+                         dataframe.to_excel(writer, sheet_name='Original')
+                         writer.close()
+
+                     st.download_button(
+                         label="Save Result",
+                         data=buffer,
+                         file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}.xlsx",
+                         mime='application/vnd.ms-excel'
+                     )
process.py ADDED
@@ -0,0 +1,222 @@
+ from datetime import datetime
+ from dotenv import load_dotenv
+ from img2table.document import Image
+ from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
+ from langchain.chains.combine_documents.reduce import ReduceDocumentsChain
+ from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+ from langchain.chains.llm import LLMChain
+ from langchain.prompts import PromptTemplate
+ from langchain_openai import ChatOpenAI
+ from pdf2image import convert_from_path
+ from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table
+ from table_detector import detection_transform, device, model, ocr, outputs_to_objects
+
+ import io
+ import json
+ import os
+ import pandas as pd
+ import re
+ import torch
+
+ load_dotenv()
+
+ llm = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview")
+ llm_p = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
+
+ prompts = {
+     'gsd': [prompt_entity_gsd_chunk, prompt_entity_gsd_combine],
+     'summ': [prompt_entity_summ_chunk, prompt_entity_summ_combine],
+     'all': [prompt_entities_chunk, prompt_entities_combine]
+ }
+
+ def get_entity(data):
+
+     chunks, types = data
+
+     map_template = prompts[types][0]
+     map_prompt = PromptTemplate.from_template(map_template)
+     map_chain = LLMChain(llm=llm, prompt=map_prompt)
+
+     reduce_template = prompts[types][1]
+     reduce_prompt = PromptTemplate.from_template(reduce_template)
+     reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
+
+     combine_chain = StuffDocumentsChain(
+         llm_chain=reduce_chain, document_variable_name="doc_summaries"
+     )
+
+     reduce_documents_chain = ReduceDocumentsChain(
+         combine_documents_chain=combine_chain,
+         collapse_documents_chain=combine_chain,
+         token_max=100000,
+     )
+
+     map_reduce_chain = MapReduceDocumentsChain(
+         llm_chain=map_chain,
+         reduce_documents_chain=reduce_documents_chain,
+         document_variable_name="docs",
+         return_intermediate_steps=False,
+     )
+
+     result = map_reduce_chain.invoke(chunks)['output_text']
+     print(types)
+     print(result)
+     if types != 'summ':
+         result = re.findall('(\{[^}]+\})', result)[0]
+         return eval(result)
+
+     return result
+
+ def get_entity_one(chunks):
+
+     result = llm.invoke(prompt_entity_one_chunk.format(chunks)).content
+
+     print('One')
+     print(result)
+     result = re.findall('(\{[^}]+\})', result)[0]
+
+     return eval(result)
+
+ def get_table(path):
+
+     start_time = datetime.now()
+     images = convert_from_path(path)
+     print('PDF to Image', round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
+     tables = []
+
+     # Loop over pages
+     for image in images:
+
+         pixel_values = detection_transform(image).unsqueeze(0).to(device)
+         with torch.no_grad():
+             outputs = model(pixel_values)
+
+         id2label = model.config.id2label
+         id2label[len(model.config.id2label)] = "no object"
+         detected_tables = outputs_to_objects(outputs, image.size, id2label)
+
+         # Loop over tables in the page (if any)
+         for idx in range(len(detected_tables)):
+             cropped_table = image.crop(detected_tables[idx]["bbox"])
+             if detected_tables[idx]["label"] == 'table rotated':
+                 cropped_table = cropped_table.rotate(270, expand=True)
+
+             # TODO: what is the perfect threshold?
+             if detected_tables[idx]['score'] > 0.9:
+                 print(detected_tables[idx])
+                 tables.append(cropped_table)
+
+     print('Detect table from image', round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
+     genes = []
+     snps = []
+     diseases = []
+
+     # Loop over tables
+     for table in tables:
+
+         buffer = io.BytesIO()
+         table.save(buffer, format='PNG')
+         image = Image(buffer)
+
+         # Extract to dataframe
+         extracted_tables = image.extract_tables(ocr=ocr, implicit_rows=True, borderless_tables=True, min_confidence=0)
+
+         if len(extracted_tables) == 0:
+             continue
+
+         # Combine multiple dataframes
+         df_table = extracted_tables[0].df
+         for extracted_table in extracted_tables[1:]:
+             df_table = pd.concat([df_table, extracted_table.df]).reset_index(drop=True)
+
+         df_table.loc[0] = df_table.loc[0].fillna('')
+
+         # Identify multiple rows (in dataframe) as one row (in image)
+         rows = []
+         indexes = []
+         for i in df_table.index:
+             if not df_table.loc[i].isna().any():
+                 if len(indexes) > 0:
+                     rows.append(indexes)
+                 indexes = []
+             indexes.append(i)
+         rows.append(indexes)
+
+         df_table_cleaned = pd.DataFrame(columns=df_table.columns)
+         for row in rows:
+             row_str = df_table.loc[row[0]]
+             for idx in row[1:]:
+                 row_str += ' ' + df_table.loc[idx].fillna('')
+             row_str = row_str.str.strip()
+             df_table_cleaned.loc[len(df_table_cleaned)] = row_str
+
+         # Ask LLM with JSON data
+         json_table = df_table_cleaned.to_json(orient='records')
+         str_json_table = json.dumps(json.loads(json_table), indent=2)
+
+         result = llm.invoke(prompt_table.format(str_json_table)).content
+         print('table')
+         print(result)
+         result = result[result.find('['):result.rfind(']')+1]
+         try:
+             result = eval(result)
+         except SyntaxError:
+             result = []
+
+         for res in result:
+             res_gene = res['Genes']
+             res_snp = res['SNPs']
+             res_disease = res['Diseases']
+
+             for snp in res_snp:
+                 genes.append(res_gene)
+                 snps.append(snp)
+                 diseases.append(res_disease)
+
+     print('OCR table to extract', round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
+     print(genes, snps, diseases)
+
+     return genes, snps, diseases
+
+ def validate(df):
+
+     df = df.fillna('')
+     df['Genes'] = df['Genes'].str.upper()
+     df['SNPs'] = df['SNPs'].str.lower()
+
+     # Check if there are two gene names in one cell
+     sym = ['-', '/', '|']
+     for i in df.index:
+         gene = df.loc[i, 'Genes']
+         for s in sym:
+             if s in gene:
+                 genes = gene.split(s)
+                 df.loc[len(df)] = df.loc[i]
+                 df.loc[i, 'Genes'] = genes[0]
+                 df.loc[len(df) - 1, 'Genes'] = genes[1]
+
+     # Check if there are SNPs without 'rs'
+     for i in df.index:
+         safe = True
+         snp = df.loc[i, 'SNPs']
+         if not re.fullmatch('rs(\d)+|', snp):
+             if not re.fullmatch('s(\d)+', snp):
+                 if not re.fullmatch('(\d)+', snp):
+                     safe = False
+                     df = df.drop(i)
+                 else:
+                     snp = 'rs' + snp
+             else:
+                 snp = 'r' + snp
+
+         if safe:
+             df.loc[i, 'SNPs'] = snp
+
+     df.reset_index(drop=True, inplace=True)
+
+     # TODO: How to validate genes and SNPs?
+
+     # TODO: Validate genes and diseases with LLM
+     result = llm_p.invoke(model='mistral-7b-instruct', input='How many stars?')
+
+     return df
prompt.py ADDED
@@ -0,0 +1,289 @@
+ prompt_entity_gsd_chunk = """
+ # CONTEXT #
+ In my capacity as a genomics specialist, I have recently completed the review of a scholarly publication. I am interested in extracting specific genomic information, or entities, from the body of the paper.
+ To facilitate this process, I have constructed a predefined schema that outlines the desired entities and their corresponding descriptions.
+
+ This is the schema provided:
+
+ {{
+     "Genes" : {{
+         "type" : "list of strings",
+         "description" : "All relevant genes mentioned in the text. Gene names can only contain uppercase letters and digits."
+     }},
+     "SNPs" : {{
+         "type" : "list of strings",
+         "description" : "Unique identifier associated with each value in the Genes schema. These identifiers typically begin with 'rs' and appear near the gene name in the text."
+     }},
+     "Diseases" : {{
+         "type" : "list of strings",
+         "description" : "Type of diseases related to each value in Genes, typically appearing near the gene name in the text."
+     }}
+ }}
+
+ Note that the values within the Genes, SNPs, and Diseases lists correspond directly to each other. Consequently, the lengths of these lists must be identical.
+
+ This is a passage from the paper: {docs}
+
+ # OBJECTIVE #
+ Given a predefined schema outlining relevant entities, meticulously extract these entities from the provided text passage. Exercise caution when handling the reference section of the document; abstain from extracting information from this section.
+ IMPORTANT: It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility. If an entity is entirely absent from the passage, just leave the corresponding field blank with an empty string ('').
+
+ # RESPONSE #
+ The extracted information will utilize the JSON format. Information within strings will be demarcated by double quotes (" "), while quotes contained within strings will be denoted by single quotes (' '). List data will be enclosed within square brackets ([]).
+ This is an example of the response:
+
+ {{
+     "Genes": ["A", "B", "C"],
+     "SNPs": ["rs1", "rs2", "rs3"],
+     "Diseases": ["X", "Y", "Z"]
+ }}
+
+ If there are no extracted entities, just leave the corresponding fields blank with empty lists ([]).
+ """
+
+ prompt_entity_gsd_combine = """
+ # CONTEXT #
+ In my role as a genomics specialist, I have extracted specific entities from a scholarly publication. These entities were identified and retrieved from various sections throughout the document. My current objective is to consolidate this extracted information into a concise summary, facilitating a comprehensive understanding of the publication's key findings.
+ To achieve this, I have constructed a predefined schema that outlines the desired entities and the methods for their summarization.
+
+ {{
+     "Genes" : {{
+         "type" : "list of strings",
+         "description" : "Identify the most relevant genes from the compiled gene list across all sections."
+     }},
+     "SNPs" : {{
+         "type" : "list of strings",
+         "description" : "Upon completion of the gene combination process, associate each resulting value with its unique identifier."
+     }},
+     "Diseases" : {{
+         "type" : "list of strings",
+         "description" : "Upon completion of the gene combination process, associate each resulting value with its diseases."
+     }}
+ }}
+
+ This is a set of summaries: {doc_summaries}
+
+ If there are no extracted entities, leave the corresponding result as an empty list ([]).
+
+ # OBJECTIVE #
+ In the context of a predefined schema that specifies entities and their corresponding operations, construct a comprehensive synopsis that incorporates all critical details gleaned from each section.
+ IMPORTANT: It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility.
+
+ # RESPONSE #
+ The extracted information will utilize the JSON format. Information within strings will be demarcated by double quotes (" "), while quotes contained within strings will be denoted by single quotes (' '). List data will be enclosed within square brackets ([]).
+ This is an example of the response:
+
+ {{
+     "Genes": ["A", "B", "C"],
+     "SNPs": ["rs1", "rs2", "rs3"],
+     "Diseases": ["X", "Y", "Z"]
+ }}
+ """
+
+ prompt_entity_summ_chunk = """
+ # CONTEXT #
+ In my capacity as a genomics specialist, I have recently completed the review of a scholarly publication. I am interested in extracting the summary from the body of the paper.
+
+ This is a passage from the paper: {docs}
+
+ # OBJECTIVE #
+ Extract the summary or the conclusion from the provided text passage. Exercise caution when handling the reference section of the document; abstain from extracting information from this section.
+ IMPORTANT: It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility.
+
+ # RESPONSE #
+ Provide the information concisely, in four paragraphs. The text should be presented in a continuous format, omitting introductory elements like numbers or titles within each paragraph.
+
+ 1. Overview. Explanation of the provided documents and their exploration of the genetic underpinnings of the disease, and understanding of genetic factors in disease pathology.
+ 2. Main Themes. Identification of genetic variants and mutations contributing to disease susceptibility. Role of specific genes and genetic pathways in disease development and progression.
+ 3. Key Genetic Factors and Their Implications. Highlighting specific genes or genetic variants associated with the disease. Discussion of how these genetic factors may influence disease susceptibility, severity, or treatment response.
+ 4. Conclusion. Recap of the key findings regarding genetic factors and disease mechanisms. Suggestions for future research directions or clinical applications based on the insights gained from genetic analysis.
+ """
+
+ prompt_entity_summ_combine = """
+ # CONTEXT #
+ In my role as a genomics specialist, I have extracted some summaries from a scholarly publication. These summaries were identified and retrieved from various sections throughout the document.
+ My current objective is to consolidate this extracted information into a concise summary, facilitating a comprehensive understanding of the publication's key findings.
+
+ This is a set of summaries: {doc_summaries}
+
+ # OBJECTIVE #
+ Construct a comprehensive synopsis that incorporates all critical details gleaned from the summaries of each section.
+ IMPORTANT: It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility.
+
+ # RESPONSE #
+ Provide the information concisely, in four paragraphs. The text should be presented in a continuous format, omitting introductory elements like numbers or titles within each paragraph.
+
+ 1. Overview. Explanation of the provided documents and their exploration of the genetic underpinnings of the disease, and understanding of genetic factors in disease pathology.
+ 2. Main Themes. Identification of genetic variants and mutations contributing to disease susceptibility. Role of specific genes and genetic pathways in disease development and progression.
+ 3. Key Genetic Factors and Their Implications. Highlighting specific genes or genetic variants associated with the disease. Discussion of how these genetic factors may influence disease susceptibility, severity, or treatment response.
+ 4. Conclusion. Recap of the key findings regarding genetic factors and disease mechanisms. Suggestions for future research directions or clinical applications based on the insights gained from genetic analysis.
+ """
+
+ prompt_entities_chunk = """
+ # CONTEXT #
+ In my capacity as a genomics specialist, I have recently completed the review of a scholarly publication. I am interested in extracting specific genomic information, or entities, from the body of the paper.
+ To facilitate this process, I have constructed a predefined schema that outlines the desired entities and their corresponding descriptions.
+
+ This is the schema provided:
+
+ {{
+     "Population" : {{
+         "type" : "string",
+         "description" : "Population / race used by the author in the given text."
+     }},
+     "Sample Size" : {{
+         "type" : "string",
+         "description" : "Sample size of the population used in the research mentioned in the paper."
+     }},
+     "Study Methodology" : {{
+         "type" : "string",
+         "description" : "Study methodology mentioned in the text."
+     }},
+     "Study Level" : {{
+         "type" : "string",
+         "description" : "Study level mentioned in the text."
+     }}
+ }}
+
+ This is a passage from the paper: {docs}
+
+ # OBJECTIVE #
+ Given a predefined schema outlining relevant entities, meticulously extract these entities from the provided text passage. Exercise caution when handling the reference section of the document; abstain from extracting information from this section.
+ IMPORTANT: It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility. If an entity is entirely absent from the passage, just leave the corresponding field blank with an empty string ('').
+
+ # RESPONSE #
+ The extracted information will utilize the JSON format. Information within strings will be demarcated by double quotes (" "), while quotes contained within strings will be denoted by single quotes (' ').
+ This is an example of the response:
+
+ {{
+     "Population": "South Asian",
+     "Sample Size": "403 Relatively Small",
+     "Study Methodology": "Double-Blind Randomized Controlled Trial",
+     "Study Level": "Postdoctoral"
+ }}
+
+ If there are no extracted entities, just leave the corresponding field blank with an empty string ("").
+ """
+
+ prompt_entities_combine = """
+ # CONTEXT #
+ In my role as a genomics specialist, I have extracted some summaries from a scholarly publication. These summaries were identified and retrieved from various sections throughout the document. My current objective is to consolidate this extracted information into a concise summary, facilitating a comprehensive understanding of the publication's key findings.
+
+ This is a set of summaries: {doc_summaries}
+
+ If there are no extracted entities, leave the corresponding result as an empty string ("").
+
+ # OBJECTIVE #
+ Construct a comprehensive synopsis that incorporates all critical details gleaned from the summaries of each section.
+ IMPORTANT: It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility.
+
+ # RESPONSE #
+ The extracted information will utilize the JSON format. Information within strings will be demarcated by double quotes (" "), while quotes contained within strings will be denoted by single quotes (' ').
+ This is an example of the response:
+
+ {{
+     "Population": "South Asian",
+     "Sample Size": "403 Relatively Small",
+     "Study Methodology": "Double-Blind Randomized Controlled Trial",
+     "Study Level": "Postdoctoral"
+ }}
+ """
+
+ prompt_entity_one_chunk = """
+ # CONTEXT #
+ In my capacity as a genomics specialist, I have recently completed the review of a scholarly publication. I am interested in extracting specific genomic information, or entities, from the body of the paper. To facilitate this process, I have constructed a predefined schema that outlines the desired entities and their corresponding descriptions.
+
+ This is the schema provided:
+
+ {{
+     "Title" : {{
+         "type" : "string",
+         "description" : "Title of the given text."
+     }},
+     "Authors" : {{
+         "type" : "string",
+         "description" : "Authors / writers of the given text. To maintain readability, consider only the first 10 author names, ensuring the other key information and the response format stay clear."
+     }},
+     "Publisher Name" : {{
+         "type" : "string",
+         "description" : "Publisher name of the given text."
+     }},
+     "Publication Year" : {{
+         "type" : "string",
+         "description" : "The year when the given text was published."
+     }}
+ }}
+
+ This is a passage from the paper: {}
+
+ # OBJECTIVE #
+ Given a predefined schema outlining relevant entities, meticulously extract these entities from the provided text passage.
+ IMPORTANT: It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility. If an entity is entirely absent from the passage, just leave the corresponding field blank with an empty string ('').
+
+ # RESPONSE #
+ The extracted information will utilize the JSON format. Information within strings will be demarcated by double quotes (" "), while quotes contained within strings will be denoted by single quotes (' ').
+ This is an example of the response:
+
+ {{
+     "Title": "Lorem Ipsum",
+     "Authors": "John Doe, Jane Doe, Alias",
+     "Publisher Name": "Journal of Internal Medicine",
+     "Publication Year": "2024"
+ }}
+
+ If there are no extracted entities, just leave the corresponding field blank with an empty string ("").
+ """
+
+ prompt_table = """
+ # CONTEXT #
+ In my capacity as a genomics specialist, I have table data obtained from a published research paper in the field of genomics. The data is provided as a list of JSON objects, with each JSON object representing a single row in a tabular structure. The first JSON element in the list represents the header row of the table, containing the names of each column.
+ This is the data:
+ {}
+
+ # OBJECTIVE #
+ Given the provided table data, the following tasks need to be completed:
+
+ 1. Identify all unique gene names present within the table. Each row can contain more than one gene name.
+ 2. If present, extract any entries starting with "rs" (presumably representing Single Nucleotide Polymorphisms or rsIDs) that correspond to the same row as their associated gene names. Each gene name can correspond to more than one SNP.
+ 3. If available, extract any disease information associated with both the gene name and its corresponding SNP/rsID.
+
+ It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility.
+ If an SNP or disease is absent from the table, leave the corresponding field blank with an empty string ('').
+
+ # RESPONSE #
+ The output should only be a string containing a list of JSON objects, each representing an entry with the following structure:
+ [
+     {{
+         "Genes": "A",
+         "SNPs": ["rs123", "rs456"],
+         "Diseases": "A disease"
+     }}
+ ]
+
+ If no entities can be extracted from the table, just return an empty list ([]).
+ """
+
+ prompt_validation = """
+ # CONTEXT #
+ In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided as a list of JSON objects, with each JSON object representing a single row in a tabular structure.
+ Because the data was extracted using OCR, some gene names and SNPs may contain typos.
+
+ This is the data:
+ {}
+
+ # OBJECTIVE #
+ Given the provided table data, the following tasks need to be completed:
+
+ 1. Check whether the gene name is a valid gene name. If it appears to be a typo, correct it; if it cannot be recognized as a valid gene name, remove the row.
+ 2. If the disease field is not empty, check whether it corresponds to the gene name, and replace it with the correct disease if the original one is wrong.
+
+ # RESPONSE #
+ The output should only be a string containing a list of JSON objects, each representing a validated entry with the following structure:
+ [
+     {{
+         "Genes": "A",
+         "SNPs": "rs123",
+         "Diseases": "A disease"
+     }}
+ ]
+ """
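
As a quick orientation, the sketch below shows (hypothetically) how `prompt_table` is meant to be consumed, mirroring the flow in `process.py`: the OCR'd table is serialized to JSON, substituted into the template, and the bracketed list in the model's reply is sliced back out. The sample row is invented purely for illustration.

```python
# Hedged illustration of the prompt_table round trip (mirrors process.py).
import json
from langchain_openai import ChatOpenAI
from prompt import prompt_table

llm = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview")

rows = [{"Gene": "FTO", "SNP": "rs9939609", "Disease": "Obesity"}]  # made-up example row
reply = llm.invoke(prompt_table.format(json.dumps(rows, indent=2))).content

# process.py keeps only the JSON list between the first '[' and the last ']'
payload = reply[reply.find('['):reply.rfind(']') + 1]
print(payload)
```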
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ pikepdf
+ stqdm
+ pdf2image
+ nltk
+ pandas
+ streamlit
+ xlsxwriter
+ openai
+ biopython
+ langchain
+ pypdf
+ tiktoken
+ pillow-heif
+ torchvision
+ transformers
+ python-dotenv
+ rapidocr-onnxruntime
+ langchain-openai
+ img2table
+ timm
+ python-doctr
resources/experiment.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
resources/experiment.py ADDED
@@ -0,0 +1,64 @@
+ import pandas as pd
+ import base64
+
+ from img2table.document import Image
+ from img2table.ocr import DocTR
+ from langchain.schema.messages import HumanMessage, AIMessage
+ from langchain_experimental.agents import create_pandas_dataframe_agent
+ from langchain_openai import ChatOpenAI
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+ path = '../NutriGenMe-Testing/ukmss-1.png'
+
+ vision = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=4096)
+
+ img = encode_image(path)
+ msg = vision.invoke(
+     [
+         AIMessage(content="You are an experienced doctor specializing in genomics and want to identify names of genes, SNPs, and their related diseases based on the tables given."),
+         HumanMessage(
+             content=[
+                 { "type": "text",
+                   "text": 'You will be provided with the image of a table. Extract all genes / locus names with its respective rsID / SNP and potential diseases in curly brackets like this: {"Genes" : "", "SNPs" : "", "Diseases" : ""}.'
+                 },
+                 {
+                     "type": "image_url",
+                     "image_url": {
+                         "url": f"data:image/jpeg;base64,{img}",
+                         "detail": "low"
+                     },
+                 },
+             ]
+         )
+     ]
+ )
+
+ print(msg.content)
+
+ # exit()
+
+ image = Image(path)
+ ocr = DocTR()
+
+ extracted_tables = image.extract_tables(ocr=ocr,
+                                         implicit_rows=True,
+                                         borderless_tables=True,
+                                         min_confidence=0)
+
+ df = extracted_tables[0].df
+ for et in extracted_tables[1:]:
+     df = pd.concat([df, et.df]).reset_index(drop=True)
+
+ print(df)
+
+ llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+
+ agent = create_pandas_dataframe_agent(llm, df, verbose=True)
+ agent_output = agent.invoke("Does this table contain Gene names?")
+ print(agent_output)
resources/paper-extractor.excalidraw ADDED
The diff for this file is too large to render. See raw diff
 
table_detector.py ADDED
@@ -0,0 +1,68 @@
+ from img2table.ocr import DocTR
+ from torchvision import transforms
+ from transformers import AutoModelForObjectDetection
+
+ import torch
+
+ def box_cxcywh_to_xyxy(x):
+     x_c, y_c, w, h = x.unbind(-1)
+     b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
+     return torch.stack(b, dim=1)
+
+ def rescale_bboxes(out_bbox, size):
+     width, height = size
+     boxes = box_cxcywh_to_xyxy(out_bbox)
+     boxes = boxes * torch.tensor(
+         [width, height, width, height], dtype=torch.float32
+     )
+     return boxes
+
+ def outputs_to_objects(outputs, img_size, id2label):
+     m = outputs.logits.softmax(-1).max(-1)
+     pred_labels = list(m.indices.detach().cpu().numpy())[0]
+     pred_scores = list(m.values.detach().cpu().numpy())[0]
+     pred_bboxes = outputs["pred_boxes"].detach().cpu()[0]
+     pred_bboxes = [
+         elem.tolist() for elem in rescale_bboxes(pred_bboxes, img_size)
+     ]
+
+     objects = []
+     for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes):
+         class_label = id2label[int(label)]
+         if not class_label == "no object":
+             objects.append(
+                 {
+                     "label": class_label,
+                     "score": float(score),
+                     "bbox": [float(elem) for elem in bbox],
+                 }
+             )
+
+     return objects
+
+ class MaxResize(object):
+     def __init__(self, max_size=800):
+         self.max_size = max_size
+
+     def __call__(self, image):
+         width, height = image.size
+         current_max_size = max(width, height)
+         scale = self.max_size / current_max_size
+         resized_image = image.resize(
+             (int(round(scale * width)), int(round(scale * height)))
+         )
+
+         return resized_image
+
+ detection_transform = transforms.Compose(
+     [
+         MaxResize(800),
+         transforms.ToTensor(),
+         transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+     ]
+ )
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm").to(device)
+
+ ocr = DocTR()