fadliaulawi commited on
Commit
31eb136
1 Parent(s): 63bec36

Tidy up interface

Browse files
Files changed (2) hide show
  1. app.py +118 -108
  2. resources/experiment.ipynb +584 -77
app.py CHANGED
@@ -11,17 +11,20 @@ from langchain_text_splitters import TokenTextSplitter
11
  from process import Process
12
  from tempfile import NamedTemporaryFile
13
  from stqdm import stqdm
 
14
 
15
  buffer = io.BytesIO()
16
 
17
  st.cache_data()
18
  st.set_page_config(page_title="NutriGenMe Paper Extractor")
19
- st.title("NutriGenMe - Paper Extraction")
20
- st.markdown("<div style='text-align: left; color: white; font-size: 16px'>In its latest version, the app is equipped to extract essential information from papers, including tables in both horizontal and vertical orientations, images, and text exclusively.</div><br>", unsafe_allow_html=True)
 
21
 
22
- uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
23
-
24
- col1, col2, col3 = st.columns(3)
 
25
 
26
  with col1:
27
  models = (
@@ -30,9 +33,7 @@ with col1:
30
  # 'llama-3-sonar-large-32k-chat',
31
  # 'mixtral-8x7b-instruct',
32
  )
33
- model = st.selectbox(
34
- 'Model selection:', models, key='model'
35
- )
36
 
37
  with col2:
38
  tokens = (
@@ -40,118 +41,127 @@ with col2:
40
  16000,
41
  24000
42
  )
43
- chunk_option = st.selectbox(
44
- 'Token amounts per process:', tokens, key='token'
45
- )
46
  chunk_overlap = 0
47
 
48
  with col3:
49
  models_val = (
50
- 'gemini-1.5-pro-latest',
51
  'gpt-4-turbo',
 
52
  'mixtral-8x7b-instruct',
53
  # 'llama-3-sonar-large-32k-chat',
54
  )
55
- model_val = st.selectbox(
56
- 'Model validator selection:', models_val, key='model_val'
57
- )
 
 
 
 
 
 
58
 
 
 
 
59
 
60
  if uploaded_files:
61
- journals = []
62
- parseButtonHV = st.button("Get Result", key='table_HV')
63
 
64
- if parseButtonHV:
65
- with st.status("Extraction in progress ...", expanded=True) as status:
66
- start_time = datetime.now()
67
 
68
- for uploaded_file in stqdm(uploaded_files):
69
- with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
70
- pdf.write(uploaded_file.getbuffer())
71
-
72
- # Load Documents
73
- loader = PyPDFLoader(pdf.name)
74
- pages = loader.load()
75
-
76
- chunk_size = 120000
77
- chunk_overlap = 0
78
- docs = pages
79
-
80
- # Split Documents
81
- if chunk_option:
82
- docs = [Document('\n'.join([page.page_content for page in pages]))]
83
- docs[0].metadata = {'source': pages[0].metadata['source']}
84
-
85
- chunk_size = chunk_option
86
- chunk_overlap = int(0.25 * chunk_size)
87
-
88
- text_splitter = TokenTextSplitter.from_tiktoken_encoder(
89
- chunk_size=chunk_size, chunk_overlap=chunk_overlap
90
- )
91
- chunks = text_splitter.split_documents(docs)
92
-
93
- # Start extraction process in parallel
94
- process = Process(model, model_val)
95
- with ThreadPoolExecutor() as executor:
96
- result_gsd = executor.submit(process.get_entity, (chunks, 'gsd'))
97
- result_summ = executor.submit(process.get_entity, (chunks, 'summ'))
98
- result = executor.submit(process.get_entity, (chunks, 'all'))
99
- result_one = executor.submit(process.get_entity_one, [c.page_content for c in chunks[:1]])
100
- result_table = executor.submit(process.get_table, pdf.name)
101
-
102
- result_gsd = result_gsd.result()
103
- result_summ = result_summ.result()
104
- result = result.result()
105
- result_one = result_one.result()
106
- res_gene, res_snp, res_dis = result_table.result()
107
-
108
- # Combine Result
109
- result['Genes'] = res_gene + result_gsd['Genes']
110
- result['SNPs'] = res_snp + result_gsd['SNPs']
111
- result['Diseases'] = res_dis + result_gsd['Diseases']
112
- result['Conclusion'] = result_summ
113
- for k in result_one.keys():
114
- result[k] = result_one[k]
115
-
116
- if len(result['Genes']) == 0:
117
- result['Genes'] = ['']
118
-
119
- num_rows = max(max(len(result['Genes']), len(result['SNPs'])), len(result['Diseases']))
120
-
121
- # Adjust Genes, SNPs, Diseases
122
- for k in ['Genes', 'SNPs', 'Diseases']:
123
- while len(result[k]) < num_rows:
124
- result[k].append('')
125
-
126
- # Temporary handling
127
- result[k] = result[k][:num_rows]
128
-
129
- # Key Column
130
- result = {key: value if isinstance(value, list) else [value] * num_rows for key, value in result.items()}
131
-
132
- dataframe = pd.DataFrame(result)
133
- dataframe = dataframe[['Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name', 'Publication Year', 'Population', 'Sample Size', 'Study Methodology', 'Study Level', 'Conclusion']]
134
- dataframe = dataframe[dataframe['Genes'].astype(bool)].reset_index(drop=True)
135
- dataframe.drop_duplicates(['Genes', 'SNPs'], inplace=True)
136
- dataframe.reset_index(drop=True, inplace=True)
137
-
138
- # Validate Result
139
- df, df_no_llm, df_clean = process.validate(dataframe)
140
-
141
- end_time = datetime.now()
142
- st.write("Success in ", round((end_time.timestamp() - start_time.timestamp()) / 60, 2), "minutes")
143
-
144
- st.dataframe(df)
145
- with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
 
 
 
146
  df.to_excel(writer, sheet_name='Result Cleaned API LLM')
147
  df_no_llm.to_excel(writer, sheet_name='Result Cleaned API')
148
- df_clean.to_excel(writer, sheet_name='Result Cleaned')
149
- dataframe.to_excel(writer, sheet_name='Original')
150
- writer.close()
151
-
152
- st.download_button(
153
- label="Save Result",
154
- data=buffer,
155
- file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx",
156
- mime='application/vnd.ms-excel'
157
- )
 
 
 
11
  from process import Process
12
  from tempfile import NamedTemporaryFile
13
  from stqdm import stqdm
14
+ from validate import Validation
15
 
16
  buffer = io.BytesIO()
17
 
18
  st.cache_data()
19
  st.set_page_config(page_title="NutriGenMe Paper Extractor")
20
+ st.title("NutriGenMe - Paper Extractor")
21
+ st.markdown("<div style='text-align: justify;text-justify: inter-word;'>NutriGenMe Paper Extractor is a tool designed to extract relevant information from genomic papers related to the NutriGenMe project. It utilizes natural language processing techniques to parse through documents and extract key data points, enabling researchers and practitioners to efficiently gather insights from a large corpus of literature.</div>", unsafe_allow_html=True)
22
+ st.divider()
23
 
24
+ st.markdown("<h4>Extraction</h4>", unsafe_allow_html=True)
25
+ col1, col2 = st.columns(2)
26
+ st.markdown("<h4>Validation</h4>", unsafe_allow_html=True)
27
+ col3, col4 = st.columns(2)
28
 
29
  with col1:
30
  models = (
 
33
  # 'llama-3-sonar-large-32k-chat',
34
  # 'mixtral-8x7b-instruct',
35
  )
36
+ model = st.selectbox('Model selection:', models, key='model')
 
 
37
 
38
  with col2:
39
  tokens = (
 
41
  16000,
42
  24000
43
  )
44
+ chunk_option = st.selectbox('Token amounts per process:', tokens, key='token')
 
 
45
  chunk_overlap = 0
46
 
47
  with col3:
48
  models_val = (
 
49
  'gpt-4-turbo',
50
+ 'gemini-1.5-pro-latest',
51
  'mixtral-8x7b-instruct',
52
  # 'llama-3-sonar-large-32k-chat',
53
  )
54
+ model_val = st.selectbox('Model validator selection:', models_val, key='model_val')
55
+
56
+ with col4:
57
+ api = st.toggle('Validate with API')
58
+
59
+ if api:
60
+ st.warning("""This validation process leverage external application programming interfaces (APIs) from NCBI and EBI to verify information.
61
+ These APIs may have limitations on their usage, so please exercise responsible use of this functionality.
62
+ If you opt to employ API validation and the process takes a long time (more than 1 hour), consider refreshing the page and proceeding without API validation.""", icon="⚠️")
63
 
64
+ st.divider()
65
+ st.markdown("<h4>Process</h4>", unsafe_allow_html=True)
66
+ uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
67
 
68
  if uploaded_files:
69
+ submit = st.button("Get Result", key='submit')
 
70
 
71
+ if uploaded_files and submit:
 
 
72
 
73
+ with st.status("Extraction in progress ...", expanded=True) as status:
74
+ for uploaded_file in stqdm(uploaded_files):
75
+ start_time = datetime.now()
76
+ with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
77
+
78
+ pdf.write(uploaded_file.getbuffer())
79
+ st.markdown(f"Start Extraction process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
80
+
81
+ # Load Documents
82
+ loader = PyPDFLoader(pdf.name)
83
+ pages = loader.load()
84
+
85
+ chunk_size = 120000
86
+ chunk_overlap = 0
87
+ docs = pages
88
+
89
+ # Split Documents
90
+ if chunk_option:
91
+ docs = [Document('\n'.join([page.page_content for page in pages]))]
92
+ docs[0].metadata = {'source': pages[0].metadata['source']}
93
+
94
+ chunk_size = chunk_option
95
+ chunk_overlap = int(0.25 * chunk_size)
96
+
97
+ text_splitter = TokenTextSplitter.from_tiktoken_encoder(
98
+ chunk_size=chunk_size, chunk_overlap=chunk_overlap
99
+ )
100
+ chunks = text_splitter.split_documents(docs)
101
+
102
+ # Start extraction process in parallel
103
+ process = Process(model)
104
+ with ThreadPoolExecutor() as executor:
105
+ result_gsd = executor.submit(process.get_entity, (chunks, 'gsd'))
106
+ result_summ = executor.submit(process.get_entity, (chunks, 'summ'))
107
+ result = executor.submit(process.get_entity, (chunks, 'all'))
108
+ result_one = executor.submit(process.get_entity_one, [c.page_content for c in chunks[:1]])
109
+ result_table = executor.submit(process.get_table, pdf.name)
110
+
111
+ result_gsd = result_gsd.result()
112
+ result_summ = result_summ.result()
113
+ result = result.result()
114
+ result_one = result_one.result()
115
+ res_gene, res_snp, res_dis = result_table.result()
116
+
117
+ # Combine Result
118
+ result['Genes'] = res_gene + result_gsd['Genes']
119
+ result['SNPs'] = res_snp + result_gsd['SNPs']
120
+ result['Diseases'] = res_dis + result_gsd['Diseases']
121
+ result['Conclusion'] = result_summ
122
+ for k in result_one.keys():
123
+ result[k] = result_one[k]
124
+
125
+ if len(result['Genes']) == 0:
126
+ result['Genes'] = ['']
127
+
128
+ # Adjust Genes, SNPs, Diseases
129
+ num_rows = max(max(len(result['Genes']), len(result['SNPs'])), len(result['Diseases']))
130
+ for k in ['Genes', 'SNPs', 'Diseases']:
131
+ while len(result[k]) < num_rows:
132
+ result[k].append('')
133
+
134
+ # Temporary handling
135
+ result[k] = result[k][:num_rows]
136
+
137
+ # Arrange Column
138
+ result = {key: value if isinstance(value, list) else [value] * num_rows for key, value in result.items()}
139
+ dataframe = pd.DataFrame(result)
140
+ dataframe = dataframe[['Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name', 'Publication Year', 'Population', 'Sample Size', 'Study Methodology', 'Study Level', 'Conclusion']]
141
+ dataframe = dataframe[dataframe['Genes'].astype(bool)].reset_index(drop=True)
142
+ dataframe.reset_index(drop=True, inplace=True)
143
+
144
+ # Validate Result
145
+ st.markdown(f"Start Validation process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
146
+ validation = Validation(model_val)
147
+ df, df_no_llm, df_clean = validation.validate(dataframe, api)
148
+ df.drop_duplicates(['Genes', 'SNPs'], inplace=True)
149
+ st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
150
+
151
+ st.dataframe(df)
152
+ with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
153
+ if api:
154
  df.to_excel(writer, sheet_name='Result Cleaned API LLM')
155
  df_no_llm.to_excel(writer, sheet_name='Result Cleaned API')
156
+ else:
157
+ df.to_excel(writer, sheet_name='Result Cleaned LLM')
158
+ df_clean.to_excel(writer, sheet_name='Result Cleaned')
159
+ dataframe.to_excel(writer, sheet_name='Original')
160
+ writer.close()
161
+
162
+ st.download_button(
163
+ label="Save Result",
164
+ data=buffer,
165
+ file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx",
166
+ mime='application/vnd.ms-excel'
167
+ )
resources/experiment.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [
8
  {
@@ -38,23 +38,23 @@
38
  },
39
  {
40
  "cell_type": "code",
41
- "execution_count": 50,
42
  "metadata": {},
43
  "outputs": [],
44
  "source": [
45
- "image = Image('../NutriGenMe-Testing/monogenic-1.png')"
46
  ]
47
  },
48
  {
49
  "cell_type": "code",
50
- "execution_count": 51,
51
  "metadata": {},
52
  "outputs": [
53
  {
54
  "name": "stdout",
55
  "output_type": "stream",
56
  "text": [
57
- "2\n"
58
  ]
59
  },
60
  {
@@ -83,108 +83,196 @@
83
  " <th>2</th>\n",
84
  " <th>3</th>\n",
85
  " <th>4</th>\n",
 
 
 
 
 
 
 
 
86
  " </tr>\n",
87
  " </thead>\n",
88
  " <tbody>\n",
89
  " <tr>\n",
90
  " <th>0</th>\n",
91
- " <td>None</td>\n",
92
- " <td>None</td>\n",
93
- " <td>Monogenic Diabetes or</td>\n",
94
- " <td>Associated With Common</td>\n",
95
- " <td>None</td>\n",
 
 
 
 
 
 
 
 
96
  " </tr>\n",
97
  " <tr>\n",
98
  " <th>1</th>\n",
99
- " <td>Gene Name</td>\n",
100
- " <td>Major Function</td>\n",
101
- " <td>Syndromes</td>\n",
102
- " <td>T1D and/or T2D</td>\n",
103
- " <td>Refs.</td>\n",
 
 
 
 
 
 
 
 
104
  " </tr>\n",
105
  " <tr>\n",
106
  " <th>2</th>\n",
107
- " <td>KCNJ11</td>\n",
108
- " <td>Encodes pore-forming inwardly-rectifying</td>\n",
109
- " <td>PNDM (most common cause)</td>\n",
110
- " <td>E23K</td>\n",
111
- " <td>42-46</td>\n",
 
 
 
 
 
 
 
 
112
  " </tr>\n",
113
  " <tr>\n",
114
  " <th>3</th>\n",
115
  " <td>None</td>\n",
116
- " <td>potassium channel subunits (Kir6.2)</td>\n",
117
- " <td>and TNDM, CHI, MODY</td>\n",
118
  " <td>None</td>\n",
119
  " <td>None</td>\n",
 
 
 
 
 
 
 
 
 
 
120
  " </tr>\n",
121
  " <tr>\n",
122
  " <th>4</th>\n",
123
- " <td>ABCC8</td>\n",
124
- " <td>Encodes regulatory SUR1 subunits</td>\n",
125
- " <td>PNDM and TNDM, CHI, MODY</td>\n",
126
- " <td>A1369S, 1273AGA, R1420H</td>\n",
127
- " <td>46,47,52</td>\n",
 
 
 
 
 
 
 
 
128
  " </tr>\n",
129
  " <tr>\n",
130
  " <th>5</th>\n",
131
- " <td>GCK</td>\n",
132
- " <td>A key glucose-phosphoryating enzyme;</td>\n",
133
- " <td>GCK-MODY (MODY2), PNDM,</td>\n",
134
- " <td>rs1799884 (G/A), rs4607517 (A/G),</td>\n",
135
- " <td>75,78,79</td>\n",
 
 
 
 
 
 
 
 
136
  " </tr>\n",
137
  " <tr>\n",
138
  " <th>6</th>\n",
139
- " <td>None</td>\n",
140
- " <td>a glucose sensor</td>\n",
141
- " <td>CHI</td>\n",
142
- " <td>3'UTR SNP, chr7:44184184-G/A</td>\n",
143
- " <td>None</td>\n",
 
 
 
 
 
 
 
 
144
  " </tr>\n",
145
  " <tr>\n",
146
  " <th>7</th>\n",
147
- " <td>SLC2A2</td>\n",
148
- " <td>Encodes GLUT2, a high-capacity facilitative</td>\n",
149
- " <td>FBS</td>\n",
150
- " <td>SNPS rs5393 (AA) and rs5394</td>\n",
151
- " <td>93-100</td>\n",
 
 
 
 
 
 
 
 
152
  " </tr>\n",
153
  " <tr>\n",
154
  " <th>8</th>\n",
155
  " <td>None</td>\n",
156
- " <td>glucose transporter</td>\n",
157
  " <td>None</td>\n",
158
- " <td>(CC) in the promoter region</td>\n",
159
  " <td>None</td>\n",
 
 
 
 
 
 
 
 
 
 
160
  " </tr>\n",
161
  " <tr>\n",
162
  " <th>9</th>\n",
163
  " <td>None</td>\n",
164
  " <td>None</td>\n",
165
  " <td>None</td>\n",
166
- " <td>and SNPS rs5400 (T1101) and</td>\n",
167
  " <td>None</td>\n",
 
 
 
 
 
 
 
 
 
168
  " </tr>\n",
169
  " </tbody>\n",
170
  "</table>\n",
171
  "</div>"
172
  ],
173
  "text/plain": [
174
- " 0 1 2 3 4\n",
175
- "0 None None Monogenic Diabetes or Associated With Common None\n",
176
- "1 Gene Name Major Function Syndromes T1D and/or T2D Refs.\n",
177
- "2 KCNJ11 Encodes pore-forming inwardly-rectifying PNDM (most common cause) E23K 42-46\n",
178
- "3 None potassium channel subunits (Kir6.2) and TNDM, CHI, MODY None None\n",
179
- "4 ABCC8 Encodes regulatory SUR1 subunits PNDM and TNDM, CHI, MODY A1369S, 1273AGA, R1420H 46,47,52\n",
180
- "5 GCK A key glucose-phosphoryating enzyme; GCK-MODY (MODY2), PNDM, rs1799884 (G/A), rs4607517 (A/G), 75,78,79\n",
181
- "6 None a glucose sensor CHI 3'UTR SNP, chr7:44184184-G/A None\n",
182
- "7 SLC2A2 Encodes GLUT2, a high-capacity facilitative FBS SNPS rs5393 (AA) and rs5394 93-100\n",
183
- "8 None glucose transporter None (CC) in the promoter region None\n",
184
- "9 None None None and SNPS rs5400 (T1101) and None"
185
  ]
186
  },
187
- "execution_count": 51,
188
  "metadata": {},
189
  "output_type": "execute_result"
190
  }
@@ -204,30 +292,15 @@
204
  },
205
  {
206
  "cell_type": "code",
207
- "execution_count": 52,
208
  "metadata": {},
209
  "outputs": [
210
  {
211
  "name": "stdout",
212
  "output_type": "stream",
213
  "text": [
214
- " 0 1 2 3 4\n",
215
- "0 Monogenic Diabetes or Associated With Common \n",
216
- "1 Gene Name Major Function Syndromes T1D and/or T2D Refs.\n",
217
- "2 KCNJ11 Encodes pore-forming inwardly-rectifying potas... PNDM (most common cause) and TNDM, CHI, MODY E23K 42-46\n",
218
- "3 ABCC8 Encodes regulatory SUR1 subunits PNDM and TNDM, CHI, MODY A1369S, 1273AGA, R1420H 46,47,52\n",
219
- "4 GCK A key glucose-phosphoryating enzyme; a glucose... GCK-MODY (MODY2), PNDM, CHI rs1799884 (G/A), rs4607517 (A/G), 3'UTR SNP, c... 75,78,79\n",
220
- "5 SLC2A2 Encodes GLUT2, a high-capacity facilitative gl... FBS SNPS rs5393 (AA) and rs5394 (CC) in the promot... 93-100\n",
221
- "6 HNF1A/TCF1 TF; regulator of pancreatic B-cell differentia... HNF1A-MODY (MODY3), most common cause of MODY,... G319S, C.1522G>A (p.E508K) 114, 118, 119\n",
222
- "7 HNF4A Key TF for early fetal development HNF4A MODY (MODY1), CHI SNPS rs2144908, rs3818247 and rs884614, rs4810... 121-124, 274\n",
223
- "8 HNF1B/TCF2 TF; required for the generation of pancreatic ... RCAD syndrome, or MODY5; TNDM and PNDM (rare) SNP rs757210 A, TS4430796 A, and TS7501939 C 141, 144\n",
224
- "9 PDX1 TF; required for pancreas development, B-cell ... PNDM, MODY4 C18R, Q59L, D76N, R197H, G212R, P239Q, InsCCG2... 163-165, 167\n",
225
- "10 PAX4 Islet TF that functions mainly as a transcript... MODY9 R121W, R133W, R37W, rs10229583 G 180, 181, 187\n",
226
- "11 NEUROD1/BETA2 TF; required for the development of the endocr... MODY6 and PNDM R111L and 206 + C; A45T variant at rs1801262 (... 204-208\n",
227
- "12 WFS1 A transmembrane protein; a negative regulator ... WFS1, sometimes referred to as DIDMOAD R456 and H611, SNPS at rs10010131, rs6446482; ... 223-225\n",
228
- "13 PPARG TF; master regulator of adipogenesis, energy b... Monogenic diabetes Monogenic Diabetes Genes ... Pro12Ala variant (rs1801282), SNP at rs4684847... 240-243, 250\n",
229
- "14 INS Predominant glucose-lowering hormone PNDM (2nd most common cause), TNDM, MODY10 Class I alleles of INS VNTR associated with T1... 273, 274, 276-281\n",
230
- "15 GLIS3 TF; regulator of islet development, insulin ge... Neonatal diabetes syndrome associated with con... rs7020673 G associated with T1D; rs7034200 A a... 78, 214, 289, 291, 292, 295-308\n"
231
  ]
232
  }
233
  ],
@@ -254,6 +327,440 @@
254
  "print(dfc)"
255
  ]
256
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  {
258
  "cell_type": "markdown",
259
  "metadata": {},
@@ -263,14 +770,14 @@
263
  },
264
  {
265
  "cell_type": "code",
266
- "execution_count": 12,
267
  "metadata": {},
268
  "outputs": [
269
  {
270
  "name": "stdout",
271
  "output_type": "stream",
272
  "text": [
273
- "118 [('rs5393', 'GLUT2'), ('rs5404', 'SNPS'), ('rs757210', 'HNF1B'), ('rs884614', 'SNPS'), ('rs2144908', 'MODY'), ('rs2144908', 'CHI'), ('rs4684847', 'T1D'), ('rs1884613', 'MODY'), ('rs1884613', 'CHI'), ('rs5393', 'SNPS'), ('rs734312', 'SNPS'), ('rs5394', 'GLUT2'), ('rs757210', 'TS4430796'), ('rs7041847', 'T1D'), ('rs6446482', 'SNPS'), ('rs7020673', 'GLIS3'), ('rs4684847', 'TZDS'), ('rs757210', 'PNDM'), ('rs5400', 'GLUT2'), ('rs7020673', 'T2D'), ('rs3818247', 'HNF4A'), ('rs4810424', 'MODY'), ('rs4810424', 'CHI'), ('rs10229583', 'R133W'), ('rs1801262', 'R111L'), ('rs1801262', 'BETA2'), ('rs10010131', 'SNPS'), ('rs10229583', 'MODY9'), ('rs5400', 'SNPS'), ('rs1801282', 'T1D'), ('rs2144908', 'HNF4A'), ('rs5393', 'FBS'), ('rs757210', 'RCAD'), ('rs10229583', 'R121W'), ('rs1801262', 'INS'), ('rs10010131', 'R456'), ('rs4684847', 'SNP'), ('rs7034200', 'T2D'), ('rs5404', 'T1101'), ('rs4607517', 'MODY'), ('rs1799884', 'MODY'), ('rs1799884', 'CHI'), ('rs4607517', 'PNDM'), ('rs6446482', 'WFS1'), ('rs1799884', 'PNDM'), ('rs5404', 'SLC2A2'), ('rs1801282', 'TZDS'), ('rs5404', 'T198T'), ('rs884614', 'MODY1'), ('rs734312', 'DIDMOAD'), ('rs5394', 'FBS'), ('rs4810424', 'HNF4A'), ('rs7020673', 'T1D'), ('rs757210', 'TCF2'), ('rs5393', 'T1101'), ('rs6446482', 'DIDMOAD'), ('rs1801262', 'A45T'), ('rs5394', 'SNPS'), ('rs5393', 'SLC2A2'), ('rs884614', 'CHI'), ('rs884614', 'MODY'), ('rs5393', 'T198T'), ('rs5400', 'FBS'), ('rs3818247', 'SNPS'), ('rs757210', 'SNP'), ('rs10229583', 'R37W'), ('rs10229583', 'PAX4'), ('rs4684847', 'T2D'), ('rs1801282', 'SNP'), ('rs7034200', 'GLIS3'), ('rs1884613', 'HNF4A'), ('rs4607517', 'GCK'), ('rs757210', 'TS7501939'), ('rs1799884', 'GCK'), ('rs10010131', 'DIDMOAD'), ('rs734312', 'WFS1'), ('rs2144908', 'SNPS'), ('rs5394', 'T198T'), ('rs4684847', 'PPARG'), ('rs734312', 'H611'), ('rs1801262', 'MODY6'), ('rs4607517', 'CHI'), ('rs7041847', 'T2D'), ('rs5404', 'GLUT2'), ('rs5400', 'T1101'), ('rs4607517', 'UTR'), ('rs1799884', 'UTR'), ('rs5400', 'SLC2A2'), ('rs6446482', 'H611'), ('rs5400', 'T198T'), ('rs1799884', 'SNP'), ('rs884614', 'HNF4A'), ('rs4810424', 'SNPS'), ('rs10010131', 'WFS1'), ('rs1801282', 'T2D'), ('rs10010131', 'H611'), ('rs1801262', 'PNDM'), ('rs4607517', 'SNP'), ('rs5394', 'T1101'), ('rs757210', 'TNDM'), ('rs4810424', 'MODY1'), ('rs1801282', 'PPARG'), ('rs7034200', 'T1D'), ('rs7041847', 'GLIS3'), ('rs4607517', 'MODY2'), ('rs5394', 'SLC2A2'), ('rs3818247', 'MODY1'), ('rs1799884', 'MODY2'), ('rs1884613', 'SNPS'), ('rs757210', 'MODY5'), ('rs734312', 'R456'), ('rs3818247', 'MODY'), ('rs3818247', 'CHI'), ('rs6446482', 'R456'), ('rs5404', 'FBS'), ('rs1801262', 'NEUROD1'), ('rs2144908', 'MODY1'), ('rs1884613', 'MODY1')]\n"
274
  ]
275
  }
276
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [
8
  {
 
38
  },
39
  {
40
  "cell_type": "code",
41
+ "execution_count": 24,
42
  "metadata": {},
43
  "outputs": [],
44
  "source": [
45
+ "image = Image('testing/ukmss-2.png')"
46
  ]
47
  },
48
  {
49
  "cell_type": "code",
50
+ "execution_count": 25,
51
  "metadata": {},
52
  "outputs": [
53
  {
54
  "name": "stdout",
55
  "output_type": "stream",
56
  "text": [
57
+ "1\n"
58
  ]
59
  },
60
  {
 
83
  " <th>2</th>\n",
84
  " <th>3</th>\n",
85
  " <th>4</th>\n",
86
+ " <th>5</th>\n",
87
+ " <th>6</th>\n",
88
+ " <th>7</th>\n",
89
+ " <th>8</th>\n",
90
+ " <th>9</th>\n",
91
+ " <th>10</th>\n",
92
+ " <th>11</th>\n",
93
+ " <th>12</th>\n",
94
  " </tr>\n",
95
  " </thead>\n",
96
  " <tbody>\n",
97
  " <tr>\n",
98
  " <th>0</th>\n",
99
+ " <td>SNP</td>\n",
100
+ " <td>Chr.</td>\n",
101
+ " <td>Position\\nB36\\n(bp)</td>\n",
102
+ " <td>Nearby\\ngenea</td>\n",
103
+ " <td>Risk\\nalleleb</td>\n",
104
+ " <td>Gene (transcript)</td>\n",
105
+ " <td>Tissue</td>\n",
106
+ " <td>Effect (s.e.m.)C</td>\n",
107
+ " <td>P value</td>\n",
108
+ " <td>P d\\nadj</td>\n",
109
+ " <td>SNP(2f</td>\n",
110
+ " <td>Pvalue</td>\n",
111
+ " <td>P g\\nadj</td>\n",
112
  " </tr>\n",
113
  " <tr>\n",
114
  " <th>1</th>\n",
115
+ " <td>Novel loci reported in this study</td>\n",
116
+ " <td>Novel loci reported in this study</td>\n",
117
+ " <td>Novel loci reported in this study</td>\n",
118
+ " <td>None</td>\n",
119
+ " <td>None</td>\n",
120
+ " <td>None</td>\n",
121
+ " <td>None</td>\n",
122
+ " <td>None</td>\n",
123
+ " <td>None</td>\n",
124
+ " <td>None</td>\n",
125
+ " <td>None</td>\n",
126
+ " <td>None</td>\n",
127
+ " <td>None</td>\n",
128
  " </tr>\n",
129
  " <tr>\n",
130
  " <th>2</th>\n",
131
+ " <td>rs4457053</td>\n",
132
+ " <td>5</td>\n",
133
+ " <td>76,460,705</td>\n",
134
+ " <td>ZBED3</td>\n",
135
+ " <td>G</td>\n",
136
+ " <td>PDE8B(NM 003719)</td>\n",
137
+ " <td>Adipose</td>\n",
138
+ " <td>0.302 (0.070)</td>\n",
139
+ " <td>2.8 X 10-5</td>\n",
140
+ " <td>0.80</td>\n",
141
+ " <td>rs6864250 (0.18)</td>\n",
142
+ " <td>3.1 X 10-17</td>\n",
143
+ " <td>5.8 X 10-13</td>\n",
144
  " </tr>\n",
145
  " <tr>\n",
146
  " <th>3</th>\n",
147
  " <td>None</td>\n",
 
 
148
  " <td>None</td>\n",
149
  " <td>None</td>\n",
150
+ " <td>None</td>\n",
151
+ " <td>None</td>\n",
152
+ " <td>ZBED3(NM 032367)</td>\n",
153
+ " <td>Adipose</td>\n",
154
+ " <td>0.429 (0.068)</td>\n",
155
+ " <td>1.0: x 10-9</td>\n",
156
+ " <td>0.011</td>\n",
157
+ " <td>rs4704389 (0.20)</td>\n",
158
+ " <td>3.9 x 10-16</td>\n",
159
+ " <td>6.0 X 10-9</td>\n",
160
  " </tr>\n",
161
  " <tr>\n",
162
  " <th>4</th>\n",
163
+ " <td>rs972283</td>\n",
164
+ " <td>7</td>\n",
165
+ " <td>130,117,394</td>\n",
166
+ " <td>KLF14</td>\n",
167
+ " <td>G</td>\n",
168
+ " <td>KLF14(NM_138693)</td>\n",
169
+ " <td>Adipose</td>\n",
170
+ " <td>-0.387 (0.058)</td>\n",
171
+ " <td>8.1 X 10-11</td>\n",
172
+ " <td>0.058</td>\n",
173
+ " <td>rs738134 (0.30)</td>\n",
174
+ " <td>2.2 X 10-12</td>\n",
175
+ " <td>0.0014</td>\n",
176
  " </tr>\n",
177
  " <tr>\n",
178
  " <th>5</th>\n",
179
+ " <td>rs896854</td>\n",
180
+ " <td>8</td>\n",
181
+ " <td>96,029,687</td>\n",
182
+ " <td>TP53INPI</td>\n",
183
+ " <td>T</td>\n",
184
+ " <td>CCNE2 (NM 057749)</td>\n",
185
+ " <td>Blood</td>\n",
186
+ " <td>0.225 (0.053)</td>\n",
187
+ " <td>3.8 X 10-5</td>\n",
188
+ " <td>0.78</td>\n",
189
+ " <td>rs4735339 (0.61)</td>\n",
190
+ " <td>5.8 X 10-7</td>\n",
191
+ " <td>0.0051</td>\n",
192
  " </tr>\n",
193
  " <tr>\n",
194
  " <th>6</th>\n",
195
+ " <td>rs1552224</td>\n",
196
+ " <td>11</td>\n",
197
+ " <td>72,110,746</td>\n",
198
+ " <td>CENTD2</td>\n",
199
+ " <td>A</td>\n",
200
+ " <td>STARDIO(NM 006645)</td>\n",
201
+ " <td>Blood</td>\n",
202
+ " <td>0.337 (0.066)</td>\n",
203
+ " <td>8.6 x 10-7</td>\n",
204
+ " <td>0.026</td>\n",
205
+ " <td>rs519790 (0.04)</td>\n",
206
+ " <td>2.7x 10-24</td>\n",
207
+ " <td>1.6 X 10-1</td>\n",
208
  " </tr>\n",
209
  " <tr>\n",
210
  " <th>7</th>\n",
211
+ " <td>rs7957197</td>\n",
212
+ " <td>12</td>\n",
213
+ " <td>119,945,069</td>\n",
214
+ " <td>HNFIA</td>\n",
215
+ " <td>T</td>\n",
216
+ " <td>ACADS (NM 000017)</td>\n",
217
+ " <td>Adipose</td>\n",
218
+ " <td>0.248 (0.067)</td>\n",
219
+ " <td>3.7 x 10-4</td>\n",
220
+ " <td>0.29</td>\n",
221
+ " <td>rs9204\\n(0.02)</td>\n",
222
+ " <td>1.3x 10-53</td>\n",
223
+ " <td>5.9 X 10-50</td>\n",
224
  " </tr>\n",
225
  " <tr>\n",
226
  " <th>8</th>\n",
227
  " <td>None</td>\n",
 
228
  " <td>None</td>\n",
 
229
  " <td>None</td>\n",
230
+ " <td>None</td>\n",
231
+ " <td>None</td>\n",
232
+ " <td>PSMD9 (NM 002813)</td>\n",
233
+ " <td>Blood</td>\n",
234
+ " <td>0.240 (0.065)</td>\n",
235
+ " <td>3.9 X 10-4</td>\n",
236
+ " <td>0.0088</td>\n",
237
+ " <td>rs3741593\\n(0.00)</td>\n",
238
+ " <td>8.3x 10-8</td>\n",
239
+ " <td>1.7 X 10-6</td>\n",
240
  " </tr>\n",
241
  " <tr>\n",
242
  " <th>9</th>\n",
243
  " <td>None</td>\n",
244
  " <td>None</td>\n",
245
  " <td>None</td>\n",
 
246
  " <td>None</td>\n",
247
+ " <td>None</td>\n",
248
+ " <td>OASL (NM_003733)</td>\n",
249
+ " <td>Adipose</td>\n",
250
+ " <td>0.318 (0.068)</td>\n",
251
+ " <td>6.4 X 10-6</td>\n",
252
+ " <td>0.13</td>\n",
253
+ " <td>rs2259883\\n(0.19)</td>\n",
254
+ " <td>1.1x1 10-7</td>\n",
255
+ " <td>0.0018</td>\n",
256
  " </tr>\n",
257
  " </tbody>\n",
258
  "</table>\n",
259
  "</div>"
260
  ],
261
  "text/plain": [
262
+ " 0 1 2 3 4 5 6 7 8 9 10 11 12\n",
263
+ "0 SNP Chr. Position\\nB36\\n(bp) Nearby\\ngenea Risk\\nalleleb Gene (transcript) Tissue Effect (s.e.m.)C P value P d\\nadj SNP(2f Pvalue P g\\nadj\n",
264
+ "1 Novel loci reported in this study Novel loci reported in this study Novel loci reported in this study None None None None None None None None None None\n",
265
+ "2 rs4457053 5 76,460,705 ZBED3 G PDE8B(NM 003719) Adipose 0.302 (0.070) 2.8 X 10-5 0.80 rs6864250 (0.18) 3.1 X 10-17 5.8 X 10-13\n",
266
+ "3 None None None None None ZBED3(NM 032367) Adipose 0.429 (0.068) 1.0: x 10-9 0.011 rs4704389 (0.20) 3.9 x 10-16 6.0 X 10-9\n",
267
+ "4 rs972283 7 130,117,394 KLF14 G KLF14(NM_138693) Adipose -0.387 (0.058) 8.1 X 10-11 0.058 rs738134 (0.30) 2.2 X 10-12 0.0014\n",
268
+ "5 rs896854 8 96,029,687 TP53INPI T CCNE2 (NM 057749) Blood 0.225 (0.053) 3.8 X 10-5 0.78 rs4735339 (0.61) 5.8 X 10-7 0.0051\n",
269
+ "6 rs1552224 11 72,110,746 CENTD2 A STARDIO(NM 006645) Blood 0.337 (0.066) 8.6 x 10-7 0.026 rs519790 (0.04) 2.7x 10-24 1.6 X 10-1\n",
270
+ "7 rs7957197 12 119,945,069 HNFIA T ACADS (NM 000017) Adipose 0.248 (0.067) 3.7 x 10-4 0.29 rs9204\\n(0.02) 1.3x 10-53 5.9 X 10-50\n",
271
+ "8 None None None None None PSMD9 (NM 002813) Blood 0.240 (0.065) 3.9 X 10-4 0.0088 rs3741593\\n(0.00) 8.3x 10-8 1.7 X 10-6\n",
272
+ "9 None None None None None OASL (NM_003733) Adipose 0.318 (0.068) 6.4 X 10-6 0.13 rs2259883\\n(0.19) 1.1x1 10-7 0.0018"
273
  ]
274
  },
275
+ "execution_count": 25,
276
  "metadata": {},
277
  "output_type": "execute_result"
278
  }
 
292
  },
293
  {
294
  "cell_type": "code",
295
+ "execution_count": 8,
296
  "metadata": {},
297
  "outputs": [
298
  {
299
  "name": "stdout",
300
  "output_type": "stream",
301
  "text": [
302
+ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18\n",
303
+ "0 SNP rs584438 IGFBP4 rs6662509 H6PD rs2362965 R... Gene C T T C A C T A T T A A Effect Other allele allele A C A T G T C G C... OR 0.98 1.00 0.95 1.03 1.09 1.08 1.27 1.09 1... BMI tails 0.52 0.95 0.02 0.33 0.0001 0.0001... P 1.02 1.11 0.97 1.06 1.11 1.125 5.41 X 10-5 ... OR 0.64 0.07 0.25 0.11 0.0006 1.125 5.41 X 10... Obesity class III P 1.01 1.01 0.98 1.01 1.10... OR 0.47 0.83 0.20 0.58 1.10 1.06 X 10-8 1.125... Obesity class II P 1.00 0.99 0.99 1.00 1.04 ... OR 0.75 1.00 0.34 0.99 0.37 0.99 0.82 1.01 9.... Obesity class I P 0.59 0.005 0.35 -0.006 0.21... Overweight OR 0.22 0.27 0.05 0.33 8.80 X 10... Overweight class P 1.18 1.23 1.12 1.15 1.00 ... BMI (continuous)a Effect P 5.22 X 10-12 3.19 ... OR 0.025 9.43 X 10-11 0.031 7.76 X 10-12 0.017... Height tails 0.025 9.43 X 10-11 0.031 7.76 X... Height P Height (continuous)a Effect P\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  ]
305
  }
306
  ],
 
327
  "print(dfc)"
328
  ]
329
  },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": 26,
333
+ "metadata": {},
334
+ "outputs": [
335
+ {
336
+ "data": {
337
+ "text/html": [
338
+ "<div>\n",
339
+ "<style scoped>\n",
340
+ " .dataframe tbody tr th:only-of-type {\n",
341
+ " vertical-align: middle;\n",
342
+ " }\n",
343
+ "\n",
344
+ " .dataframe tbody tr th {\n",
345
+ " vertical-align: top;\n",
346
+ " }\n",
347
+ "\n",
348
+ " .dataframe thead th {\n",
349
+ " text-align: right;\n",
350
+ " }\n",
351
+ "</style>\n",
352
+ "<table border=\"1\" class=\"dataframe\">\n",
353
+ " <thead>\n",
354
+ " <tr style=\"text-align: right;\">\n",
355
+ " <th></th>\n",
356
+ " <th>0</th>\n",
357
+ " <th>1</th>\n",
358
+ " <th>2</th>\n",
359
+ " <th>3</th>\n",
360
+ " <th>4</th>\n",
361
+ " <th>5</th>\n",
362
+ " <th>6</th>\n",
363
+ " <th>7</th>\n",
364
+ " <th>8</th>\n",
365
+ " <th>9</th>\n",
366
+ " <th>10</th>\n",
367
+ " <th>11</th>\n",
368
+ " <th>12</th>\n",
369
+ " </tr>\n",
370
+ " </thead>\n",
371
+ " <tbody>\n",
372
+ " <tr>\n",
373
+ " <th>0</th>\n",
374
+ " <td>SNP</td>\n",
375
+ " <td>Chr.</td>\n",
376
+ " <td>Position\\nB36\\n(bp)</td>\n",
377
+ " <td>Nearby\\ngenea</td>\n",
378
+ " <td>Risk\\nalleleb</td>\n",
379
+ " <td>Gene (transcript)</td>\n",
380
+ " <td>Tissue</td>\n",
381
+ " <td>Effect (s.e.m.)C</td>\n",
382
+ " <td>P value</td>\n",
383
+ " <td>P d\\nadj</td>\n",
384
+ " <td>SNP(2f</td>\n",
385
+ " <td>Pvalue</td>\n",
386
+ " <td>P g\\nadj</td>\n",
387
+ " </tr>\n",
388
+ " <tr>\n",
389
+ " <th>1</th>\n",
390
+ " <td>Novel loci reported in this study</td>\n",
391
+ " <td>Novel loci reported in this study</td>\n",
392
+ " <td>Novel loci reported in this study</td>\n",
393
+ " <td></td>\n",
394
+ " <td></td>\n",
395
+ " <td></td>\n",
396
+ " <td></td>\n",
397
+ " <td></td>\n",
398
+ " <td></td>\n",
399
+ " <td></td>\n",
400
+ " <td></td>\n",
401
+ " <td></td>\n",
402
+ " <td></td>\n",
403
+ " </tr>\n",
404
+ " <tr>\n",
405
+ " <th>2</th>\n",
406
+ " <td>rs4457053</td>\n",
407
+ " <td>5</td>\n",
408
+ " <td>76,460,705</td>\n",
409
+ " <td>ZBED3</td>\n",
410
+ " <td>G</td>\n",
411
+ " <td>PDE8B(NM 003719)</td>\n",
412
+ " <td>Adipose</td>\n",
413
+ " <td>0.302 (0.070)</td>\n",
414
+ " <td>2.8 X 10-5</td>\n",
415
+ " <td>0.80</td>\n",
416
+ " <td>rs6864250 (0.18)</td>\n",
417
+ " <td>3.1 X 10-17</td>\n",
418
+ " <td>5.8 X 10-13</td>\n",
419
+ " </tr>\n",
420
+ " <tr>\n",
421
+ " <th>3</th>\n",
422
+ " <td></td>\n",
423
+ " <td></td>\n",
424
+ " <td></td>\n",
425
+ " <td></td>\n",
426
+ " <td></td>\n",
427
+ " <td>ZBED3(NM 032367)</td>\n",
428
+ " <td>Adipose</td>\n",
429
+ " <td>0.429 (0.068)</td>\n",
430
+ " <td>1.0: x 10-9</td>\n",
431
+ " <td>0.011</td>\n",
432
+ " <td>rs4704389 (0.20)</td>\n",
433
+ " <td>3.9 x 10-16</td>\n",
434
+ " <td>6.0 X 10-9</td>\n",
435
+ " </tr>\n",
436
+ " <tr>\n",
437
+ " <th>4</th>\n",
438
+ " <td>rs972283</td>\n",
439
+ " <td>7</td>\n",
440
+ " <td>130,117,394</td>\n",
441
+ " <td>KLF14</td>\n",
442
+ " <td>G</td>\n",
443
+ " <td>KLF14(NM_138693)</td>\n",
444
+ " <td>Adipose</td>\n",
445
+ " <td>-0.387 (0.058)</td>\n",
446
+ " <td>8.1 X 10-11</td>\n",
447
+ " <td>0.058</td>\n",
448
+ " <td>rs738134 (0.30)</td>\n",
449
+ " <td>2.2 X 10-12</td>\n",
450
+ " <td>0.0014</td>\n",
451
+ " </tr>\n",
452
+ " <tr>\n",
453
+ " <th>5</th>\n",
454
+ " <td>rs896854</td>\n",
455
+ " <td>8</td>\n",
456
+ " <td>96,029,687</td>\n",
457
+ " <td>TP53INPI</td>\n",
458
+ " <td>T</td>\n",
459
+ " <td>CCNE2 (NM 057749)</td>\n",
460
+ " <td>Blood</td>\n",
461
+ " <td>0.225 (0.053)</td>\n",
462
+ " <td>3.8 X 10-5</td>\n",
463
+ " <td>0.78</td>\n",
464
+ " <td>rs4735339 (0.61)</td>\n",
465
+ " <td>5.8 X 10-7</td>\n",
466
+ " <td>0.0051</td>\n",
467
+ " </tr>\n",
468
+ " <tr>\n",
469
+ " <th>6</th>\n",
470
+ " <td>rs1552224</td>\n",
471
+ " <td>11</td>\n",
472
+ " <td>72,110,746</td>\n",
473
+ " <td>CENTD2</td>\n",
474
+ " <td>A</td>\n",
475
+ " <td>STARDIO(NM 006645)</td>\n",
476
+ " <td>Blood</td>\n",
477
+ " <td>0.337 (0.066)</td>\n",
478
+ " <td>8.6 x 10-7</td>\n",
479
+ " <td>0.026</td>\n",
480
+ " <td>rs519790 (0.04)</td>\n",
481
+ " <td>2.7x 10-24</td>\n",
482
+ " <td>1.6 X 10-1</td>\n",
483
+ " </tr>\n",
484
+ " <tr>\n",
485
+ " <th>7</th>\n",
486
+ " <td>rs7957197</td>\n",
487
+ " <td>12</td>\n",
488
+ " <td>119,945,069</td>\n",
489
+ " <td>HNFIA</td>\n",
490
+ " <td>T</td>\n",
491
+ " <td>ACADS (NM 000017)</td>\n",
492
+ " <td>Adipose</td>\n",
493
+ " <td>0.248 (0.067)</td>\n",
494
+ " <td>3.7 x 10-4</td>\n",
495
+ " <td>0.29</td>\n",
496
+ " <td>rs9204\\n(0.02)</td>\n",
497
+ " <td>1.3x 10-53</td>\n",
498
+ " <td>5.9 X 10-50</td>\n",
499
+ " </tr>\n",
500
+ " <tr>\n",
501
+ " <th>8</th>\n",
502
+ " <td></td>\n",
503
+ " <td></td>\n",
504
+ " <td></td>\n",
505
+ " <td></td>\n",
506
+ " <td></td>\n",
507
+ " <td>PSMD9 (NM 002813)</td>\n",
508
+ " <td>Blood</td>\n",
509
+ " <td>0.240 (0.065)</td>\n",
510
+ " <td>3.9 X 10-4</td>\n",
511
+ " <td>0.0088</td>\n",
512
+ " <td>rs3741593\\n(0.00)</td>\n",
513
+ " <td>8.3x 10-8</td>\n",
514
+ " <td>1.7 X 10-6</td>\n",
515
+ " </tr>\n",
516
+ " <tr>\n",
517
+ " <th>9</th>\n",
518
+ " <td></td>\n",
519
+ " <td></td>\n",
520
+ " <td></td>\n",
521
+ " <td></td>\n",
522
+ " <td></td>\n",
523
+ " <td>OASL (NM_003733)</td>\n",
524
+ " <td>Adipose</td>\n",
525
+ " <td>0.318 (0.068)</td>\n",
526
+ " <td>6.4 X 10-6</td>\n",
527
+ " <td>0.13</td>\n",
528
+ " <td>rs2259883\\n(0.19)</td>\n",
529
+ " <td>1.1x1 10-7</td>\n",
530
+ " <td>0.0018</td>\n",
531
+ " </tr>\n",
532
+ " <tr>\n",
533
+ " <th>10</th>\n",
534
+ " <td></td>\n",
535
+ " <td></td>\n",
536
+ " <td></td>\n",
537
+ " <td></td>\n",
538
+ " <td></td>\n",
539
+ " <td>OASL (NM_ _003733)</td>\n",
540
+ " <td>Blood</td>\n",
541
+ " <td>0.319 (0.064)</td>\n",
542
+ " <td>1.3 X 10-6</td>\n",
543
+ " <td>0.37</td>\n",
544
+ " <td>rs4556628\\n(0.21)</td>\n",
545
+ " <td>4.4&gt; X 10-22</td>\n",
546
+ " <td>1.4 X 10-16</td>\n",
547
+ " </tr>\n",
548
+ " <tr>\n",
549
+ " <th>11</th>\n",
550
+ " <td></td>\n",
551
+ " <td></td>\n",
552
+ " <td></td>\n",
553
+ " <td></td>\n",
554
+ " <td></td>\n",
555
+ " <td>COQ5(NM_032314)</td>\n",
556
+ " <td>Blood</td>\n",
557
+ " <td>0.248 (0.065)</td>\n",
558
+ " <td>2.1 x1 10-4</td>\n",
559
+ " <td>0.92</td>\n",
560
+ " <td>rs10774561\\n(0.02)</td>\n",
561
+ " <td>8.7x 10-39</td>\n",
562
+ " <td>4.9 X 10 -</td>\n",
563
+ " </tr>\n",
564
+ " <tr>\n",
565
+ " <th>12</th>\n",
566
+ " <td></td>\n",
567
+ " <td></td>\n",
568
+ " <td></td>\n",
569
+ " <td></td>\n",
570
+ " <td></td>\n",
571
+ " <td>UNCI19B(NM 032661)</td>\n",
572
+ " <td>Blood</td>\n",
573
+ " <td>0.254 (0.064)</td>\n",
574
+ " <td>1.4x 10-4</td>\n",
575
+ " <td>0.048</td>\n",
576
+ " <td>rs11065202\\n(0.09)</td>\n",
577
+ " <td>7.8 x 10-12</td>\n",
578
+ " <td>2.3 X 10-9</td>\n",
579
+ " </tr>\n",
580
+ " <tr>\n",
581
+ " <th>13</th>\n",
582
+ " <td></td>\n",
583
+ " <td></td>\n",
584
+ " <td></td>\n",
585
+ " <td></td>\n",
586
+ " <td></td>\n",
587
+ " <td>CAMKK2 (NM 17 72215)</td>\n",
588
+ " <td>Adipose</td>\n",
589
+ " <td>0.497 (0.068)</td>\n",
590
+ " <td>1.2 x 10-12</td>\n",
591
+ " <td>0.18</td>\n",
592
+ " <td>rs11065504\\n(0.08)</td>\n",
593
+ " <td>2.7x 10-117</td>\n",
594
+ " <td>3.8 X 10-98</td>\n",
595
+ " </tr>\n",
596
+ " <tr>\n",
597
+ " <th>14</th>\n",
598
+ " <td></td>\n",
599
+ " <td></td>\n",
600
+ " <td></td>\n",
601
+ " <td></td>\n",
602
+ " <td></td>\n",
603
+ " <td>CAMKK2 (NM_ 1 172215)</td>\n",
604
+ " <td>Blood</td>\n",
605
+ " <td>0.360 (0.063)</td>\n",
606
+ " <td>3.4 X 10-8</td>\n",
607
+ " <td>0.68</td>\n",
608
+ " <td>rs11065504\\n(0.08)</td>\n",
609
+ " <td>7.0 X 10-105</td>\n",
610
+ " <td>5.7 X 10-94</td>\n",
611
+ " </tr>\n",
612
+ " <tr>\n",
613
+ " <th>15</th>\n",
614
+ " <td></td>\n",
615
+ " <td></td>\n",
616
+ " <td></td>\n",
617
+ " <td></td>\n",
618
+ " <td></td>\n",
619
+ " <td>P2RX4(NM 175568)</td>\n",
620
+ " <td>Blood</td>\n",
621
+ " <td>0.312 (0.065)</td>\n",
622
+ " <td>3.4 x 10-6</td>\n",
623
+ " <td>2.0 x 10-6</td>\n",
624
+ " <td>rs25644\\n(0.03)</td>\n",
625
+ " <td>3.4 x 10-17</td>\n",
626
+ " <td>1.9 x 10-17</td>\n",
627
+ " </tr>\n",
628
+ " <tr>\n",
629
+ " <th>16</th>\n",
630
+ " <td>rs8042680</td>\n",
631
+ " <td>15</td>\n",
632
+ " <td>89,322,341</td>\n",
633
+ " <td>PRCI</td>\n",
634
+ " <td>A</td>\n",
635
+ " <td>VPS33B (NM_018668)</td>\n",
636
+ " <td>Blood</td>\n",
637
+ " <td>0.371 (0.057)</td>\n",
638
+ " <td>2.9 x 10-10</td>\n",
639
+ " <td>0.50</td>\n",
640
+ " <td>rs12595616\\n(0.57)</td>\n",
641
+ " <td>2.3 x 10-21</td>\n",
642
+ " <td>4.5 X 10-1</td>\n",
643
+ " </tr>\n",
644
+ " <tr>\n",
645
+ " <th>17</th>\n",
646
+ " <td>Previously reported loci</td>\n",
647
+ " <td>Previously reported loci</td>\n",
648
+ " <td>Previously reported loci</td>\n",
649
+ " <td></td>\n",
650
+ " <td></td>\n",
651
+ " <td></td>\n",
652
+ " <td></td>\n",
653
+ " <td></td>\n",
654
+ " <td></td>\n",
655
+ " <td></td>\n",
656
+ " <td></td>\n",
657
+ " <td></td>\n",
658
+ " <td></td>\n",
659
+ " </tr>\n",
660
+ " <tr>\n",
661
+ " <th>18</th>\n",
662
+ " <td>rs7578326</td>\n",
663
+ " <td>2</td>\n",
664
+ " <td>226,728,897</td>\n",
665
+ " <td>IRSI</td>\n",
666
+ " <td>A</td>\n",
667
+ " <td>IRS/(Contig50189RC)</td>\n",
668
+ " <td>Adipose</td>\n",
669
+ " <td>-0.251 (0.059)</td>\n",
670
+ " <td>3.7 x 10-5</td>\n",
671
+ " <td>0.89</td>\n",
672
+ " <td>rs2943653 (0.93)</td>\n",
673
+ " <td>3.4 X 10-5</td>\n",
674
+ " <td>0.69</td>\n",
675
+ " </tr>\n",
676
+ " <tr>\n",
677
+ " <th>19</th>\n",
678
+ " <td></td>\n",
679
+ " <td></td>\n",
680
+ " <td></td>\n",
681
+ " <td></td>\n",
682
+ " <td></td>\n",
683
+ " <td>IRSI(NM 005544)</td>\n",
684
+ " <td>Adipose</td>\n",
685
+ " <td>0.331 (0.059)</td>\n",
686
+ " <td>5.7 X 10-8</td>\n",
687
+ " <td>0.58</td>\n",
688
+ " <td>rs2176040 (0.74)</td>\n",
689
+ " <td>7.8 X 10-10</td>\n",
690
+ " <td>0.0042</td>\n",
691
+ " </tr>\n",
692
+ " <tr>\n",
693
+ " <th>20</th>\n",
694
+ " <td>rs13081389</td>\n",
695
+ " <td>3</td>\n",
696
+ " <td>12,264,800</td>\n",
697
+ " <td>PPARG</td>\n",
698
+ " <td>A</td>\n",
699
+ " <td>IQSECI (NM 014869)</td>\n",
700
+ " <td>Adipose</td>\n",
701
+ " <td>-0.630(0.131)</td>\n",
702
+ " <td>2.9 x 10-6</td>\n",
703
+ " <td>1.4&gt; x 10-4</td>\n",
704
+ " <td>rs9211\\n(0.01)</td>\n",
705
+ " <td>1.1x 10-96</td>\n",
706
+ " <td>7.4 X 10-94</td>\n",
707
+ " </tr>\n",
708
+ " <tr>\n",
709
+ " <th>21</th>\n",
710
+ " <td>rs6795735</td>\n",
711
+ " <td>3</td>\n",
712
+ " <td>64,680,405</td>\n",
713
+ " <td>ADAMTS9</td>\n",
714
+ " <td>C</td>\n",
715
+ " <td>BC040632(AK022320)</td>\n",
716
+ " <td>Adipose</td>\n",
717
+ " <td>0.229 (0.056)</td>\n",
718
+ " <td>7.6 X 10-5</td>\n",
719
+ " <td>0.28</td>\n",
720
+ " <td>rs4521216\\n(0.02)</td>\n",
721
+ " <td>3.0 X 10-13</td>\n",
722
+ " <td>8.7 x 10-10</td>\n",
723
+ " </tr>\n",
724
+ " </tbody>\n",
725
+ "</table>\n",
726
+ "</div>"
727
+ ],
728
+ "text/plain": [
729
+ " 0 1 2 3 4 5 6 7 8 9 10 11 12\n",
730
+ "0 SNP Chr. Position\\nB36\\n(bp) Nearby\\ngenea Risk\\nalleleb Gene (transcript) Tissue Effect (s.e.m.)C P value P d\\nadj SNP(2f Pvalue P g\\nadj\n",
731
+ "1 Novel loci reported in this study Novel loci reported in this study Novel loci reported in this study \n",
732
+ "2 rs4457053 5 76,460,705 ZBED3 G PDE8B(NM 003719) Adipose 0.302 (0.070) 2.8 X 10-5 0.80 rs6864250 (0.18) 3.1 X 10-17 5.8 X 10-13\n",
733
+ "3 ZBED3(NM 032367) Adipose 0.429 (0.068) 1.0: x 10-9 0.011 rs4704389 (0.20) 3.9 x 10-16 6.0 X 10-9\n",
734
+ "4 rs972283 7 130,117,394 KLF14 G KLF14(NM_138693) Adipose -0.387 (0.058) 8.1 X 10-11 0.058 rs738134 (0.30) 2.2 X 10-12 0.0014\n",
735
+ "5 rs896854 8 96,029,687 TP53INPI T CCNE2 (NM 057749) Blood 0.225 (0.053) 3.8 X 10-5 0.78 rs4735339 (0.61) 5.8 X 10-7 0.0051\n",
736
+ "6 rs1552224 11 72,110,746 CENTD2 A STARDIO(NM 006645) Blood 0.337 (0.066) 8.6 x 10-7 0.026 rs519790 (0.04) 2.7x 10-24 1.6 X 10-1\n",
737
+ "7 rs7957197 12 119,945,069 HNFIA T ACADS (NM 000017) Adipose 0.248 (0.067) 3.7 x 10-4 0.29 rs9204\\n(0.02) 1.3x 10-53 5.9 X 10-50\n",
738
+ "8 PSMD9 (NM 002813) Blood 0.240 (0.065) 3.9 X 10-4 0.0088 rs3741593\\n(0.00) 8.3x 10-8 1.7 X 10-6\n",
739
+ "9 OASL (NM_003733) Adipose 0.318 (0.068) 6.4 X 10-6 0.13 rs2259883\\n(0.19) 1.1x1 10-7 0.0018\n",
740
+ "10 OASL (NM_ _003733) Blood 0.319 (0.064) 1.3 X 10-6 0.37 rs4556628\\n(0.21) 4.4> X 10-22 1.4 X 10-16\n",
741
+ "11 COQ5(NM_032314) Blood 0.248 (0.065) 2.1 x1 10-4 0.92 rs10774561\\n(0.02) 8.7x 10-39 4.9 X 10 -\n",
742
+ "12 UNCI19B(NM 032661) Blood 0.254 (0.064) 1.4x 10-4 0.048 rs11065202\\n(0.09) 7.8 x 10-12 2.3 X 10-9\n",
743
+ "13 CAMKK2 (NM 17 72215) Adipose 0.497 (0.068) 1.2 x 10-12 0.18 rs11065504\\n(0.08) 2.7x 10-117 3.8 X 10-98\n",
744
+ "14 CAMKK2 (NM_ 1 172215) Blood 0.360 (0.063) 3.4 X 10-8 0.68 rs11065504\\n(0.08) 7.0 X 10-105 5.7 X 10-94\n",
745
+ "15 P2RX4(NM 175568) Blood 0.312 (0.065) 3.4 x 10-6 2.0 x 10-6 rs25644\\n(0.03) 3.4 x 10-17 1.9 x 10-17\n",
746
+ "16 rs8042680 15 89,322,341 PRCI A VPS33B (NM_018668) Blood 0.371 (0.057) 2.9 x 10-10 0.50 rs12595616\\n(0.57) 2.3 x 10-21 4.5 X 10-1\n",
747
+ "17 Previously reported loci Previously reported loci Previously reported loci \n",
748
+ "18 rs7578326 2 226,728,897 IRSI A IRS/(Contig50189RC) Adipose -0.251 (0.059) 3.7 x 10-5 0.89 rs2943653 (0.93) 3.4 X 10-5 0.69\n",
749
+ "19 IRSI(NM 005544) Adipose 0.331 (0.059) 5.7 X 10-8 0.58 rs2176040 (0.74) 7.8 X 10-10 0.0042\n",
750
+ "20 rs13081389 3 12,264,800 PPARG A IQSECI (NM 014869) Adipose -0.630(0.131) 2.9 x 10-6 1.4> x 10-4 rs9211\\n(0.01) 1.1x 10-96 7.4 X 10-94\n",
751
+ "21 rs6795735 3 64,680,405 ADAMTS9 C BC040632(AK022320) Adipose 0.229 (0.056) 7.6 X 10-5 0.28 rs4521216\\n(0.02) 3.0 X 10-13 8.7 x 10-10"
752
+ ]
753
+ },
754
+ "execution_count": 26,
755
+ "metadata": {},
756
+ "output_type": "execute_result"
757
+ }
758
+ ],
759
+ "source": [
760
+ "dfc = df.fillna('')\n",
761
+ "dfc"
762
+ ]
763
+ },
764
  {
765
  "cell_type": "markdown",
766
  "metadata": {},
 
770
  },
771
  {
772
  "cell_type": "code",
773
+ "execution_count": 19,
774
  "metadata": {},
775
  "outputs": [
776
  {
777
  "name": "stdout",
778
  "output_type": "stream",
779
  "text": [
780
+ "41 [('rs4607517', 'MODY2'), ('rs5400', 'T1101'), ('rs1799884', 'PNDM'), ('rs4607517', 'MODY'), ('rs5400', 'SNPS'), ('rs5394', 'SNPS'), ('rs2144908', 'MODY'), ('rs7020673', 'GLIS3'), ('rs5393', 'SLC2A2'), ('rs4684847', 'SNP'), ('rs5394', 'GLUT2'), ('rs7034200', 'T1D'), ('rs4607517', 'GCK'), ('rs3818247', 'SNPS'), ('rs2144908', 'CHI'), ('rs2144908', 'MODY1'), ('rs3818247', 'HNF4A'), ('rs6446482', 'DIDMOAD'), ('rs757210', 'MODY5'), ('rs1799884', 'MODY2'), ('rs757210', 'HNF1B'), ('rs5393', 'FBS'), ('rs757210', 'RCAD'), ('rs1799884', 'MODY'), ('rs757210', 'TS4430796'), ('rs5394', 'SLC2A2'), ('rs5404', 'T198T'), ('rs1799884', 'GCK'), ('rs4607517', 'PNDM'), ('rs3818247', 'MODY'), ('rs2144908', 'SNPS'), ('rs10010131', 'DIDMOAD'), ('rs5393', 'SNPS'), ('rs2144908', 'HNF4A'), ('rs5394', 'FBS'), ('rs3818247', 'CHI'), ('rs5393', 'GLUT2'), ('rs757210', 'SNP'), ('rs3818247', 'MODY1'), ('rs757210', 'TCF2'), ('rs1801282', 'PPARG')]\n"
781
  ]
782
  }
783
  ],