jinysun committed on
Commit
2ba59d0
1 Parent(s): 36c5570

Upload 13 files

Files changed (8)
  1. .gitattributes +1 -0
  2. .gitignore +1 -1
  3. 15data.h5 +1 -1
  4. RF.py +209 -0
  5. app.py +14 -7
  6. dict.json +1 -0
  7. predict.dat +3 -0
  8. requirements.txt +1 -1
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+predict.dat filter=lfs diff=lfs merge=lfs -text
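This is the pointer rule that `git lfs track "predict.dat"` writes, so the ~6 MB pickled model added below is stored through Git LFS rather than directly in the repository.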
.gitignore CHANGED
@@ -50,7 +50,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
-
+.streamlit/secrets.toml
 # Translations
 *.mo
 *.pot
15data.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2ec80795633fe96e7226a7e63909138e6f4fc37654dcff6831627b1670986497
+oid sha256:28b4c29f1d71c3287dce9f64264a320dbb08227273131dd1135a67ea9d358f53
 size 17610752
RF.py ADDED
@@ -0,0 +1,209 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 4 10:38:59 2023
+
+@author: BM109X32G-10GPU-02
+"""
+
+import json
+import pickle
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from rdkit import Chem
+from scipy import sparse
+from tqdm import tqdm
+
+
+def split_smiles(smiles, kekuleSmiles=True):
+    """Tokenize a SMILES string, keeping two-letter elements (Cl, Br, Se, ...) together."""
+    try:
+        mol = Chem.MolFromSmiles(smiles)
+        smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
+    except Exception:
+        pass  # fall back to the raw input string if RDKit cannot parse it
+    splitted_smiles = []
+    for j, k in enumerate(smiles):
+        if len(smiles) == 1:
+            return [smiles]
+        if j == 0:
+            if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
+                splitted_smiles.append(k + smiles[j + 1])
+            else:
+                splitted_smiles.append(k)
+        elif j < len(smiles) - 1:
+            if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
+                splitted_smiles.append(k + smiles[j + 1])
+            elif k.islower() and smiles[j - 1].isupper() and k != "c":
+                pass  # second letter of a two-letter element, already consumed
+            else:
+                splitted_smiles.append(k)
+        else:  # j == len(smiles) - 1
+            if k.islower() and smiles[j - 1].isupper() and k != "c":
+                pass
+            else:
+                splitted_smiles.append(k)
+    return splitted_smiles
+
+
+def get_maxlen(all_smiles, kekuleSmiles=True):
+    """Return the token length of the longest SMILES in a collection."""
+    maxlen = 0
+    for smi in tqdm(all_smiles):
+        spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
+        if spt is None:
+            continue
+        maxlen = max(maxlen, len(spt))
+    return maxlen
+
+
+def get_dict(all_smiles, save_path, kekuleSmiles=True):
+    """Build the token vocabulary (index 0 is the padding space) and save it as JSON."""
+    words = [' ']
+    for smi in tqdm(all_smiles):
+        spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
+        if spt is None:
+            continue
+        for w in spt:
+            if w not in words:
+                words.append(w)
+    with open(save_path, 'w') as js:
+        json.dump(words, js)
+    return words
+
+
+def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000):
+    """One-hot encode a SMILES string as a sparse (max_len, len(words)) matrix."""
+    coord_j = []
+    coord_k = []
+    spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
+    if spt is None:
+        return None
+    for j, w in enumerate(spt):
+        if j >= max_len:
+            break
+        try:
+            k = words.index(w)
+        except ValueError:
+            continue  # token not in the vocabulary; skip it
+        coord_j.append(j)
+        coord_k.append(k)
+    data = np.repeat(1, len(coord_j))
+    return sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words)))
+
+
+def split_dataset(dataset, ratio):
+    """Split a dataset in two at the given ratio."""
+    # np.random.seed(111)  # fix the seed for shuffle.
+    # np.random.shuffle(dataset)
+    n = int(ratio * len(dataset))
+    return dataset[:n], dataset[n:]
+
+
+def plot_confusion_matrix(cm, classes, savename, title='Confusion Matrix'):
+    """Plot the confusion matrix `cm` with `classes` as axis labels and save it as PNG."""
+    plt.figure(figsize=(12, 8), dpi=100)
+    np.set_printoptions(precision=2)
+
+    ind_array = np.arange(len(classes))
+    x, y = np.meshgrid(ind_array, ind_array)
+    for x_val, y_val in zip(x.flatten(), y.flatten()):
+        c = cm[y_val][x_val]
+        if c > 0.001:
+            plt.text(x_val, y_val, "%0.2f" % (c,), color='red', fontsize=15, va='center', ha='center')
+
+    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.binary)
+    plt.title(title)
+    plt.colorbar()
+    xlocations = np.array(range(len(classes)))
+    plt.xticks(xlocations, classes, rotation=90)
+    plt.yticks(xlocations, classes)
+    plt.ylabel('Actual label')
+    plt.xlabel('Predicted label')
+
+    # offset the ticks so grid lines fall between cells
+    tick_marks = np.array(range(len(classes))) + 0.5
+    plt.gca().set_xticks(tick_marks, minor=True)
+    plt.gca().set_yticks(tick_marks, minor=True)
+    plt.gca().xaxis.set_ticks_position('none')
+    plt.gca().yaxis.set_ticks_position('none')
+    plt.grid(True, which='minor', linestyle='-')
+    plt.gcf().subplots_adjust(bottom=0.15)
+
+    plt.savefig(savename, format='png')
+    plt.show()
+
+
+def main(sm):
+    """Predict PCE for a single SMILES string with the pickled random forest."""
+    with open("dict.json", "r", encoding="utf-8") as f:
+        words = json.load(f)
+
+    inchis = [sm]
+    rts = [0]  # placeholder targets; unused for single-molecule prediction
+
+    smiles, targets = [], []
+    for i, inc in enumerate(tqdm(inchis)):
+        mol = Chem.MolFromSmiles(inc)
+        if mol is None:
+            continue
+        smiles.append(Chem.MolToSmiles(mol))
+        targets.append(rts[i])
+
+    features = []
+    for smi in tqdm(smiles):
+        xi = one_hot_coding(smi, words, max_len=600)
+        if xi is not None:
+            features.append(xi.todense())
+    X_test = np.asarray(features, dtype=np.float32)
+
+    # Load the trained RandomForestRegressor shipped as the LFS file predict.dat.
+    with open("predict.dat", "rb") as f:
+        rf_model = pickle.load(f)
+
+    # Flatten each (600, len(words)) one-hot matrix into a single feature row.
+    Y_predict = rf_model.predict(X_test.reshape(len(X_test), -1))
+    return Y_predict
+
+
+def edit_dataset(drug, non_drug, task):
+    """Balance the two classes and build 90/10 train/test splits."""
+    # np.random.seed(111)  # fix the seed for shuffle.
+    # np.random.shuffle(non_drug)
+    # np.random.shuffle(drug)
+    non_drug = non_drug[0:len(drug)]
+
+    dataset_train_drug, dataset_test_drug = split_dataset(drug, 0.9)
+    dataset_train_no, dataset_test_no = split_dataset(non_drug, 0.9)
+    dataset_train = pd.concat([dataset_train_drug, dataset_train_no], axis=0)
+    dataset_test = pd.concat([dataset_test_drug, dataset_test_no], axis=0)
+    return dataset_train, dataset_test
+
+
+if __name__ == "__main__":
+    x = main("CCCCCCC1=CC=C(C2(C3=CC=C(CCCCCC)C=C3)C3=CC4=C(C=C3C3=C2C=C(/C=C2\SC(=S)N(CC)C2=O)S3)C(C2=CC=C(CCCCCC)C=C2)(C2=CC=C(CCCCCC)C=C2)C2=C4SC(/C=C3\SC(=S)N(CC)C3=O)=C2)C=C1")
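A minimal usage sketch of this module's pipeline (assuming the dict.json and LFS-tracked predict.dat added in this commit sit in the working directory; the test molecule is hypothetical):

import json
import pickle
import numpy as np
from RF import one_hot_coding, split_smiles

smi = "CC1=CC=C(C=C1)C#N"  # hypothetical test molecule

# split_smiles keeps two-letter elements ("Cl", "Br", "Se") as single tokens.
tokens = split_smiles(smi)

# One-hot encode against the committed vocabulary: a sparse (600, 34) matrix.
with open("dict.json", "r", encoding="utf-8") as f:
    words = json.load(f)
x = one_hot_coding(smi, words, max_len=600)

# The pickled RandomForestRegressor expects each matrix flattened into one row.
with open("predict.dat", "rb") as f:
    rf_model = pickle.load(f)
pce = rf_model.predict(np.asarray(x.todense(), dtype=np.float32).reshape(1, -1))
print(pce)  # same value RF.main(smi) returns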
app.py CHANGED
@@ -1,37 +1,44 @@
+# -*- coding: utf-8 -*-
 import streamlit as st
 import pandas as pd
 import rdkit
 import streamlit_ketcher
 from streamlit_ketcher import st_ketcher
 import abcBERT
+import RF
+from streamlit_gsheets import GSheetsConnection

 # Page setup
 st.set_page_config(page_title="DeepAcceptor", page_icon="🔋", layout="wide")
-st.title("DeepAcceptor")
+st.title("🔋 DeepAcceptor")

 # Connect to the Google Sheet
 url1 = r"https://docs.google.com/spreadsheets/d/1YOEIg0nMTSPkAOr8wkqxQRLuUhys3-J0I-KPEpmzPLw/gviz/tq?tqx=out:csv&sheet=accept"
 url = r"https://docs.google.com/spreadsheets/d/1YOEIg0nMTSPkAOr8wkqxQRLuUhys3-J0I-KPEpmzPLw/gviz/tq?tqx=out:csv&sheet=111"
 df1 = pd.read_csv(url1, dtype=str, encoding='utf-8')

-text_search = st.text_input("Search papers or molecules", value="")
+text_search = st.text_input("🔍 Search papers or molecules", value="")
 m1 = df1["name"].str.contains(text_search)
 m2 = df1["reference"].str.contains(text_search)
 df_search = df1[m1 | m2]
 if text_search:
     st.write(df_search)
-    st.download_button("Download edited files as .csv", df_search.to_csv(), "df_search.csv", use_container_width=True)
+    st.download_button("⬇️ Download edited files as .csv", df_search.to_csv(), "df_search.csv", use_container_width=True)
 edited_df = st.data_editor(df1, num_rows="dynamic")
 edited_df.to_csv(url)
 st.download_button(
     "⬇️ Download edited files as .csv", edited_df.to_csv(), "edited_df.csv", use_container_width=True
 )

-molecule = st.text_input("Molecule")
+molecule = st.text_input("📋 Molecule")
 smile_code = st_ketcher(molecule)
-st.markdown(f"Smile code: ``{smile_code}``")
+st.markdown(f"✨ SMILES code: {smile_code}")
+P = RF.main(str(smile_code))
+st.markdown(f"⚡ PCE predicted by RF: {P}")
+
 try:
     pce = abcBERT.main(str(smile_code))
-    st.markdown(f"PCE: ``{pce}``")
+    st.markdown(f"PCE predicted by abcBERT: {pce}")
 except Exception:
-    st.markdown(f"PCE: None ")
+    st.markdown("PCE predicted by abcBERT: Running")
+
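The new GSheetsConnection import is not used yet; `edited_df.to_csv(url)` still writes a local file named after the sheet URL rather than updating the sheet. A sketch of how the connection would typically be wired (assuming a `[connections.gsheets]` block in `.streamlit/secrets.toml`, the file the updated .gitignore now excludes; the worksheet name is taken from url1's `sheet=accept` parameter):

import streamlit as st
from streamlit_gsheets import GSheetsConnection

# Credentials and the spreadsheet URL are read from .streamlit/secrets.toml,
# which the new .gitignore rule keeps out of the repository.
conn = st.connection("gsheets", type=GSheetsConnection)
df = conn.read(worksheet="accept")  # worksheet name assumed from url1 above
st.dataframe(df)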
dict.json ADDED
@@ -0,0 +1 @@
+[" ", "C", "1", "=", "(", "2", "F", ")", "3", "4", "5", "#", "N", "S", "/", "\\", "O", "6", "7", "8", "9", "%", "0", "[", "Se", "]", "Cl", "Br", "B", ".", "P", "I", "@", "H"]
predict.dat ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6503b5a8cf1fe5423d460dc05d3949b4e592a1cc79c3c01be9d9bd172053948
+size 6265331
requirements.txt CHANGED
@@ -6,5 +6,5 @@ pandas
 rdkit
 scikit-learn
 matplotlib
-
+st-gsheets-connection

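st-gsheets-connection is the PyPI package that provides the `streamlit_gsheets` module (GSheetsConnection) now imported in app.py.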