arikat committed on
Commit
39406f0
1 Parent(s): fac7a9a

Upload 2 files

Browse files
Files changed (2) hide show
  1. Glydentify.py +501 -0
  2. requirements.txt +329 -0
Glydentify.py ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from IPython.display import clear_output
3
+ import torch
4
+ from transformers import EsmForSequenceClassification, AdamW, AutoTokenizer
5
+ from torch.utils.data import DataLoader, TensorDataset, random_split
6
+ from sklearn.preprocessing import LabelEncoder
7
+ from tqdm import tqdm
8
+ import numpy as np
9
+ import seaborn as sns
10
+ from sklearn.model_selection import train_test_split
11
+ import matplotlib.pyplot as plt
12
+ import pickle
13
+ import torch.nn.functional as F
14
+ import gradio as gr
15
+ import io
16
+ from PIL import Image
17
+ import Bio
18
+ from Bio import SeqIO
19
+ import zipfile
20
+ import os
21
+
22
# ---------------------------------------------------------------------------
# Family classifier: labels, tokenizer, model weights and a start-up smoke run
# ---------------------------------------------------------------------------
# NOTE(security): pickle.load can execute code embedded in the file; only load
# 'family_labels.pkl' from a trusted source.
with open('family_labels.pkl', 'rb') as filefam:
    yfam = pickle.load(filefam)

tokenizerfam = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

# The encoder is re-fitted on the stored labels so that class indices can be
# mapped back to family names with inverse_transform().
label_encoderfam = LabelEncoder()
encoded_labelsfam = label_encoderfam.fit_transform(yfam)
labelsfam = torch.tensor(encoded_labelsfam)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

modelfam = EsmForSequenceClassification.from_pretrained(
    "facebook/esm2_t33_650M_UR50D",
    num_labels=len(label_encoderfam.classes_),
)
modelfam = modelfam.to('cpu')

# The checkpoint location can be overridden through the environment; the
# default is the path the script was originally written against.
_family_weights_path = os.environ.get(
    "GLYDENTIFY_FAMILY_WEIGHTS",
    "/home/aarya/Documents/paper3/model_650M.pth",
)
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only host; the model itself is kept on the CPU above.
modelfam.load_state_dict(torch.load(_family_weights_path, map_location='cpu'))
modelfam.eval()

# Smoke run on one example sequence so that a broken tokenizer/checkpoint
# combination fails at start-up rather than on the first request.
x_testfam = ["""MAEVLRTLAGKPKCHALRPMILFLIMLVLVLFGYGVLSPRSLMPGSLERGFCMAVREPDH
LQRVSLPRMVYPQPKVLTPCRKDVLVVTPWLAPIVWEGTFNIDILNEQFRLQNTTIGLTV
FAIKKYVAFLKLFLETAEKHFMVGHRVHYYVFTDQPAAVPRVTLGTGRQLSVLEVRAYKR
WQDVSMRRMEMISDFCERRFLSEVDYLVCVDVDMEFRDHVGVEILTPLFGTLHPGFYGSS
REAFTYERRPQSQAYIPKDEGDFYYLGGFFGGSVQEVQRLTRACHQAMMVDQANGIEAVW
HDESHLNKYLLRHKPTKVLSPEYLWDQQLLGWPAVLRKLRFTAVPKNHQAVRNP
"""]

encoded_inputfam = tokenizerfam(x_testfam, padding=True, truncation=True, max_length=512, return_tensors="pt")
input_idsfam = encoded_inputfam["input_ids"]
attention_maskfam = encoded_inputfam["attention_mask"]

with torch.no_grad():
    outputfam = modelfam(input_idsfam, attention_mask=attention_maskfam)
    logitsfam = outputfam.logits
    probabilitiesfam = F.softmax(logitsfam, dim=1)
    _, predicted_labelsfam = torch.max(logitsfam, dim=1)

decoded_labelsfam = label_encoderfam.inverse_transform(predicted_labelsfam.tolist())
60
+
61
+
62
+
63
# ---------------------------------------------------------------------------
# Donor classifier: tokenizer, label encoder, model weights and a smoke run
# ---------------------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D")

# NOTE(security): pickle.load can execute code embedded in the file; only load
# 'donorslabels.pkl' from a trusted source.
# The unpickled object is used below through .classes_ and
# .inverse_transform(), i.e. as an already fitted label encoder.
with open('donorslabels.pkl', 'rb') as file:
    label_encoder = pickle.load(file)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = EsmForSequenceClassification.from_pretrained(
    "facebook/esm2_t12_35M_UR50D",
    num_labels=len(label_encoder.classes_),
)
model = model.to('cpu')

# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only host; the model itself is kept on the CPU above.
model.load_state_dict(torch.load("best_model_35M_t12_5v5.pth", map_location='cpu'))
model.eval()

# Smoke run on one example sequence so that a broken tokenizer/checkpoint
# combination fails at start-up rather than on the first request.
x_test = ["""MAEVLRTLAGKPKCHALRPMILFLIMLVLVLFGYGVLSPRSLMPGSLERGFCMAVREPDH
LQRVSLPRMVYPQPKVLTPCRKDVLVVTPWLAPIVWEGTFNIDILNEQFRLQNTTIGLTV
FAIKKYVAFLKLFLETAEKHFMVGHRVHYYVFTDQPAAVPRVTLGTGRQLSVLEVRAYKR
WQDVSMRRMEMISDFCERRFLSEVDYLVCVDVDMEFRDHVGVEILTPLFGTLHPGFYGSS
REAFTYERRPQSQAYIPKDEGDFYYLGGFFGGSVQEVQRLTRACHQAMMVDQANGIEAVW
HDESHLNKYLLRHKPTKVLSPEYLWDQQLLGWPAVLRKLRFTAVPKNHQAVRNP
"""]

encoded_input = tokenizer(x_test, padding=True, truncation=True, max_length=512, return_tensors="pt")
input_ids = encoded_input["input_ids"]
attention_mask = encoded_input["attention_mask"]

with torch.no_grad():
    output = model(input_ids, attention_mask=attention_mask)
    logits = output.logits
    probabilities = F.softmax(logits, dim=1)
    _, predicted_labels = torch.max(logits, dim=1)

decoded_labels = label_encoder.inverse_transform(predicted_labels.tolist())
101
+
102
+
103
def _gt_entry(cazy_name, mechanism, clade, url_family=None):
    """Build one record of ``glycosyltransferase_db``.

    Args:
        cazy_name: Value stored under 'CAZy Name'.
        mechanism: Value stored under 'Mechanism'.
        clade: Value stored under 'Clade' ('' where the table has no value).
        url_family: Family used in the 'More Info' URL when it differs from
            ``cazy_name``; defaults to ``cazy_name``.

    Returns:
        A dict with the six display fields, in display order. Every record
        in the table has an empty 'Alternative Name' and 'Fold' 'A'.
    """
    return {
        'CAZy Name': cazy_name,
        'Alternative Name': '',
        'Fold': 'A',
        'Mechanism': mechanism,
        'Clade': clade,
        'More Info': f'http://www.cazy.org/{url_family or cazy_name}.html',
    }


_INV = 'Inverting'
_RET = 'Retaining'

# Display metadata keyed by the label the family classifier predicts.
# Values are stored without alignment padding so they can be shown or
# compared as-is. An empty clade means the table carries no value for it.
glycosyltransferase_db = {
    "GT31-chsy": _gt_entry('GT31', _INV, '8'),
    "GT2-CesA2": _gt_entry('GT2', _INV, '1'),
    "GT43-arath": _gt_entry('GT43', _INV, 'N/A'),
    "GT8-Met1": _gt_entry('GT8', _RET, '9'),
    "GT32-higher": _gt_entry('GT32', _RET, 'N/A'),
    "GT40": _gt_entry('GT40', _INV, 'N/A'),
    "GT16": _gt_entry('GT16', _INV, '6'),
    "GT27": _gt_entry('GT27', _RET, '5'),
    "GT55": _gt_entry('GT55', _RET, '2'),
    "GT8-Glycogenin": _gt_entry('GT8', _RET, '9'),
    "GT8-1": _gt_entry('GT8', _RET, '9'),
    "GT25": _gt_entry('GT25', _INV, '6'),
    "GT2-DPM_like": _gt_entry('GT2', _INV, '2'),
    "GT31-fringe": _gt_entry('GT31', _INV, '8'),
    "GT2-Bact_puta": _gt_entry('GT2', _INV, 'N/A'),
    "GT84": _gt_entry('GT84', _RET, '1'),
    "GT13": _gt_entry('GT13', _INV, '6'),
    "GT43-cele": _gt_entry('GT43', _INV, 'N/A'),
    # NOTE(review): the CAZy name is GT92 while the key and the URL both say
    # GT2 -- kept exactly as found; confirm which one is intended.
    "GT2-Bact_LPS1": _gt_entry('GT92', _INV, 'N/A', url_family='GT2'),
    "GT2-Bact_Oant": _gt_entry('GT2', _INV, ''),
    "GT67": _gt_entry('GT67', _INV, '8'),
    "GT2-HAS": _gt_entry('GT2', _INV, '1'),
    "GT82": _gt_entry('GT82', _INV, '7'),
    "GT24": _gt_entry('GT24', _RET, '9'),
    "GT31-plant": _gt_entry('GT31', _INV, '8'),
    "GT81-Bact": _gt_entry('GT81', _RET, '2'),
    "GT2-Bact_gt25Me": _gt_entry('GT2', _INV, ''),
    "GT2-B3GntL": _gt_entry('GT2', _INV, '4'),
    "GT49": _gt_entry('GT49', _INV, 'N/A'),
    "GT34": _gt_entry('GT34', _RET, 'N/A'),
    "GT45": _gt_entry('GT45', _RET, 'N/A'),
    "GT32-lower": _gt_entry('GT32', _RET, 'N/A'),
    "GT88": _gt_entry('GT88', _RET, '9'),
    "GT21": _gt_entry('GT21', _INV, '1'),
    "GT2-DPG_synt": _gt_entry('GT2', _INV, '2'),
    "GT43-b3gat2": _gt_entry('GT43', _INV, 'N/A'),
    "GT2-Chitin_synt": _gt_entry('GT2', _INV, '5'),
    "GT8-Bact": _gt_entry('GT8', _RET, 'N/A'),
    "GT8-Met2": _gt_entry('GT8', _RET, 'N/A'),
    "GT2-Bact_Chlor1": _gt_entry('GT2', _INV, ''),
    "GT54": _gt_entry('GT54', _INV, '6'),
    "GT2-Cel_bre3": _gt_entry('GT2', _INV, '1'),
    "GT2-Bact_Rham": _gt_entry('GT2', _INV, 'N/A'),
    "GT6": _gt_entry('GT6', _RET, 'N/A'),
    "GT2-Bact_puta2": _gt_entry('GT2', _INV, ''),
    "GT7-1": _gt_entry('GT7', _INV, '5'),
    "GT2-Csl": _gt_entry('GT2', _INV, '4'),
    "GT2-ExoU": _gt_entry('GT2', _INV, ''),
    "GT2-Csl2": _gt_entry('GT2', _INV, '4'),
    "GT64": _gt_entry('GT64', _RET, 'N/A'),
    "GT2-Bact_Chlor2": _gt_entry('GT2', _INV, ''),
    "GT78": _gt_entry('GT78', _RET, '2'),
    "GT12": _gt_entry('GT12', _INV, 'N/A'),
    "GT31-gnt": _gt_entry('GT31', _INV, '8'),
    "GT2-Bact_CHS": _gt_entry('GT2', _INV, '5'),
    "GT62": _gt_entry('GT62', _RET, '3'),
    "GT8-Met_Pla": _gt_entry('GT8', _RET, 'N/A'),
    "GT15": _gt_entry('GT15', _RET, '8'),
    "GT43-b3gat1": _gt_entry('GT43', _INV, 'N/A'),
    "GT31-b3glt": _gt_entry('GT31', _INV, '8'),
    "GT2-CesA1": _gt_entry('GT2', _INV, '1'),
    "GT60": _gt_entry('GT60', _RET, '5'),
    "GT14": _gt_entry('GT14', _INV, '7'),
    "GT2-Bact_DPM_sy": _gt_entry('GT2', _INV, '2'),
    "GT17": _gt_entry('GT17', _INV, '7'),
    "GT2-Bact_LPS2": _gt_entry('GT2', _INV, '3'),
    "GT77": _gt_entry('GT77', _RET, '9'),
    "GT2-Bact_EpsO": _gt_entry('GT2', _INV, ''),
    "GT43-b3gat3": _gt_entry('GT43', _INV, 'N/A'),
    "GT8-Fun": _gt_entry('GT8', _RET, '9'),
    "GT75": _gt_entry('GT75', _INV, 'N/A'),
    "GT2-Bact_GlfT": _gt_entry('GT2', _INV, 'N/A'),
}
178
+
179
+
180
+
181
+
182
+
183
def get_family_info(family_name):
    """Return the database record for *family_name* as a Markdown string.

    Each non-empty field becomes one ``**Label:** value`` line; lines are
    separated by Markdown hard breaks so they render on separate lines.
    The "More Info" field (also accepted as ``more_info``) is rendered as
    one clickable link per URL, whether it holds a single string or an
    iterable of strings.

    Args:
        family_name: Key into ``glycosyltransferase_db``.

    Returns:
        The formatted Markdown, or an empty string when the family is not
        in the database.
    """
    family_info = glycosyltransferase_db.get(family_name, {})

    rendered_lines = []
    for key, value in family_info.items():
        # Keys that already carry capitalisation ("CAZy Name") are display
        # labels; only snake_case keys need to be prettified.
        label = key.replace("_", " ").title() if key == key.lower() else key

        if key.replace("_", " ").strip().lower() == "more info":
            links = [value] if isinstance(value, str) else list(value)
            links = [str(link).strip() for link in links]
            text = ", ".join(f"[{link}]({link})" for link in links if link)
        else:
            text = str(value).strip()

        if text:
            rendered_lines.append(f"**{label}:** {text}")

    # Two trailing spaces before the newline are a Markdown hard line break.
    return "  \n".join(rendered_lines)
195
+
196
def fig_to_img(fig):
    """Render a matplotlib figure to PNG in memory and open it as a PIL image.

    The figure is saved with a tight bounding box into an in-memory buffer,
    the buffer is rewound, and the result of ``Image.open`` on it is returned.
    """
    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format='png', bbox_inches='tight')
    png_buffer.seek(0)
    return Image.open(png_buffer)
203
+
204
+
205
def process_family_sequence(protein_fasta):
    """Predict the glycosyltransferase family of one protein sequence.

    Args:
        protein_fasta: Raw text holding a bare sequence or a single FASTA
            record (optional '>' header line followed by sequence lines).

    Returns:
        A 4-tuple ``(label, image, None, info)``:

        * on success, ``label`` is the predicted family, ``image`` a PIL bar
          chart of the five most probable families and ``info`` the Markdown
          description of the predicted family (prefixed with a warning when
          the sequence is shorter than 100 residues);
        * on invalid input, ``label`` and ``image`` are ``None`` and ``info``
          carries the error message, so callers can always unpack four
          values.
    """
    # splitlines() + strip() tolerate Windows line endings and stray spaces,
    # which would otherwise be rejected as invalid residues.
    lines = [line.strip() for line in (protein_fasta or '').splitlines()]

    headers = [line for line in lines if line.startswith('>')]
    if len(headers) > 1:
        return None, None, None, "Multiple fasta sequences detected. Please upload a fasta file with multiple sequences, otherwise only include one fasta sequence."

    protein_sequence = ''.join(''.join(line.split()) for line in lines if not line.startswith('>'))

    if not protein_sequence:
        return None, None, None, "No protein sequence provided."

    # Check for invalid characters
    valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")  # the 20 standard amino acids
    if not set(protein_sequence).issubset(valid_characters):
        return None, None, None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids. Does your sequence contain gaps?"

    # Lower-case residues pass validation, but the tokenizer vocabulary is
    # expected to be upper-case, so normalise before tokenizing.
    protein_sequence = protein_sequence.upper()

    # The family model is paired with its own tokenizer (tokenizerfam).
    # Inputs longer than max_length tokens are truncated.
    encoded_input = tokenizerfam([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
    input_idsfam = encoded_input["input_ids"]
    attention_maskfam = encoded_input["attention_mask"]

    with torch.no_grad():
        outputfam = modelfam(input_idsfam, attention_mask=attention_maskfam)
        logitsfam = outputfam.logits
        probabilitiesfam = F.softmax(logitsfam, dim=1)[0].tolist()
        _, predicted_labelsfam = torch.max(logitsfam, dim=1)

    decoded_labelsfam = label_encoderfam.inverse_transform(predicted_labelsfam.tolist())
    family_info = get_family_info(decoded_labelsfam[0])

    # Rank every class by probability and keep the five best for the chart.
    ranked = sorted(zip(label_encoderfam.classes_, probabilitiesfam), key=lambda pair: pair[1], reverse=True)
    labels_top5, probabilities_top5 = zip(*ranked[:5])
    y_posfam = np.arange(len(labels_top5))

    # Draw through the Axes object instead of pyplot's implicit "current
    # figure" so concurrent requests cannot draw into each other's plots.
    figfam, axfam = plt.subplots(figsize=(10, 5))
    try:
        axfam.barh(y_posfam, [prob * 100 for prob in probabilities_top5], align='center', alpha=0.5)
        axfam.set_yticks(y_posfam)
        axfam.set_yticklabels(labels_top5)
        axfam.set_xlabel('Probability (%)')
        axfam.set_title('Top 5 Family Class Probabilities')
        axfam.set_xlim(0, 100)
        img = fig_to_img(figfam)
    finally:
        # Release the figure only after it has been rendered.
        plt.close(figfam)

    if len(protein_sequence) < 100:
        return decoded_labelsfam[0], img, None, f"**Warning:** The sequence is relatively short. Fragmentary and partial sequences may result in incorrect predictions. \n\n {family_info}"

    return decoded_labelsfam[0], img, None, family_info
263
+
264
+
265
def process_single_sequence(protein_fasta):
    """Predict the donor class of one protein sequence.

    Args:
        protein_fasta: Raw text holding a bare sequence or a single FASTA
            record (optional '>' header line followed by sequence lines).

    Returns:
        A 4-tuple ``(label, image, None, info)``:

        * on success, ``label`` is the predicted donor and ``image`` a PIL
          bar chart of the three most probable donors; ``info`` is ``None``,
          or a short-sequence warning when the sequence has fewer than 100
          residues;
        * on invalid input, ``label`` and ``image`` are ``None`` and ``info``
          carries the error message, so callers can always unpack four
          values.
    """
    # splitlines() + strip() tolerate Windows line endings and stray spaces,
    # which would otherwise be rejected as invalid residues.
    lines = [line.strip() for line in (protein_fasta or '').splitlines()]

    headers = [line for line in lines if line.startswith('>')]
    if len(headers) > 1:
        return None, None, None, "Multiple fasta sequences detected. Please upload a fasta file with multiple sequences, otherwise only include one fasta sequence."

    protein_sequence = ''.join(''.join(line.split()) for line in lines if not line.startswith('>'))

    if not protein_sequence:
        return None, None, None, "No protein sequence provided."

    # Check for invalid characters
    valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")  # the 20 standard amino acids
    if not set(protein_sequence).issubset(valid_characters):
        return None, None, None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids. Does your sequence contain gaps?"

    # Lower-case residues pass validation, but the tokenizer vocabulary is
    # expected to be upper-case, so normalise before tokenizing.
    protein_sequence = protein_sequence.upper()

    # Inputs longer than max_length tokens are truncated.
    encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
    input_ids = encoded_input["input_ids"]
    attention_mask = encoded_input["attention_mask"]

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
        logits = output.logits
        dprobabilities = F.softmax(logits, dim=1)[0].tolist()
        _, predicted_labels = torch.max(logits, dim=1)

    decoded_labels = label_encoder.inverse_transform(predicted_labels.tolist())

    # Rank every class by probability and keep the three best for the chart.
    ranked = sorted(zip(label_encoder.classes_, dprobabilities), key=lambda pair: pair[1], reverse=True)
    labels_top3, probabilities_top3 = zip(*ranked[:3])
    y_pos = np.arange(len(labels_top3))

    # Draw through the Axes object instead of pyplot's implicit "current
    # figure" so concurrent requests cannot draw into each other's plots.
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.barh(y_pos, [prob * 100 for prob in probabilities_top3], align='center', alpha=0.5)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(labels_top3)
        ax.set_xlabel('Probability (%)')
        ax.set_title('Top 3 Donor Class Probabilities')
        ax.set_xlim(0, 100)
        img = fig_to_img(fig)
    finally:
        # Release the figure only after it has been rendered.
        plt.close(fig)

    if len(protein_sequence) < 100:
        # The predicted label here is a donor, not a family, so there is no
        # family description to append to the warning.
        return decoded_labels[0], img, None, "**Warning:** The sequence is relatively short. Fragmentary and partial sequences may result in incorrect predictions."

    return decoded_labels[0], img, None, None
322
+
323
def process_sequence_file(protein_file):
    """Run donor prediction for every record of an uploaded FASTA file.

    For record number *k* (1-based) a ``results/result_k.txt`` file is
    written, plus ``results/result_k.png`` with the probability chart when
    the prediction succeeds. The whole ``results`` directory is then packed
    into ``predicted_results.zip``.

    Args:
        protein_file: Uploaded file object; only its ``.name`` attribute
            (a path) is read.

    Returns:
        The path of the zip archive, or the parser's error message as a
        string when the file cannot be parsed.
    """
    import shutil  # local import: only needed here, keeps the block self-contained

    try:
        records = list(SeqIO.parse(protein_file.name, "fasta"))
    except Exception as e:
        # NOTE(review): the caller wires this return value to a file output,
        # so an error string will be treated as a path -- consider raising.
        return str(e)

    # Start from an empty directory: results left over from a previous
    # upload would otherwise be zipped into this one.
    shutil.rmtree('results', ignore_errors=True)
    os.makedirs('results', exist_ok=True)

    valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")

    for idx, record in enumerate(records, start=1):
        protein_sequence = str(record.seq)
        text_path = f'results/result_{idx}.txt'

        if not set(protein_sequence).issubset(valid_characters):
            with open(text_path, 'w') as file:
                file.write("Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids. Does your sequence contain gaps?")
            continue

        label, img, _, info = process_single_sequence(protein_sequence)

        if img is None:
            # No chart means the prediction did not run; record the reason.
            with open(text_path, 'w') as file:
                file.write(str(info) if info else "No prediction could be made for this sequence.")
            continue

        img.save(f'results/result_{idx}.png')
        with open(text_path, 'w') as file:
            # info may be None (no warning); avoid writing the text "None".
            file.write(f'Predicted Donor: {label}\n\n{info or ""}')

    # Create a zip file w/ results -- To Do: Figure out how to improve compression for large files
    with zipfile.ZipFile('predicted_results.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk('results/'):
            for file in sorted(files):
                zipf.write(os.path.join(root, file))

    return 'predicted_results.zip'
357
+
358
+ # Function to mask a residue at a particular position
359
def mask_residue(sequence, position):
    """Return *sequence* with the character at index *position* replaced by 'X'."""
    prefix = sequence[:position]
    suffix = sequence[position + 1:]
    return ''.join((prefix, 'X', suffix))
361
+
362
def generate_heatmap(protein_fasta, group_size=10):
    """Build an occlusion heat map for the donor prediction of one sequence.

    The sequence is split into consecutive windows of ``group_size``
    residues. Each window in turn is replaced by 'X' characters, the donor
    model is re-run, and the absolute change of every class probability
    relative to the unmasked prediction is recorded. The resulting
    (windows x classes) matrix is drawn as a heat map.

    Args:
        protein_fasta: Raw text holding a bare sequence or a single FASTA
            record (optional '>' header line followed by sequence lines).
        group_size: Number of residues masked together; must be >= 1.

    Returns:
        A PIL image of the heat map, or ``None`` when the input is empty,
        contains several FASTA records or contains non-standard residues.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    # splitlines() + strip() tolerate Windows line endings and stray spaces,
    # which would otherwise be rejected as invalid residues.
    lines = [line.strip() for line in (protein_fasta or '').splitlines()]

    headers = [line for line in lines if line.startswith('>')]
    if len(headers) > 1:
        return None

    protein_sequence = ''.join(''.join(line.split()) for line in lines if not line.startswith('>'))

    # Check for invalid characters
    valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")  # the 20 standard amino acids
    if not protein_sequence or not set(protein_sequence).issubset(valid_characters):
        return None

    # Lower-case residues pass validation, but the tokenizer vocabulary is
    # expected to be upper-case, so normalise before tokenizing.
    protein_sequence = protein_sequence.upper()
    sequence_length = len(protein_sequence)

    # Tokenize and predict for original sequence
    encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        original_output = model(encoded_input["input_ids"], attention_mask=encoded_input["attention_mask"])
        original_probabilities = F.softmax(original_output.logits, dim=1).cpu().numpy()[0]

    group_starts = range(0, sequence_length, group_size)
    num_groups = len(group_starts)
    importance_scores = np.zeros((num_groups, len(original_probabilities)))

    # NOTE: inputs are truncated to max_length tokens, so windows that lie
    # beyond the truncation point cannot change the prediction.
    for group_index, start in enumerate(group_starts):
        end = min(start + group_size, sequence_length)
        masked_sequence = protein_sequence[:start] + 'X' * (end - start) + protein_sequence[end:]

        # Tokenize and predict for the masked sequence
        encoded_input = tokenizer([masked_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            masked_output = model(encoded_input["input_ids"], attention_mask=encoded_input["attention_mask"])
            masked_probabilities = F.softmax(masked_output.logits, dim=1).cpu().numpy()[0]

        # Calculate the change in probabilities and store it as the importance score
        importance_scores[group_index, :] = np.abs(original_probabilities - masked_probabilities)

        progress = (group_index + 1) / num_groups * 100
        print(f"Progress: {progress:.2f}%")

    # Row labels are 0-based residue ranges; the last one is clamped to the
    # end of the sequence.
    row_labels = [f"{start}-{min(start + group_size, sequence_length) - 1}" for start in group_starts]

    figmap, ax = plt.subplots(figsize=(20, 20))
    try:
        sns.heatmap(importance_scores, annot=True, cmap="coolwarm", xticklabels=label_encoder.classes_, yticklabels=row_labels, ax=ax)
        ax.set_xlabel("Predicted Labels")
        ax.set_ylabel("Residue Position Groups")
        img = fig_to_img(figmap)
    finally:
        # Figures created through pyplot stay registered until closed.
        plt.close(figmap)

    return img
422
+
423
+
424
def main_function_single(sequence):
    """Run the family prediction, the donor prediction and the heat map for one input.

    Returns the six values in the order
    ``(family_label, family_img, family_info, donor_label, donor_img, figmap)``.
    """
    family_result = process_family_sequence(sequence)
    family_label, family_img, _, family_info = family_result

    donor_result = process_single_sequence(sequence)
    donor_label, donor_img, *_ = donor_result

    figmap = generate_heatmap(sequence)

    return (family_label, family_img, family_info, donor_label, donor_img, figmap)
430
+
431
def main_function_upload(protein_file):
    """Forward an uploaded file to ``process_sequence_file`` and return its result."""
    upload_result = process_sequence_file(protein_file)
    return upload_result
433
+
434
# Output images are created up front so the Predict button can reference
# them before they are placed in the layout with .render() further down.
prediction_imagefam = gr.outputs.Image(type='pil', label="Family prediction graph")
prediction_imagedonor = gr.outputs.Image(type='pil', label="Donor prediction graph")
prediction_explain = gr.outputs.Image(type='pil', label="Donor prediction explanation")


# NOTE(review): the nesting of the layout below was reconstructed from a
# source whose indentation was lost -- confirm against the rendered app.
with gr.Blocks() as app:
    gr.Markdown("# Glydentify")

    with gr.Tab("Single Sequence Prediction"):
        with gr.Row().style(equal_height=True):
            with gr.Column():
                sequence = gr.inputs.Textbox(lines=15, placeholder='Enter Protein Sequence Here...', label="Protein Sequence")
            with gr.Column():
                with gr.Accordion("Example:"):
                    # "\\>" keeps the literal backslash that escapes '>' in
                    # Markdown without relying on an invalid Python escape.
                    gr.Markdown("""
\\>Q9LTZ9|GALS2_ARATH Galactan beta-1,4-galactosyltransferase GALS2
MAKERDQNTKDKNLLICFLWNFSAELKLALMALLVLCTLATLLPFLPSSFSISASELRFC
ISRIAVNSTSVNFTTVVEKPVLDNAVKLTEKPVLDNGVTKQPLTEEKVLNNGVIKRTFTG
YGWAAYNFVLMNAYRGGVNTFAVIGLSSKPLHVYSHPTYRCEWIPLNQSDNRILTDGTKI
LTDWGYGRVYTTVVVNCTFPSNTVINPKNTGGTLLLHATTGDTDRNITDSIPVLTETPNT
VDFALYESNLRRREKYDYLYCGSSLYGNLSPQRIREWIAYHVRFFGERSHFVLHDAGGIT
EEVFEVLKPWIELGRVTVHDIREQERFDGYYHNQFMVVNDCLHRYRFMAKWMFFFDVDEF
IYVPAKSSISSVMVSLEEYSQFTIEQMPMSSQLCYDGDGPARTYRKWGFEKLAYRDVKKV
PRRDRKYAVQPRNVFATGVHMSQHLQGKTYHRAEGKIRYFHYHGSISQRREPCRHLYNGT
RIVHENN
""")
                family_prediction = gr.outputs.Textbox(label="Predicted family")
                donor_prediction = gr.outputs.Textbox(label="Predicted donor")
                info_markdown = gr.Markdown()

        # Predict button
        with gr.Row().style(equal_height=True):
            with gr.Column():
                predict_button = gr.Button("Predict")
                predict_button.click(main_function_single, inputs=sequence,
                                     outputs=[family_prediction, prediction_imagefam, info_markdown,
                                              donor_prediction, prediction_imagedonor, prediction_explain])

        # Family & Donor Section
        with gr.Row().style(equal_height=True):
            with gr.Column():
                with gr.Accordion("Prediction Bar Graphs:"):
                    prediction_imagefam.render()
                    prediction_imagedonor.render()

            # Explain Section
            with gr.Column():
                with gr.Accordion("Donor explanation"):
                    prediction_explain.render()

    with gr.Tab("Multiple Sequence Prediction"):
        with gr.Row().style(equal_height=True):
            with gr.Column():
                protein_file = gr.inputs.File(label="Upload FASTA file")
            with gr.Column():
                result_file = gr.outputs.File(label="Download predictions of uploaded sequences")
        with gr.Row().style(equal_height=True):
            with gr.Column():
                process_button = gr.Button("Process")
                process_button.click(main_function_upload, inputs=protein_file, outputs=[result_file])
            with gr.Column():
                clear = gr.Button("Clear")
                # Reset both the uploaded file and the download slot; a
                # handler without outputs would leave the page unchanged.
                clear.click(lambda: (None, None), inputs=None, outputs=[protein_file, result_file])

app.launch(show_error=True)
500
+
501
+
requirements.txt ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: glydentify
2
+ channels:
3
+ - conda-forge
4
+ - bioconda
5
+ dependencies:
6
+ - _libgcc_mutex=0.1=conda_forge
7
+ - _openmp_mutex=4.5=2_gnu
8
+ - aiofiles=22.1.0=pyhd8ed1ab_0
9
+ - aiohttp=3.7.4.post0=py39h3811e60_1
10
+ - aiosqlite=0.18.0=pyhd8ed1ab_0
11
+ - altair=4.2.2=pyhd8ed1ab_0
12
+ - anyio=3.6.2=pyhd8ed1ab_0
13
+ - aom=3.5.0=h27087fc_0
14
+ - argon2-cffi=21.3.0=pyhd8ed1ab_0
15
+ - argon2-cffi-bindings=21.2.0=py39hb9d737c_3
16
+ - asttokens=2.2.1=pyhd8ed1ab_0
17
+ - async-timeout=3.0.1=py_1000
18
+ - attrs=22.2.0=pyh71513ae_0
19
+ - aws-c-auth=0.7.0=hf8751d9_2
20
+ - aws-c-cal=0.6.0=h93469e0_0
21
+ - aws-c-common=0.8.23=hd590300_0
22
+ - aws-c-compression=0.2.17=h862ab75_1
23
+ - aws-c-event-stream=0.3.1=h9599702_1
24
+ - aws-c-http=0.7.11=hbe98c3e_0
25
+ - aws-c-io=0.13.28=h3870b5a_0
26
+ - aws-c-mqtt=0.8.14=h2e270ba_2
27
+ - aws-c-s3=0.3.13=heb0bb06_2
28
+ - aws-c-sdkutils=0.1.11=h862ab75_1
29
+ - aws-checksums=0.1.16=h862ab75_1
30
+ - aws-crt-cpp=0.20.3=he9c0e7f_4
31
+ - aws-sdk-cpp=1.10.57=hbc2ea52_17
32
+ - babel=2.12.1=pyhd8ed1ab_1
33
+ - backcall=0.2.0=pyh9f0ad1d_0
34
+ - backports=1.0=pyhd8ed1ab_3
35
+ - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
36
+ - beautifulsoup4=4.12.2=pyha770c72_0
37
+ - biopython=1.81=py39h72bdee0_0
38
+ - bleach=6.0.0=pyhd8ed1ab_0
39
+ - brotli=1.0.9=h166bdaf_8
40
+ - brotli-bin=1.0.9=h166bdaf_8
41
+ - brotlipy=0.7.0=py39hb9d737c_1005
42
+ - bzip2=1.0.8=h7f98852_4
43
+ - c-ares=1.19.1=hd590300_0
44
+ - ca-certificates=2023.5.7=hbcca054_0
45
+ - certifi=2023.5.7=pyhd8ed1ab_0
46
+ - cffi=1.15.1=py39he91dace_3
47
+ - chardet=4.0.0=py39hf3d152e_3
48
+ - charset-normalizer=3.1.0=pyhd8ed1ab_0
49
+ - click=8.1.3=unix_pyhd8ed1ab_2
50
+ - colorama=0.4.6=pyhd8ed1ab_0
51
+ - comm=0.1.3=pyhd8ed1ab_0
52
+ - contourpy=1.0.7=py39h4b4f3f3_0
53
+ - cryptography=40.0.2=py39h079d5ae_0
54
+ - cycler=0.11.0=pyhd8ed1ab_0
55
+ - dataclasses=0.8=pyhc8e2a94_3
56
+ - datasets=2.13.1=pyhd8ed1ab_0
57
+ - debugpy=1.6.7=py39h227be39_0
58
+ - decorator=5.1.1=pyhd8ed1ab_0
59
+ - defusedxml=0.7.1=pyhd8ed1ab_0
60
+ - dill=0.3.6=pyhd8ed1ab_1
61
+ - entrypoints=0.4=pyhd8ed1ab_0
62
+ - executing=1.2.0=pyhd8ed1ab_0
63
+ - expat=2.5.0=hcb278e6_1
64
+ - fastapi=0.95.1=pyhd8ed1ab_0
65
+ - ffmpeg=5.1.2=gpl_h8dda1f0_106
66
+ - ffmpy=0.3.0=pyhb6f538c_0
67
+ - filelock=3.12.0=pyhd8ed1ab_0
68
+ - flit-core=3.8.0=pyhd8ed1ab_0
69
+ - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
70
+ - font-ttf-inconsolata=3.000=h77eed37_0
71
+ - font-ttf-source-code-pro=2.038=h77eed37_0
72
+ - font-ttf-ubuntu=0.83=hab24e00_0
73
+ - fontconfig=2.14.2=h14ed4e7_0
74
+ - fonts-conda-ecosystem=1=0
75
+ - fonts-conda-forge=1=0
76
+ - fonttools=4.39.3=py39h72bdee0_0
77
+ - freetype=2.12.1=hca18f0e_1
78
+ - fsspec=2023.4.0=pyh1a96a4e_0
79
+ - gflags=2.2.2=he1b5a44_1004
80
+ - glog=0.6.0=h6f12383_0
81
+ - gmp=6.2.1=h58526e2_0
82
+ - gnutls=3.7.8=hf3e180e_0
83
+ - gradio=3.23.0=pyhd8ed1ab_0
84
+ - h11=0.14.0=pyhd8ed1ab_0
85
+ - h2=4.1.0=pyhd8ed1ab_0
86
+ - hpack=4.0.0=pyh9f0ad1d_0
87
+ - httpcore=0.17.0=pyhd8ed1ab_0
88
+ - httpx=0.24.0=pyhd8ed1ab_1
89
+ - huggingface_hub=0.16.4=pyhd8ed1ab_0
90
+ - hyperframe=6.0.1=pyhd8ed1ab_0
91
+ - icu=72.1=hcb278e6_0
92
+ - idna=3.4=pyhd8ed1ab_0
93
+ - importlib-metadata=6.5.0=pyha770c72_0
94
+ - importlib-resources=5.12.0=pyhd8ed1ab_0
95
+ - importlib_metadata=6.5.0=hd8ed1ab_0
96
+ - importlib_resources=5.12.0=pyhd8ed1ab_0
97
+ - ipykernel=6.22.0=pyh210e3f2_0
98
+ - ipython=8.12.0=pyh41d4057_0
99
+ - ipython_genutils=0.2.0=py_1
100
+ - jedi=0.18.2=pyhd8ed1ab_0
101
+ - jinja2=3.1.2=pyhd8ed1ab_1
102
+ - json5=0.9.5=pyh9f0ad1d_0
103
+ - jsonschema=4.17.3=pyhd8ed1ab_0
104
+ - jupyter_client=8.2.0=pyhd8ed1ab_0
105
+ - jupyter_core=5.3.0=py39hf3d152e_0
106
+ - jupyter_events=0.6.3=pyhd8ed1ab_0
107
+ - jupyter_server=2.5.0=pyhd8ed1ab_0
108
+ - jupyter_server_fileid=0.9.0=pyhd8ed1ab_0
109
+ - jupyter_server_terminals=0.4.4=pyhd8ed1ab_1
110
+ - jupyter_server_ydoc=0.8.0=pyhd8ed1ab_0
111
+ - jupyter_ydoc=0.2.3=pyhd8ed1ab_0
112
+ - jupyterlab=3.6.3=pyhd8ed1ab_0
113
+ - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0
114
+ - jupyterlab_server=2.22.1=pyhd8ed1ab_0
115
+ - keyutils=1.6.1=h166bdaf_0
116
+ - kiwisolver=1.4.4=py39hf939315_1
117
+ - krb5=1.21.1=h659d440_0
118
+ - lame=3.100=h166bdaf_1003
119
+ - lcms2=2.15=haa2dc70_1
120
+ - ld_impl_linux-64=2.40=h41732ed_0
121
+ - lerc=4.0.0=h27087fc_0
122
+ - libabseil=20230125.3=cxx17_h59595ed_0
123
+ - libarrow=12.0.1=h657c46f_5_cpu
124
+ - libblas=3.9.0=16_linux64_openblas
125
+ - libbrotlicommon=1.0.9=h166bdaf_8
126
+ - libbrotlidec=1.0.9=h166bdaf_8
127
+ - libbrotlienc=1.0.9=h166bdaf_8
128
+ - libcblas=3.9.0=16_linux64_openblas
129
+ - libcrc32c=1.1.2=h9c3ff4c_0
130
+ - libcurl=8.2.0=hca28451_0
131
+ - libdeflate=1.18=h0b41bf4_0
132
+ - libdrm=2.4.114=h166bdaf_0
133
+ - libedit=3.1.20191231=he28a2e2_2
134
+ - libev=4.33=h516909a_1
135
+ - libevent=2.1.12=hf998b51_1
136
+ - libexpat=2.5.0=hcb278e6_1
137
+ - libffi=3.4.2=h7f98852_5
138
+ - libgcc-ng=12.2.0=h65d4601_19
139
+ - libgfortran-ng=12.2.0=h69a702a_19
140
+ - libgfortran5=12.2.0=h337968e_19
141
+ - libgomp=12.2.0=h65d4601_19
142
+ - libgoogle-cloud=2.12.0=h840a212_1
143
+ - libgrpc=1.56.2=h3905398_0
144
+ - libiconv=1.17=h166bdaf_0
145
+ - libidn2=2.1.0=h470a237_0
146
+ - libjpeg-turbo=2.1.5.1=h0b41bf4_0
147
+ - liblapack=3.9.0=16_linux64_openblas
148
+ - libnghttp2=1.52.0=h61bc06f_0
149
+ - libnsl=2.0.0=h7f98852_0
150
+ - libnuma=2.0.16=h0b41bf4_1
151
+ - libopenblas=0.3.21=pthreads_h78a6416_3
152
+ - libopus=1.3.1=h7f98852_1
153
+ - libpciaccess=0.17=h166bdaf_0
154
+ - libpng=1.6.39=h753d276_0
155
+ - libprotobuf=4.23.3=hd1fb520_0
156
+ - libsodium=1.0.18=h36c2ea0_1
157
+ - libsqlite=3.40.0=h753d276_0
158
+ - libssh2=1.11.0=h0841786_0
159
+ - libstdcxx-ng=12.2.0=h46fd767_19
160
+ - libtasn1=4.19.0=h166bdaf_0
161
+ - libthrift=0.18.1=h8fd135c_2
162
+ - libtiff=4.5.0=ha587672_6
163
+ - libutf8proc=2.8.0=h166bdaf_0
164
+ - libuuid=2.38.1=h0b41bf4_0
165
+ - libva=2.18.0=h0b41bf4_0
166
+ - libvpx=1.11.0=h9c3ff4c_3
167
+ - libwebp-base=1.3.0=h0b41bf4_0
168
+ - libxcb=1.13=h7f98852_1004
169
+ - libxml2=2.10.4=hfdac1af_0
170
+ - libzlib=1.2.13=h166bdaf_4
171
+ - linkify-it-py=2.0.0=pyhd8ed1ab_0
172
+ - lz4-c=1.9.4=hcb278e6_0
173
+ - markdown-it-py=2.2.0=pyhd8ed1ab_0
174
+ - markupsafe=2.1.2=py39h72bdee0_0
175
+ - matplotlib-base=3.7.1=py39he190548_0
176
+ - matplotlib-inline=0.1.6=pyhd8ed1ab_0
177
+ - mdit-py-plugins=0.3.3=pyhd8ed1ab_0
178
+ - mdurl=0.1.0=pyhd8ed1ab_0
179
+ - mistune=2.0.5=pyhd8ed1ab_0
180
+ - multidict=6.0.4=py39h72bdee0_0
181
+ - multiprocess=0.70.14=py39hb9d737c_3
182
+ - munkres=1.1.4=pyh9f0ad1d_0
183
+ - nbclassic=0.5.5=pyhb4ecaf3_1
184
+ - nbclient=0.7.3=pyhd8ed1ab_0
185
+ - nbconvert=7.3.1=pyhd8ed1ab_0
186
+ - nbconvert-core=7.3.1=pyhd8ed1ab_0
187
+ - nbconvert-pandoc=7.3.1=pyhd8ed1ab_0
188
+ - nbformat=5.8.0=pyhd8ed1ab_0
189
+ - ncurses=6.3=h27087fc_1
190
+ - nest-asyncio=1.5.6=pyhd8ed1ab_0
191
+ - nettle=3.8.1=hc379101_1
192
+ - notebook=6.5.4=pyha770c72_0
193
+ - notebook-shim=0.2.2=pyhd8ed1ab_0
194
+ - numpy=1.24.2=py39h7360e5f_0
195
+ - openh264=2.3.1=hcb278e6_2
196
+ - openjpeg=2.5.0=hfec8fc6_2
197
+ - openssl=3.1.1=hd590300_1
198
+ - orc=1.9.0=h385abfd_1
199
+ - orjson=3.8.10=py39hd8b4457_0
200
+ - p11-kit=0.24.1=hc5aa10d_0
201
+ - packaging=23.1=pyhd8ed1ab_0
202
+ - pandas=2.0.0=py39h2ad29b5_0
203
+ - pandoc=2.19.2=h32600fe_2
204
+ - pandocfilters=1.5.0=pyhd8ed1ab_0
205
+ - parso=0.8.3=pyhd8ed1ab_0
206
+ - patsy=0.5.3=pyhd8ed1ab_0
207
+ - pexpect=4.8.0=pyh1a96a4e_2
208
+ - pickleshare=0.7.5=py_1003
209
+ - pillow=9.5.0=py39h7207d5c_0
210
+ - pip=23.1=pyhd8ed1ab_0
211
+ - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0
212
+ - platformdirs=3.2.0=pyhd8ed1ab_0
213
+ - pooch=1.7.0=pyha770c72_3
214
+ - prometheus_client=0.16.0=pyhd8ed1ab_0
215
+ - prompt-toolkit=3.0.38=pyha770c72_0
216
+ - prompt_toolkit=3.0.38=hd8ed1ab_0
217
+ - psutil=5.9.5=py39h72bdee0_0
218
+ - pthread-stubs=0.4=h36c2ea0_1001
219
+ - ptyprocess=0.7.0=pyhd3deb0d_0
220
+ - pure_eval=0.2.2=pyhd8ed1ab_0
221
+ - pyarrow=12.0.1=py39hfbd5978_5_cpu
222
+ - pycparser=2.21=pyhd8ed1ab_0
223
+ - pydantic=1.10.7=py39h72bdee0_0
224
+ - pydub=0.25.1=pyhd8ed1ab_0
225
+ - pygments=2.15.1=pyhd8ed1ab_0
226
+ - pyopenssl=23.1.1=pyhd8ed1ab_0
227
+ - pyparsing=3.0.9=pyhd8ed1ab_0
228
+ - pyrsistent=0.19.3=py39h72bdee0_0
229
+ - pysocks=1.7.1=pyha2e5f31_6
230
+ - python=3.9.16=h2782a2a_0_cpython
231
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
232
+ - python-fastjsonschema=2.16.3=pyhd8ed1ab_0
233
+ - python-json-logger=2.0.7=pyhd8ed1ab_0
234
+ - python-multipart=0.0.6=pyhd8ed1ab_0
235
+ - python-tzdata=2023.3=pyhd8ed1ab_0
236
+ - python-xxhash=3.2.0=py39h72bdee0_0
237
+ - python_abi=3.9=3_cp39
238
+ - pytz=2023.3=pyhd8ed1ab_0
239
+ - pyyaml=6.0=py39hb9d737c_5
240
+ - pyzmq=25.0.2=py39h0be026e_0
241
+ - rdma-core=28.9=h59595ed_1
242
+ - re2=2023.03.02=h8c504da_0
243
+ - readline=8.2=h8228510_1
244
+ - regex=2023.6.3=py39hd1e30aa_0
245
+ - requests=2.28.2=pyhd8ed1ab_1
246
+ - responses=0.18.0=pyhd8ed1ab_0
247
+ - rfc3339-validator=0.1.4=pyhd8ed1ab_0
248
+ - rfc3986-validator=0.1.1=pyh9f0ad1d_0
249
+ - s2n=1.3.46=h06160fa_0
250
+ - sacremoses=0.0.53=pyhd8ed1ab_0
251
+ - safetensors=0.3.1=py39h9fdd4d6_0
252
+ - scipy=1.10.1=py39he83f1e1_0
253
+ - seaborn=0.12.2=hd8ed1ab_0
254
+ - seaborn-base=0.12.2=pyhd8ed1ab_0
255
+ - semantic_version=2.10.0=pyhd8ed1ab_0
256
+ - send2trash=1.8.0=pyhd8ed1ab_0
257
+ - setuptools=67.6.1=pyhd8ed1ab_0
258
+ - six=1.16.0=pyh6c4a22f_0
259
+ - snappy=1.1.10=h9fff704_0
260
+ - sniffio=1.3.0=pyhd8ed1ab_0
261
+ - soupsieve=2.3.2.post1=pyhd8ed1ab_0
262
+ - stack_data=0.6.2=pyhd8ed1ab_0
263
+ - starlette=0.26.1=pyhd8ed1ab_0
264
+ - statsmodels=0.13.5=py39h2ae25f5_2
265
+ - svt-av1=1.4.1=hcb278e6_0
266
+ - terminado=0.17.1=pyh41d4057_0
267
+ - tinycss2=1.2.1=pyhd8ed1ab_0
268
+ - tk=8.6.12=h27826a3_0
269
+ - tokenizers=0.13.3=py39h585fa2d_0
270
+ - tomli=2.0.1=pyhd8ed1ab_0
271
+ - toolz=0.12.0=pyhd8ed1ab_0
272
+ - tornado=6.3=py39h72bdee0_0
273
+ - tqdm=4.65.0=pyhd8ed1ab_1
274
+ - traitlets=5.9.0=pyhd8ed1ab_0
275
+ - transformers=4.31.0
276
+ - typing-extensions=4.5.0=hd8ed1ab_0
277
+ - typing_extensions=4.5.0=pyha770c72_0
278
+ - tzdata=2023c=h71feb2d_0
279
+ - uc-micro-py=1.0.1=pyhd8ed1ab_0
280
+ - ucx=1.14.1=hf587318_2
281
+ - unicodedata2=15.0.0=py39hb9d737c_0
282
+ - urllib3=1.26.15=pyhd8ed1ab_0
283
+ - uvicorn=0.21.1=py39hf3d152e_0
284
+ - wcwidth=0.2.6=pyhd8ed1ab_0
285
+ - webencodings=0.5.1=py_1
286
+ - websocket-client=1.5.1=pyhd8ed1ab_0
287
+ - websockets=11.0.2=py39h72bdee0_0
288
+ - wheel=0.40.0=pyhd8ed1ab_0
289
+ - wordcloud=1.8.2.2=py39hb9d737c_1
290
+ - x264=1!164.3095=h166bdaf_2
291
+ - x265=3.5=h924138e_3
292
+ - xorg-fixesproto=5.0=h7f98852_1002
293
+ - xorg-kbproto=1.0.7=h7f98852_1002
294
+ - xorg-libx11=1.8.4=h0b41bf4_0
295
+ - xorg-libxau=1.0.9=h7f98852_0
296
+ - xorg-libxdmcp=1.1.3=h7f98852_0
297
+ - xorg-libxext=1.3.4=h0b41bf4_2
298
+ - xorg-libxfixes=5.0.3=h7f98852_1004
299
+ - xorg-xextproto=7.3.0=h0b41bf4_1003
300
+ - xorg-xproto=7.0.31=h7f98852_1007
301
+ - xxhash=0.8.1=h0b41bf4_0
302
+ - xz=5.2.6=h166bdaf_0
303
+ - y-py=0.5.9=py39h50f1755_0
304
+ - yaml=0.2.5=h7f98852_2
305
+ - yarl=1.8.2=py39hb9d737c_0
306
+ - ypy-websocket=0.8.2=pyhd8ed1ab_0
307
+ - zeromq=4.3.4=h9c3ff4c_1
308
+ - zipp=3.15.0=pyhd8ed1ab_0
309
+ - zstd=1.5.2=h3eb15da_6
310
+ - pip:
311
+ - cmake==3.25.0
312
+ - cssselect2==0.7.0
313
+ - glypy==1.0.8
314
+ - hjson==3.1.0
315
+ - joblib==1.3.1
316
+ - lit==15.0.7
317
+ - lxml==4.9.2
318
+ - mpmath==1.2.1
319
+ - networkx==3.0
320
+ - pmw-py3==2.1
321
+ - preppy==4.2.1
322
+ - scikit-learn==1.3.0
323
+ - svglib==1.4.1
324
+ - sympy==1.11.1
325
+ - threadpoolctl==3.2.0
326
+ - torch==2.0.1
327
+ - torchaudio==2.0.2
328
+ - torchvision==0.15.2
329
+ - triton==2.0.0