wenkai committed
Commit 4a1f168
1 Parent(s): d376991

Upload 24 files

.gitattributes CHANGED
@@ -35,3 +35,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 assets/FAPM.png filter=lfs diff=lfs merge=lfs -text
 assets/LAVIS_technical_report.pdf filter=lfs diff=lfs merge=lfs -text
+ data/go1.4-basic.obo filter=lfs diff=lfs merge=lfs -text
+ data/swissprot_exp/train_exp_prompt_bp_new.csv filter=lfs diff=lfs merge=lfs -text
+ data/swissprot_exp/train_exp_prompt_cc_new.csv filter=lfs diff=lfs merge=lfs -text
+ data/swissprot_exp/train_exp_prompt_mf_new.csv filter=lfs diff=lfs merge=lfs -text
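(Each added attribute line routes matching files through the Git LFS filter so that only a small pointer is committed; such a line is what `git lfs track` writes, e.g.:

    git lfs track "data/go1.4-basic.obo"
)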
data/emb_esm2_3b/P18281.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91714943ae1d08f860e86cfcd098f3973dc14ca63d88556223223fc9687ac7ec
+ size 901864
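(The pointer above stands in for a 901,864-byte payload. Once the LFS object is pulled, the embedding can presumably be loaded with PyTorch — a minimal sketch; the exact structure of the saved object is an assumption:

    import torch

    # Hypothetical usage: load the per-protein ESM2-3B embedding for P18281.
    emb = torch.load('data/emb_esm2_3b/P18281.pt', map_location='cpu')
    print(type(emb))  # a tensor or a dict, depending on how it was saved
)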
data/evaluate_data/evaluate_cases.py ADDED
@@ -0,0 +1,213 @@
+ import pandas as pd
+ import re
+ import random
+ import Levenshtein
+ import numpy as np
+ import difflib
+ # from torchmetrics.text import BLEUScore
+ import time
+ from multiprocessing import Pool, Queue, Process
+ import matplotlib.pyplot as plt
+ from data.evaluate_data.utils import Ontology
+ # bleu = BLEUScore(n_gram=1)
+
+ def fuzzy_match(texts):
+     text_dict = {}
+     for context in texts:
+         if context not in choices:
+             # txt_dict[txt] = process.extractOne(txt, choices)[0]
+             text_dict[context] = difflib.get_close_matches(context, choices, n=1, cutoff=0.)[0]
+     return text_dict
+
+
+ def get_sim(text, label):
+     # For each gold description, keep the best Levenshtein ratio against any predicted text
+     all_s = []
+     for x in label:
+         s = 0
+         for y in text:
+             temp = Levenshtein.ratio(x, y)
+             if temp > s:
+                 s = temp
+         all_s.append(s)
+     all_s = [round(i, 3) for i in all_s]
+
+     # bs = [bleu(x, [label]) for x in text]
+     return all_s
+
+
+ def txt_map(x, txt_dict):
+     if type(x) == str:
+         x = eval(x)
+     x_ = []
+     for i in x:
+         if i == '':
+             continue
+         if i in txt_dict:
+             x_.append(txt_dict[i])
+         else:
+             x_.append(i)
+     return x_
+
+
+ def go_map(t):
+     if t in GO_dict:
+         return GO_dict[t]
+     else:
+         print(t)
+
+
+ def get_term(df):
+     from collections import Counter
+     cnt = Counter()
+     for i, row in enumerate(df.itertuples()):
+         for term in row.prop_annotations:
+             cnt[term] += 1
+     terms = list(cnt.keys())
+     # remove the three root terms
+     for top_term in ['GO:0005575', 'GO:0003674', 'GO:0008150']:
+         if top_term in terms:
+             terms.remove(top_term)
+     terms_df = pd.DataFrame({'gos': terms})
+     terms_df.to_pickle(f'/cluster/home/wenkai/deepgozero/data/blip2/terms.pkl')
+
+
+ if __name__ == "__main__":
+     go = Ontology(f'/cluster/home/wenkai/deepgozero/data/data/go.obo', with_rels=True)
+     go_des = pd.read_csv('/cluster/home/wenkai/LAVIS/data/go_descriptions_new.txt', sep='|', header=None)
+     go_des.columns = ['GO', 'function']
+     go_des = go_des[go_des['function'].notnull()]
+     go_des['function'] = go_des['function'].apply(lambda x: x.lower().strip())
+     go_des['GO'] = go_des['GO'].apply(lambda x: re.sub('_', ':', x))
+     GO_dict = dict(zip(go_des['function'], go_des['GO']))
+
+     data = pd.read_csv('/cluster/home/wenkai/LAVIS/output/output_case.txt', sep='|', header=None)
+     data.columns = ['protein', 'pred', 'label']
+     data['label'] = data['label'].apply(lambda x: x.lower())
+     data['pred'] = data['pred'].apply(lambda x: re.sub('</s>', '', x))
+
+     data['label_list'] = data['label'].apply(lambda x: [i.strip() for i in x.split(';')])
+     data['pred_list'] = data['pred'].apply(lambda x: [i.strip() for i in x.split(';')])
+
+     test = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/test.csv', sep='|')
+     test = test.drop_duplicates()
+     test['function'] = test['function'].apply(lambda x: x.lower().strip())
+     test['function'] = test['function'].apply(lambda x: [i.strip() for i in x.split(';')])
+     test['GO_label'] = test['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
+
+     test_dict = dict()
+     for x, y in zip(test['function'], test['GO_label']):
+         temp = dict(zip(x, y))
+         test_dict.update(temp)
+     GO_dict.update(test_dict)
+
+     choices = list(test_dict.keys())
+
+     ### If a predicted text is not among the GO label terms, treat it as the most similar GO label
+     '''
+     print("Finding the GO label most similar to each predicted text ......")
+     t0 = time.time()
+     txt_dict = {}
+
+     all_txt = []
+     for txt in data['pred_list']:
+         if type(txt) == str:
+             all_txt.extend(eval(txt))
+         else:
+             all_txt.extend(txt)
+     all_txt = list(set(all_txt))
+
+     n = len(all_txt)
+     thread = 10
+     size = int(n/thread)
+     inds = list(range(0, n, size))
+     inds.append(n)
+     all_txt_sep = [all_txt[i: min(i+size, n)] for i in inds[:-1]]
+
+     with Pool(processes=thread) as pool:
+         result = pool.map(fuzzy_match, all_txt_sep)
+         pool.close()
+         pool.join()
+     for d in result:
+         txt_dict.update(d)
+
+     # for txt in all_txt[:10]:
+     #     fuzzy_match(txt)
+
+     data['pred_list'] = data['pred_list'].apply(lambda x: txt_map(x, txt_dict))
+     data['pred_list'] = data['pred_list'].apply(lambda x: list(set(x)))
+     print("fuzzy matching time: {}".format(time.time() - t0))
+
+     print("calculating f1 score ......")
+     data['label_list_go'] = data['label_list'].apply(lambda x: [go_map(i) for i in x])
+     data['pred_list_go'] = data['pred_list'].apply(lambda x: [go_map(i) for i in x])
+     '''
+
+     # Prepare the case test data: GO labels predicted by BLIP-2 are the features;
+     # the labels with their ancestors added are the prediction target Y.
+     prepare_ancestors = True
+     if prepare_ancestors:
+         print("Preparing the data with ancestors added ......")
+         def prop(df):
+             prop_annotations = []
+             for i, row in df.iterrows():
+                 # Propagate annotations
+                 annot_set = set()
+                 annots = row['GO_label']
+                 for go_id in annots:
+                     annot_set |= go.get_anchestors(go_id)
+                 annots = list(annot_set)
+                 prop_annotations.append(annots)
+             df['prop_annotations'] = prop_annotations
+             return df
+
+         def pred_text_to_go(df):
+             df['pred'] = df['pred'].apply(lambda x: re.sub('</s>', '', x))
+
+             df['pred_list'] = df['pred'].apply(lambda x: [i.strip() for i in x.split(';')])
+             ### If a predicted text is not among the GO label terms, treat it as the most similar GO label
+             t0 = time.time()
+             txt_dict = {}
+
+             all_txt = []
+             for txt in df['pred_list']:
+                 if type(txt) == str:
+                     all_txt.extend(eval(txt))
+                 else:
+                     all_txt.extend(txt)
+
+             all_txt = list(set(all_txt))
+             if '' in all_txt:
+                 all_txt.remove('')
+
+             n = len(all_txt)
+             thread = 10
+             size = int(n / thread)
+             inds = list(range(0, n, size))
+             inds.append(n)
+             all_txt_sep = [all_txt[i: min(i + size, n)] for i in inds[:-1]]
+
+             with Pool(processes=thread) as pool:
+                 result = pool.map(fuzzy_match, all_txt_sep)
+                 pool.close()
+                 pool.join()
+             for d in result:
+                 txt_dict.update(d)
+
+             # for txt in all_txt[:10]:
+             #     fuzzy_match(txt)
+
+             df['pred_list'] = df['pred_list'].apply(lambda x: txt_map(x, txt_dict))
+             df['pred_list'] = df['pred_list'].apply(lambda x: list(set(x)))
+             print("fuzzy matching time: {}".format(time.time() - t0))
+
+             df['pred_list_go'] = df['pred_list'].apply(lambda x: [go_map(i) for i in x])
+             return df
+
+         test_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/output/output_case.txt', sep='|', header=None)
+         test_pred.columns = ['protein', 'pred', 'GO_label']
+         test_pred['GO_label'] = test_pred['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
+         test_pred = prop(test_pred)
+         test_pred = pred_text_to_go(test_pred)
+
+         for cat in ['mf', 'bp', 'cc']:
+             test_pred.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/{}/test_case.pkl'.format(cat))
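(The script leans on difflib.get_close_matches to snap free-text predictions onto known GO descriptions; with cutoff=0. every prediction is forced onto some label. A minimal sketch of that behaviour with hypothetical choices:

    import difflib

    choices = ['protein binding', 'atp binding', 'dna binding']
    # cutoff=0. accepts even distant matches, exactly as in fuzzy_match above
    print(difflib.get_close_matches('atp bindin', choices, n=1, cutoff=0.))
    # -> ['atp binding']
)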
data/evaluate_data/evaluate_pretrain.py ADDED
@@ -0,0 +1,282 @@
+ import pandas as pd
+ import re
+ import random
+ import Levenshtein
+ import numpy as np
+ import difflib
+ # from torchmetrics.text import BLEUScore
+ import time
+ from multiprocessing import Pool, Queue, Process
+ import matplotlib.pyplot as plt
+ from data.evaluate_data.utils import Ontology
+ # bleu = BLEUScore(n_gram=1)
+
+ def fuzzy_match(texts):
+     text_dict = {}
+     for context in texts:
+         if context not in choices:
+             # txt_dict[txt] = process.extractOne(txt, choices)[0]
+             sim_list = difflib.get_close_matches(context, choices, n=1, cutoff=0.93)
+             if len(sim_list) > 0:
+                 text_dict[context] = sim_list[0]
+             else:
+                 text_dict[context] = ''
+     return text_dict
+
+
+ def get_sim(text, label):
+     # For each gold description, keep the best Levenshtein ratio against any predicted text
+     all_s = []
+     for x in label:
+         s = 0
+         for y in text:
+             temp = Levenshtein.ratio(x, y)
+             if temp > s:
+                 s = temp
+         all_s.append(s)
+     all_s = [round(i, 3) for i in all_s]
+
+     # bs = [bleu(x, [label]) for x in text]
+     return all_s
+
+
+ def txt_map(x, txt_dict):
+     if type(x) == str:
+         x = eval(x)
+     x_ = []
+     for i in x:
+         if i == '':
+             continue
+         if i in txt_dict:
+             x_.append(txt_dict[i])
+         else:
+             x_.append(i)
+     return x_
+
+
+ def go_map(t):
+     if t in GO_dict:
+         return GO_dict[t]
+     else:
+         pass
+         #print(t)
+
+
+ def get_term(df):
+     from collections import Counter
+     cnt = Counter()
+     for i, row in enumerate(df.itertuples()):
+         for term in row.prop_annotations:
+             cnt[term] += 1
+     terms = list(cnt.keys())
+     # remove the three root terms
+     for top_term in ['GO:0005575', 'GO:0003674', 'GO:0008150']:
+         if top_term in terms:
+             terms.remove(top_term)
+     terms_df = pd.DataFrame({'gos': terms})
+     terms_df.to_pickle(f'/cluster/home/wenkai/deepgozero/data/blip2/terms.pkl')
+
+
+ if __name__ == "__main__":
+     go = Ontology(f'/cluster/home/wenkai/deepgozero/data/data/go.obo', with_rels=True)
+     go_des = pd.read_csv('/cluster/home/wenkai/LAVIS/data/go_descriptions_new.txt', sep='|', header=None)
+     go_des.columns = ['GO', 'function']
+     go_des = go_des[go_des['function'].notnull()]
+     go_des['function'] = go_des['function'].apply(lambda x: x.lower().strip())
+     go_des['GO'] = go_des['GO'].apply(lambda x: re.sub('_', ':', x))
+     GO_dict = dict(zip(go_des['function'], go_des['GO']))
+
+     data = pd.read_csv('/cluster/home/wenkai/LAVIS/output/output_go_train.txt', sep='|', header=None, on_bad_lines='skip')
+     data.columns = ['name', 'pred', 'label']
+     #data['label'] = data['label'].apply(lambda x: x.lower())
+     data['pred'] = data['pred'].apply(lambda x: re.sub('</s>', '', x))
+
+     #data['label_list'] = data['label'].apply(lambda x: [i.strip() for i in x.split(';')])
+     data['pred_list'] = data['pred'].apply(lambda x: list(set([i.strip() for i in x.split(';')])))
+
+     #train = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/train_exp.csv', sep='|')
+     test = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/train_exp.csv', sep='|')
+     test = test.drop_duplicates()
+     test['function'] = test['function'].apply(lambda x: x.lower().strip())
+     test['function'] = test['function'].apply(lambda x: [i.strip() for i in x.split(';')])
+     test['GO_label'] = test['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
+
+     data = pd.merge(data, test[['name', 'function']], on='name', how='left')
+     data['label_list'] = data['function']
+
+     test_dict = dict()
+     for x, y in zip(test['function'], test['GO_label']):
+         temp = dict(zip(x, y))
+         test_dict.update(temp)
+     GO_dict.update(test_dict)
+
+     choices = list(test_dict.keys())
+
+     ### If a predicted text is not among the GO label terms, treat it as the most similar GO label
+     print("Finding the GO label most similar to each predicted text ......")
+     t0 = time.time()
+     txt_dict = {}
+
+     all_txt = []
+     for txt in data['pred_list']:
+         if type(txt) == str:
+             all_txt.extend(eval(txt))
+         else:
+             all_txt.extend(txt)
+     all_txt = list(set(all_txt))
+
+     n = len(all_txt)
+     thread = 40
+     size = int(n/thread)
+     inds = list(range(0, n, size))
+     inds.append(n)
+     all_txt_sep = [all_txt[i: min(i+size, n)] for i in inds[:-1]]
+
+     with Pool(processes=thread) as pool:
+         result = pool.map(fuzzy_match, all_txt_sep)
+         pool.close()
+         pool.join()
+     for d in result:
+         txt_dict.update(d)
+
+     # for txt in all_txt[:10]:
+     #     fuzzy_match(txt)
+
+     data['pred_list'] = data['pred_list'].apply(lambda x: txt_map(x, txt_dict))
+     data['pred_list'] = data['pred_list'].apply(lambda x: list(set(x)))
+     print("fuzzy matching time: {}".format(time.time() - t0))
+
+     print("calculating f1 score ......")
+     data['label_list_go'] = data['label_list'].apply(lambda x: [go_map(i) for i in x])
+     data['pred_list_go'] = data['pred_list'].apply(lambda x: [go_map(i) for i in x])
+
+
+     labels = []
+     pred_labels = []
+     for l in data['label_list_go']:
+         if type(l) == str:
+             l = eval(l)
+         labels.extend(l)
+
+     label_count = {}
+     for x in labels:
+         if x not in label_count:
+             label_count[x] = 1
+         else:
+             label_count[x] += 1
+
+     labels = list(set(labels))
+     total = len(labels)
+     recalls = []
+     precisions = []
+     tp_dict, fp_dict, fn_dict = dict(zip(labels, [0]*len(labels))), dict(zip(labels, [0]*len(labels))), dict(zip(labels, [0]*len(labels)))
+     for preds, label in zip(data['pred_list_go'], data['label_list_go']):
+         if type(label) == str:
+             label = eval(label)
+         if type(preds) == str:
+             preds = eval(preds)  # the original assigned this to an unused name `txts`
+         ll = len(label)
+         for t in label:
+             # supgo = go.get_anchestors(t)
+             # if supgo.intersection(set(preds)):
+             if t in preds:
+                 tp_dict[t] += 1
+             else:
+                 fn_dict[t] += 1
+         for p in preds:
+             # supgo = go.get_anchestors(p)
+             # if not supgo.intersection(set(label)):
+             if p not in label:
+                 if p in fp_dict:
+                     fp_dict[p] += 1
+                 else:
+                     fp_dict[p] = 1
+         pred_labels.extend(preds)
+     p_total = len(set(pred_labels))
+     recall, pr = 0., 0.
+     for x in labels:
+         recall += tp_dict[x] / (1.0 * (tp_dict[x] + fn_dict[x] + 1e-8))
+         pr += tp_dict[x] / (1.0 * (tp_dict[x] + fp_dict[x] + 1e-8))
+     r = recall / total
+     p = pr / p_total
+     f1 = 2 * p * r / (p + r)
+
+     print("preds not in labels: {}".format(len(list(fp_dict.keys())) - total))
+     print("recall: {}; precision: {}; f1 score: {}".format(r, p, f1))
+
+
+     # Prepare the data: GO labels predicted by BLIP-2 are the features;
+     # the labels with their ancestors added are the prediction target Y.
+     prepare_ancestors = False
+     if prepare_ancestors:
+         print("Preparing the data with ancestors added ......")
+         def prop(df):
+             prop_annotations = []
+             for i, row in df.iterrows():
+                 # Propagate annotations
+                 annot_set = set()
+                 annots = row['GO_label']
+                 for go_id in annots:
+                     annot_set |= go.get_anchestors(go_id)
+                 annots = list(annot_set)
+                 prop_annotations.append(annots)
+             df['prop_annotations'] = prop_annotations
+             return df
+
+         def remove_nan(x):
+             if '' in x:
+                 x.remove('')
+             return x
+
+         def pred_text_to_go(df):
+             df['pred'] = df['pred'].apply(lambda x: re.sub('</s>', '', x))
+
+             df['pred_list'] = df['pred'].apply(lambda x: list(set([i.strip() for i in x.split(';')])))
+             ### If a predicted text is not among the GO label terms, treat it as the most similar GO label
+             t0 = time.time()
+             txt_dict = {}
+
+             all_txt = []
+             for txt in df['pred_list']:
+                 if type(txt) == str:
+                     all_txt.extend(eval(txt))
+                 else:
+                     all_txt.extend(txt)
+
+             all_txt = list(set(all_txt))
+             if '' in all_txt:
+                 all_txt.remove('')
+
+             n = len(all_txt)
+             thread = 40
+             size = int(n / thread)
+             inds = list(range(0, n, size))
+             inds.append(n)
+             all_txt_sep = [all_txt[i: min(i + size, n)] for i in inds[:-1]]
+
+             with Pool(processes=thread) as pool:
+                 result = pool.map(fuzzy_match, all_txt_sep)
+                 pool.close()
+                 pool.join()
+             for d in result:
+                 txt_dict.update(d)
+
+             # for txt in all_txt[:10]:
+             #     fuzzy_match(txt)
+
+             df['pred_list'] = df['pred_list'].apply(lambda x: txt_map(x, txt_dict))
+             df['pred_list'] = df['pred_list'].apply(lambda x: list(set(x)))
+             df['pred_list'] = df['pred_list'].apply(lambda x: remove_nan(x))
+             print("fuzzy matching time: {}".format(time.time() - t0))
+
+             df['pred_list_go'] = df['pred_list'].apply(lambda x: [go_map(i) for i in x])
+             return df
+
+
+         test_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/pretrain/output_pretrain.txt', sep='|', header=None)
+         test_pred.columns = ['protein', 'pred', 'GO_label']
+         test_pred['GO_label'] = test_pred['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
+         # The original read `test_pred = test_pred(test)`, which calls a DataFrame and raises;
+         # propagating ancestors over the predictions is assumed to be the intent.
+         test_pred = prop(test_pred)
+         # get_term needs a prop_annotations column, which plain `test` does not carry here,
+         # so it is assumed the propagated predictions were meant.
+         get_term(test_pred)
+         test_pred = pred_text_to_go(test_pred)
+
+         # The original path contained a literal, unformatted '{}'; looping over the three
+         # ontologies mirrors evaluate_cases.py and is assumed to be the intent.
+         for cat in ['mf', 'bp', 'cc']:
+             test_pred.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/{}/test_pretrain.pkl'.format(cat))
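(The F1 computed above is macro-averaged per GO term: recall averages tp/(tp+fn) over the gold terms, while precision sums tp/(tp+fp) over the gold terms but divides by the number of distinct predicted terms. A toy example with hypothetical counts:

    tp = {'GO:1': 2, 'GO:2': 0}
    fn = {'GO:1': 0, 'GO:2': 1}
    fp = {'GO:1': 1, 'GO:2': 0}
    labels = ['GO:1', 'GO:2']
    r = sum(tp[x] / (tp[x] + fn[x] + 1e-8) for x in labels) / len(labels)  # 0.5
    p_total = 2  # distinct predicted terms in this toy case
    p = sum(tp[x] / (tp[x] + fp[x] + 1e-8) for x in labels) / p_total      # ~0.333
    f1 = 2 * p * r / (p + r)                                               # 0.4
)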
data/evaluate_data/evaluate_with_ancestors.py ADDED
@@ -0,0 +1,339 @@
+ import pandas as pd
+ import re
+ import random
+ import Levenshtein
+ import numpy as np
+ import difflib
+ # from torchmetrics.text import BLEUScore
+ import time
+ from multiprocessing import Pool, Queue, Process
+ import matplotlib.pyplot as plt
+ from data.evaluate_data.utils import Ontology
+ # bleu = BLEUScore(n_gram=1)
+
+ def fuzzy_match(texts):
+     text_dict = {}
+     for context in texts:
+         if context not in choices:
+             # txt_dict[txt] = process.extractOne(txt, choices)[0]
+             text_dict[context] = difflib.get_close_matches(context, choices, n=1, cutoff=0.)[0]
+     return text_dict
+
+
+ def get_sim(text, label):
+     # For each gold description, keep the best Levenshtein ratio against any predicted text
+     all_s = []
+     for x in label:
+         s = 0
+         for y in text:
+             temp = Levenshtein.ratio(x, y)
+             if temp > s:
+                 s = temp
+         all_s.append(s)
+     all_s = [round(i, 3) for i in all_s]
+
+     # bs = [bleu(x, [label]) for x in text]
+     return all_s
+
+
+ def txt_map(x, txt_dict):
+     if type(x) == str:
+         x = eval(x)
+     x_ = []
+     for i in x:
+         if i == '':
+             continue
+         if i in txt_dict:
+             x_.append(txt_dict[i])
+         else:
+             x_.append(i)
+     return x_
+
+
+ def go_map(t):
+     if t in GO_dict:
+         return GO_dict[t]
+     else:
+         print(t)
+
+
+ def get_term(df):
+     from collections import Counter
+     cnt = Counter()
+     for i, row in enumerate(df.itertuples()):
+         for term in row.prop_annotations:
+             cnt[term] += 1
+     terms = list(cnt.keys())
+     # remove the three root terms
+     for top_term in ['GO:0005575', 'GO:0003674', 'GO:0008150']:
+         if top_term in terms:
+             terms.remove(top_term)
+     terms_df = pd.DataFrame({'gos': terms})
+     terms_df.to_pickle(f'/cluster/home/wenkai/deepgozero/data/blip2/{cat}/terms.pkl')
+
+
+ if __name__ == "__main__":
+     cat = 'mf'
+
+     go = Ontology(f'/cluster/home/wenkai/deepgozero/data/data/go.obo', with_rels=True)
+     go_des = pd.read_csv('/cluster/home/wenkai/LAVIS/data/go_descriptions_new.txt', sep='|', header=None)
+     go_des.columns = ['GO', 'function']
+     go_des = go_des[go_des['function'].notnull()]
+     go_des['function'] = go_des['function'].apply(lambda x: x.lower().strip())
+     go_des['GO'] = go_des['GO'].apply(lambda x: re.sub('_', ':', x))
+     GO_dict = dict(zip(go_des['function'], go_des['GO']))
+
+
+     data = pd.read_csv('/cluster/home/wenkai/LAVIS/output/predict_concat_test{}.csv'.format(cat), sep='|')
+
+     data['label'] = data['label'].apply(lambda x: x.lower())
+     data['pred'] = data['pred'].apply(lambda x: re.sub('</s>', '', x))
+
+     data['label_list'] = data['label'].apply(lambda x: [i.strip() for i in x.split(';')])
+     data['pred_list'] = data['pred'].apply(lambda x: [i.strip() for i in x.split(';')])
+
+     train = pd.read_csv('/cluster/home/wenkai/LAVIS/data/sim_split/train_{}.csv'.format(cat), sep='|')
+     train = train.drop_duplicates()
+     train['function'] = train['function'].apply(lambda x: x.lower().strip())
+     train_dict = dict(zip(train['function'], train['GO_label']))
+     test = pd.read_csv('/cluster/home/wenkai/LAVIS/data/sim_split/test_{}.csv'.format(cat), sep='|')
+     test = test.drop_duplicates()
+     test['function'] = test['function'].apply(lambda x: x.lower().strip())
+     test_dict = dict(zip(test['function'], test['GO_label']))
+     GO_dict.update(train_dict)
+     GO_dict.update(test_dict)
+
+     choices = []
+     for x in data['label_list'].tolist() + train['function'].tolist():
+         # label_list entries are lists while train['function'] entries are plain strings;
+         # extending with a bare string would add single characters, so wrap it (assumed intent)
+         if isinstance(x, str):
+             x = [x]
+         choices.extend(x)
+     choices = list(set(choices))
+
+
+     ### If a predicted text is not among the GO label terms, treat it as the most similar GO label
+     print("Finding the GO label most similar to each predicted text ......")
+     t0 = time.time()
+     txt_dict = {}
+
+     all_txt = []
+     for txt in data['pred_list']:
+         if type(txt) == str:
+             all_txt.extend(eval(txt))
+         else:
+             all_txt.extend(txt)
+     all_txt = list(set(all_txt))
+
+     n = len(all_txt)
+     thread = 40
+     size = int(n/thread)
+     inds = list(range(0, n, size))
+     inds.append(n)
+     all_txt_sep = [all_txt[i: min(i+size, n)] for i in inds[:-1]]
+
+     with Pool(processes=thread) as pool:
+         result = pool.map(fuzzy_match, all_txt_sep)
+         pool.close()
+         pool.join()
+     for d in result:
+         txt_dict.update(d)
+
+     # for txt in all_txt[:10]:
+     #     fuzzy_match(txt)
+
+     data['pred_list'] = data['pred_list'].apply(lambda x: txt_map(x, txt_dict))
+     data['pred_list'] = data['pred_list'].apply(lambda x: list(set(x)))
+     print("fuzzy matching time: {}".format(time.time() - t0))
+
+
+     # sims = []
+     # for text, label in zip(data['pred_list'].tolist(), data['label_list'].tolist()):
+     #     a = get_sim(text, label)
+     #     sims.append(a)
+     #
+     # data['sim'] = sims
+     # data['avg_sim'] = data['sim'].apply(lambda x: round(np.mean(x), 3))
+     # print("similarity: {}".format(data['avg_sim'].mean()))
+
+
+     print("calculating f1 score ......")
+     data['label_list_go'] = data['label_list'].apply(lambda x: [go_map(i) for i in x])
+     data['pred_list_go'] = data['pred_list'].apply(lambda x: [go_map(i) for i in x])
+
+
+     labels = []
+     pred_labels = []
+     for l in data['label_list_go']:
+         if type(l) == str:
+             l = eval(l)
+         labels.extend(l)
+
+     label_count = {}
+     for x in labels:
+         if x not in label_count:
+             label_count[x] = 1
+         else:
+             label_count[x] += 1
+
+     labels = list(set(labels))
+     total = len(labels)
+     recalls = []
+     precisions = []
+     tp_dict, fp_dict, fn_dict = dict(zip(labels, [0]*len(labels))), dict(zip(labels, [0]*len(labels))), dict(zip(labels, [0]*len(labels)))
+     for preds, label in zip(data['pred_list_go'], data['label_list_go']):
+         if type(label) == str:
+             label = eval(label)
+         if type(preds) == str:
+             preds = eval(preds)  # the original assigned this to an unused name `txts`
+         ll = len(label)
+         for t in label:
+             supgo = go.get_anchestors(t)
+             if supgo.intersection(set(preds)):
+                 tp_dict[t] += 1
+             else:
+                 fn_dict[t] += 1
+         for p in preds:
+             supgo = go.get_anchestors(p)
+             if not supgo.intersection(set(label)):
+                 if p in fp_dict:
+                     fp_dict[p] += 1
+                 else:
+                     fp_dict[p] = 1
+         pred_labels.extend(preds)
+     p_total = len(set(pred_labels))
+     recall, pr = 0., 0.
+     for x in labels:
+         recall += tp_dict[x] / (1.0 * (tp_dict[x] + fn_dict[x] + 1e-8))
+         pr += tp_dict[x] / (1.0 * (tp_dict[x] + fp_dict[x] + 1e-8))
+     r = recall / total
+     p = pr / p_total
+     f1 = 2 * p * r / (p + r)
+
+     print("preds not in labels: {}".format(len(list(fp_dict.keys())) - total))
+     print("f1 score: {}".format(f1))
+
+     '''
+     cat_f1 = {}
+     for x in labels:
+         if tp_dict[x] + fn_dict[x] > 0:
+             re = tp_dict[x] / (1.0 * (tp_dict[x] + fn_dict[x] + 1e-8))
+             pr = tp_dict[x] / (1.0 * (tp_dict[x] + fp_dict[x] + 1e-8))
+             cat_f1[x] = 2 * pr * re / (pr + re + 1e-10)
+
+     plt.xlabel('f score')
+     plt.ylabel('count')
+     print(np.mean(list(cat_f1.values())))
+     plt.hist(list(cat_f1.values()), color='red', bins=30)
+     plt.show()
+
+     xs, ys = [], []
+     for x in labels:
+         xs.append(label_count[x])
+         ys.append(cat_f1[x])
+     df_count = pd.DataFrame({'xs': xs, 'ys': ys})
+     df_count['xs'].loc[df_count['xs'] > 10] = 11
+     df_count['xs'] = df_count['xs'].astype(str)
+     df_count1 = df_count.groupby('xs').mean().reset_index()
+     df_count2 = df_count.groupby('xs').count().reset_index()
+
+     plt.xlabel('label count')
+     plt.ylabel('f score mean')
+     df_count1['xs'] = df_count1['xs'].astype(int)
+     plt.scatter(df_count1['xs'], df_count1['ys'], color='red')
+     plt.show()
+
+     plt.xlabel('label count')
+     plt.ylabel('protein num')
+     df_count2['xs'] = df_count2['xs'].astype(int)
+     plt.bar(df_count2['xs'], df_count2['ys'], color='red')
+     plt.show()
+     '''
+
+
+     # Prepare the data: GO labels predicted by BLIP-2 are the features;
+     # the labels with their ancestors added are the prediction target Y.
+     print("Preparing the data with ancestors added ......")
+     train = pd.read_csv('/cluster/home/wenkai/LAVIS/data/sim_split/train_{}.csv'.format(cat), sep='|')
+     test = pd.read_csv('/cluster/home/wenkai/LAVIS/data/sim_split/test_{}.csv'.format(cat), sep='|')
+     train = train.groupby('name').agg({'GO_label': list}).reset_index()
+     test = test.groupby('name').agg({'GO_label': list}).reset_index()
+
+     def prop(df):
+         prop_annotations = []
+         for i, row in df.iterrows():
+             # Propagate annotations
+             annot_set = set()
+             annots = row['GO_label']
+             for go_id in annots:
+                 annot_set |= go.get_anchestors(go_id)
+             annots = list(annot_set)
+             prop_annotations.append(annots)
+         df['prop_annotations'] = prop_annotations
+         return df
+
+     train = prop(train)
+     test = prop(test)
+
+     train_test = pd.concat([train, test])
+     get_term(train_test)
+     del train_test
+
+     def pred_text_to_go(df):
+         df['pred'] = df['pred'].apply(lambda x: re.sub('</s>', '', x))
+
+         df['pred_list'] = df['pred'].apply(lambda x: [i.strip() for i in x.split(';')])
+         ### If a predicted text is not among the GO label terms, treat it as the most similar GO label
+         t0 = time.time()
+         txt_dict = {}
+
+         all_txt = []
+         for txt in df['pred_list']:
+             if type(txt) == str:
+                 all_txt.extend(eval(txt))
+             else:
+                 all_txt.extend(txt)
+
+         all_txt = list(set(all_txt))
+         if '' in all_txt:
+             all_txt.remove('')
+
+         n = len(all_txt)
+         thread = 40
+         size = int(n / thread)
+         inds = list(range(0, n, size))
+         inds.append(n)
+         all_txt_sep = [all_txt[i: min(i + size, n)] for i in inds[:-1]]
+
+         with Pool(processes=thread) as pool:
+             result = pool.map(fuzzy_match, all_txt_sep)
+             pool.close()
+             pool.join()
+         for d in result:
+             txt_dict.update(d)
+
+         # for txt in all_txt[:10]:
+         #     fuzzy_match(txt)
+
+         df['pred_list'] = df['pred_list'].apply(lambda x: txt_map(x, txt_dict))
+         df['pred_list'] = df['pred_list'].apply(lambda x: list(set(x)))
+         print("fuzzy matching time: {}".format(time.time() - t0))
+
+         df['pred_list_go'] = df['pred_list'].apply(lambda x: [go_map(i) for i in x])
+         return df
+
+
+     train_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/output/predict_concat_train{}.csv'.format(cat), sep='|')
+     test_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/output/predict_concat_test{}.csv'.format(cat), sep='|')
+
+     train_pred = pred_text_to_go(train_pred)
+     test_pred = pred_text_to_go(test_pred)
+
+     train_data = pd.merge(train[['name', 'prop_annotations']],
+                           train_pred[['name', 'pred_list_go']],
+                           on='name', how='inner')
+     train_data = train_data.drop_duplicates('name')
+     train_data.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/{}/train_data.pkl'.format(cat))
+
+     test_data = pd.merge(test[['name', 'prop_annotations']],
+                          test_pred[['name', 'pred_list_go']],
+                          on='name', how='inner')
+     test_data = test_data.drop_duplicates('name')
+     test_data.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/{}/test_data.pkl'.format(cat))
+     test_data.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/{}/valid_data.pkl'.format(cat))
+
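(Unlike evaluate_pretrain.py, this script relaxes the match: a gold term counts as a true positive when any of its ancestors, as returned by go.get_anchestors, appears among the predictions. A hypothetical three-term chain illustrates the difference:

    # GO:A is_a GO:B is_a GO:C (hypothetical ids); the ancestor set is assumed
    # to include the term itself, as in the DeepGO-style Ontology helper.
    ancestors = {'GO:A': {'GO:A', 'GO:B', 'GO:C'}, 'GO:B': {'GO:B', 'GO:C'}}
    label, preds = ['GO:A'], {'GO:B'}
    exact_tp = sum(1 for t in label if t in preds)               # 0
    ancestor_tp = sum(1 for t in label if ancestors[t] & preds)  # 1
)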
data/evaluate_data/evaluate_with_ancestors_exp.py ADDED
@@ -0,0 +1,339 @@
+ import pandas as pd
+ import re
+ import random
+ import Levenshtein
+ import numpy as np
+ import difflib
+ # from torchmetrics.text import BLEUScore
+ import time
+ from multiprocessing import Pool, Queue, Process
+ import matplotlib.pyplot as plt
+ from data.evaluate_data.utils import Ontology
+ # bleu = BLEUScore(n_gram=1)
+
+ def fuzzy_match(texts):
+     text_dict = {}
+     for context in texts:
+         if context not in choices:
+             # txt_dict[txt] = process.extractOne(txt, choices)[0]
+             text_dict[context] = difflib.get_close_matches(context, choices, n=1, cutoff=0.)[0]
+     return text_dict
+
+
+ def get_sim(text, label):
+     # For each gold description, keep the best Levenshtein ratio against any predicted text
+     all_s = []
+     for x in label:
+         s = 0
+         for y in text:
+             temp = Levenshtein.ratio(x, y)
+             if temp > s:
+                 s = temp
+         all_s.append(s)
+     all_s = [round(i, 3) for i in all_s]
+
+     # bs = [bleu(x, [label]) for x in text]
+     return all_s
+
+
+ def txt_map(x, txt_dict):
+     if type(x) == str:
+         x = eval(x)
+     x_ = []
+     for i in x:
+         if i == '':
+             continue
+         if i in txt_dict:
+             x_.append(txt_dict[i])
+         else:
+             x_.append(i)
+     return x_
+
+
+ def go_map(t):
+     if t in GO_dict:
+         return GO_dict[t]
+     else:
+         print(t)
+
+
+ def get_term(df):
+     from collections import Counter
+     cnt = Counter()
+     for i, row in enumerate(df.itertuples()):
+         for term in row.prop_annotations:
+             cnt[term] += 1
+     terms = list(cnt.keys())
+     # remove the three root terms
+     for top_term in ['GO:0005575', 'GO:0003674', 'GO:0008150']:
+         if top_term in terms:
+             terms.remove(top_term)
+     terms_df = pd.DataFrame({'gos': terms})
+     terms_df.to_pickle(f'/cluster/home/wenkai/deepgozero/data/blip2/{cat}/terms.pkl')
+
+
+ if __name__ == "__main__":
+     cat = 'mf'
+
+     go = Ontology(f'/cluster/home/wenkai/deepgozero/data/data/go.obo', with_rels=True)
+     go_des = pd.read_csv('/cluster/home/wenkai/LAVIS/data/go_descriptions_new.txt', sep='|', header=None)
+     go_des.columns = ['GO', 'function']
+     go_des = go_des[go_des['function'].notnull()]
+     go_des['function'] = go_des['function'].apply(lambda x: x.lower().strip())
+     go_des['GO'] = go_des['GO'].apply(lambda x: re.sub('_', ':', x))
+     GO_dict = dict(zip(go_des['function'], go_des['GO']))
+
+
+     data = pd.read_csv('/cluster/home/wenkai/LAVIS/output_exp/predict_concat_test{}.csv'.format(cat), sep='|')
+
+     data['label'] = data['label'].apply(lambda x: x.lower())
+     data['pred'] = data['pred'].apply(lambda x: re.sub('</s>', '', x))
+
+     data['label_list'] = data['label'].apply(lambda x: [i.strip() for i in x.split(';')])
+     data['pred_list'] = data['pred'].apply(lambda x: [i.strip() for i in x.split(';')])
+
+     train = pd.read_csv('/cluster/home/wenkai/LAVIS/data/sim_exp/train_{}.csv'.format(cat), sep='|')
+     train = train.drop_duplicates()
+     train['function'] = train['function'].apply(lambda x: x.lower().strip())
+     train_dict = dict(zip(train['function'], train['GO_label']))
+     test = pd.read_csv('/cluster/home/wenkai/LAVIS/data/sim_exp/test_{}.csv'.format(cat), sep='|')
+     test = test.drop_duplicates()
+     test['function'] = test['function'].apply(lambda x: x.lower().strip())
+     test_dict = dict(zip(test['function'], test['GO_label']))
+     GO_dict.update(train_dict)
+     GO_dict.update(test_dict)
+
+     choices = []
+     for x in data['label_list'].tolist() + train['function'].tolist():
+         # label_list entries are lists while train['function'] entries are plain strings;
+         # extending with a bare string would add single characters, so wrap it (assumed intent)
+         if isinstance(x, str):
+             x = [x]
+         choices.extend(x)
+     choices = list(set(choices))
+
+
+     ### If a predicted text is not among the GO label terms, treat it as the most similar GO label
+     print("Finding the GO label most similar to each predicted text ......")
+     t0 = time.time()
+     txt_dict = {}
+
+     all_txt = []
+     for txt in data['pred_list']:
+         if type(txt) == str:
+             all_txt.extend(eval(txt))
+         else:
+             all_txt.extend(txt)
+     all_txt = list(set(all_txt))
+
+     n = len(all_txt)
+     thread = 40
+     size = int(n/thread)
+     inds = list(range(0, n, size))
+     inds.append(n)
+     all_txt_sep = [all_txt[i: min(i+size, n)] for i in inds[:-1]]
+
+     with Pool(processes=thread) as pool:
+         result = pool.map(fuzzy_match, all_txt_sep)
+         pool.close()
+         pool.join()
+     for d in result:
+         txt_dict.update(d)
+
+     # for txt in all_txt[:10]:
+     #     fuzzy_match(txt)
+
+     data['pred_list'] = data['pred_list'].apply(lambda x: txt_map(x, txt_dict))
+     data['pred_list'] = data['pred_list'].apply(lambda x: list(set(x)))
+     print("fuzzy matching time: {}".format(time.time() - t0))
+
+
+     # sims = []
+     # for text, label in zip(data['pred_list'].tolist(), data['label_list'].tolist()):
+     #     a = get_sim(text, label)
+     #     sims.append(a)
+     #
+     # data['sim'] = sims
+     # data['avg_sim'] = data['sim'].apply(lambda x: round(np.mean(x), 3))
+     # print("similarity: {}".format(data['avg_sim'].mean()))
+
+
+     print("calculating f1 score ......")
+     data['label_list_go'] = data['label_list'].apply(lambda x: [go_map(i) for i in x])
+     data['pred_list_go'] = data['pred_list'].apply(lambda x: [go_map(i) for i in x])
+
+
+     labels = []
+     pred_labels = []
+     for l in data['label_list_go']:
+         if type(l) == str:
+             l = eval(l)
+         labels.extend(l)
+
+     label_count = {}
+     for x in labels:
+         if x not in label_count:
+             label_count[x] = 1
+         else:
+             label_count[x] += 1
+
+     labels = list(set(labels))
+     total = len(labels)
+     recalls = []
+     precisions = []
+     tp_dict, fp_dict, fn_dict = dict(zip(labels, [0]*len(labels))), dict(zip(labels, [0]*len(labels))), dict(zip(labels, [0]*len(labels)))
+     for preds, label in zip(data['pred_list_go'], data['label_list_go']):
+         if type(label) == str:
+             label = eval(label)
+         if type(preds) == str:
+             preds = eval(preds)  # the original assigned this to an unused name `txts`
+         ll = len(label)
+         for t in label:
+             supgo = go.get_anchestors(t)
+             if supgo.intersection(set(preds)):
+                 tp_dict[t] += 1
+             else:
+                 fn_dict[t] += 1
+         for p in preds:
+             supgo = go.get_anchestors(p)
+             if not supgo.intersection(set(label)):
+                 if p in fp_dict:
+                     fp_dict[p] += 1
+                 else:
+                     fp_dict[p] = 1
+         pred_labels.extend(preds)
+     p_total = len(set(pred_labels))
+     recall, pr = 0., 0.
+     for x in labels:
+         recall += tp_dict[x] / (1.0 * (tp_dict[x] + fn_dict[x] + 1e-8))
+         pr += tp_dict[x] / (1.0 * (tp_dict[x] + fp_dict[x] + 1e-8))
+     r = recall / total
+     p = pr / p_total
+     f1 = 2 * p * r / (p + r)
+
+     print("preds not in labels: {}".format(len(list(fp_dict.keys())) - total))
+     print("f1 score: {}".format(f1))
+
+     '''
+     cat_f1 = {}
+     for x in labels:
+         if tp_dict[x] + fn_dict[x] > 0:
+             re = tp_dict[x] / (1.0 * (tp_dict[x] + fn_dict[x] + 1e-8))
+             pr = tp_dict[x] / (1.0 * (tp_dict[x] + fp_dict[x] + 1e-8))
+             cat_f1[x] = 2 * pr * re / (pr + re + 1e-10)
+
+     plt.xlabel('f score')
+     plt.ylabel('count')
+     print(np.mean(list(cat_f1.values())))
+     plt.hist(list(cat_f1.values()), color='red', bins=30)
+     plt.show()
+
+     xs, ys = [], []
+     for x in labels:
+         xs.append(label_count[x])
+         ys.append(cat_f1[x])
+     df_count = pd.DataFrame({'xs': xs, 'ys': ys})
+     df_count['xs'].loc[df_count['xs'] > 10] = 11
+     df_count['xs'] = df_count['xs'].astype(str)
+     df_count1 = df_count.groupby('xs').mean().reset_index()
+     df_count2 = df_count.groupby('xs').count().reset_index()
+
+     plt.xlabel('label count')
+     plt.ylabel('f score mean')
+     df_count1['xs'] = df_count1['xs'].astype(int)
+     plt.scatter(df_count1['xs'], df_count1['ys'], color='red')
+     plt.show()
+
+     plt.xlabel('label count')
+     plt.ylabel('protein num')
+     df_count2['xs'] = df_count2['xs'].astype(int)
+     plt.bar(df_count2['xs'], df_count2['ys'], color='red')
+     plt.show()
+     '''
+
+
+     # Prepare the data: GO labels predicted by BLIP-2 are the features;
+     # the labels with their ancestors added are the prediction target Y.
+     print("Preparing the data with ancestors added ......")
+     train = pd.read_csv('/cluster/home/wenkai/LAVIS/data/sim_exp/train_{}.csv'.format(cat), sep='|')
+     test = pd.read_csv('/cluster/home/wenkai/LAVIS/data/sim_exp/test_{}.csv'.format(cat), sep='|')
+     train = train.groupby('name').agg({'GO_label': list}).reset_index()
+     test = test.groupby('name').agg({'GO_label': list}).reset_index()
+
+     def prop(df):
+         prop_annotations = []
+         for i, row in df.iterrows():
+             # Propagate annotations
+             annot_set = set()
+             annots = row['GO_label']
+             for go_id in annots:
+                 annot_set |= go.get_anchestors(go_id)
+             annots = list(annot_set)
+             prop_annotations.append(annots)
+         df['prop_annotations'] = prop_annotations
+         return df
+
+     train = prop(train)
+     test = prop(test)
+
+     train_test = pd.concat([train, test])
+     get_term(train_test)
+     del train_test
+
+     def pred_text_to_go(df):
+         df['pred'] = df['pred'].apply(lambda x: re.sub('</s>', '', x))
+
+         df['pred_list'] = df['pred'].apply(lambda x: [i.strip() for i in x.split(';')])
+         ### If a predicted text is not among the GO label terms, treat it as the most similar GO label
+         t0 = time.time()
+         txt_dict = {}
+
+         all_txt = []
+         for txt in df['pred_list']:
+             if type(txt) == str:
+                 all_txt.extend(eval(txt))
+             else:
+                 all_txt.extend(txt)
+
+         all_txt = list(set(all_txt))
+         if '' in all_txt:
+             all_txt.remove('')
+
+         n = len(all_txt)
+         thread = 40
+         size = int(n / thread)
+         inds = list(range(0, n, size))
+         inds.append(n)
+         all_txt_sep = [all_txt[i: min(i + size, n)] for i in inds[:-1]]
+
+         with Pool(processes=thread) as pool:
+             result = pool.map(fuzzy_match, all_txt_sep)
+             pool.close()
+             pool.join()
+         for d in result:
+             txt_dict.update(d)
+
+         # for txt in all_txt[:10]:
+         #     fuzzy_match(txt)
+
+         df['pred_list'] = df['pred_list'].apply(lambda x: txt_map(x, txt_dict))
+         df['pred_list'] = df['pred_list'].apply(lambda x: list(set(x)))
+         print("fuzzy matching time: {}".format(time.time() - t0))
+
+         df['pred_list_go'] = df['pred_list'].apply(lambda x: [go_map(i) for i in x])
+         return df
+
+
+     train_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/output_exp/predict_concat_train{}.csv'.format(cat), sep='|')
+     test_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/output_exp/predict_concat_test{}.csv'.format(cat), sep='|')
+
+     train_pred = pred_text_to_go(train_pred)
+     test_pred = pred_text_to_go(test_pred)
+
+     train_data = pd.merge(train[['name', 'prop_annotations']],
+                           train_pred[['name', 'pred_list_go']],
+                           on='name', how='inner')
+     train_data = train_data.drop_duplicates('name')
+     train_data.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/{}/train_data.pkl'.format(cat))
+
+     test_data = pd.merge(test[['name', 'prop_annotations']],
+                          test_pred[['name', 'pred_list_go']],
+                          on='name', how='inner')
+     test_data = test_data.drop_duplicates('name')
+     test_data.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/{}/test_data.pkl'.format(cat))
+     test_data.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/{}/valid_data.pkl'.format(cat))
+
data/evaluate_data/pretrain_output_to_deepgozero.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+ import time
4
+ from multiprocessing import Pool
5
+ import difflib
6
+ from utils import Ontology
7
+ import os
8
+
9
+
10
+ def filter(x_list):
11
+ new_go = []
12
+ # x_list = [i.strip() for i in x.split(';')]
13
+ for i in x_list:
14
+ if i in filter_go:
15
+ new_go.append(i)
16
+ return '; '.join(new_go)
17
+
18
+
19
+ def fuzzy_match(texts):
20
+ text_dict = {}
21
+ for context in texts:
22
+ if context in choices:
23
+ text_dict[context] = context
24
+ elif context not in choices:
25
+ # txt_dict[txt] = process.extractOne(txt, choices)[0]
26
+ sim_list = difflib.get_close_matches(context.lower(), choices, n=1, cutoff=0.9)
27
+ if len(sim_list) > 0:
28
+ text_dict[context] = sim_list[0]
29
+ else:
30
+ # text_dict[context] = ''
31
+ pass
32
+ return text_dict
33
+
34
+
35
+ def txt_map(x, txt_dict):
36
+ if type(x) == str:
37
+ x = eval(x)
38
+ x_ = []
39
+ for i in x:
40
+ if i == '':
41
+ continue
42
+ if i in txt_dict:
43
+ x_.append(txt_dict[i])
44
+ else:
45
+ # x_.append(i)
46
+ pass
47
+ return x_
48
+
49
+
50
+ def go_map_prob(x, GO_dict):
51
+ res = []
52
+ for t in x:
53
+ if t[0] in GO_dict:
54
+ res.append((GO_dict[t[0]], t[1]))
55
+ else:
56
+ pass
57
+ # print("{} not in GO_dict".format(t[0]))
58
+ return res
59
+
60
+
61
+ def txt_map_prob(x, txt_dict):
62
+ if type(x) == str:
63
+ x = eval(x)
64
+ x_ = []
65
+ temp = set()
66
+ for i in x:
67
+ if i[0] == '':
68
+ continue
69
+ elif i[0] in txt_dict and txt_dict[i[0]] not in temp:
70
+ x_.append((txt_dict[i[0]].lower(), i[1]))
71
+ temp.add(txt_dict[i[0]])
72
+ # elif i[0] not in txt_dict:
73
+ # x_.append((i[0].lower(), i[1]))
74
+ # temp.add(i[0])
75
+ else:
76
+ continue
77
+ return x_
78
+
79
+
80
+ def go_map(x, GO_dict):
81
+ res = []
82
+ for t in x:
83
+ if t in GO_dict:
84
+ res.append(GO_dict[t])
85
+ else:
86
+ # pass
87
+ print("{} not in GO_dict".format(t))
88
+ return res
89
+
90
+
91
+ def prop(df):
92
+ prop_annotations = []
93
+ for i, row in df.iterrows():
94
+ # Propagate annotations
95
+ annot_set = set()
96
+ annots = row['GO_label']
97
+ for go_id in annots:
98
+ annot_set |= godb.get_anchestors(go_id)
99
+ annots = list(annot_set)
100
+ prop_annotations.append(annots)
101
+ df['prop_annotations'] = prop_annotations
102
+ return df
103
+
104
+
105
+ def pred_text_to_go(df, with_prob=False):
106
+ # df['pred'] = df['pred'].apply(lambda x: re.sub('</s>', '', x))
107
+ if with_prob:
108
+ df['pred_list_prob'] = df['pred'].apply(lambda x: [eval(i.strip()) for i in x.split(';')])
109
+ df['pred_list'] = df['pred_list_prob'].apply(lambda x: [i[0] for i in x])
110
+ else:
111
+ df['pred_list'] = df['pred'].apply(lambda x: list(set([i.strip() for i in x.split(';')])))
112
+ ### 预测的文本如果不在GO标签词中,则算作最相似的GO标签
113
+ t0 = time.time()
114
+ txt_dict = {}
115
+ all_txt = []
116
+ for txt in df['pred_list']:
117
+ if type(txt) == str:
118
+ all_txt.extend(eval(txt))
119
+ else:
120
+ all_txt.extend(txt)
121
+ all_txt = list(set(all_txt))
122
+ if '' in all_txt:
123
+ all_txt.remove('')
124
+ n = len(all_txt)
125
+ thread = 10
126
+ size = int(n / thread)
127
+ inds = list(range(0, n, size))
128
+ inds.append(n)
129
+ all_txt_sep = [all_txt[i: min(i + size, n)] for i in inds[:-1]]
130
+ with Pool(processes=thread) as pool:
131
+ result = pool.map(fuzzy_match, all_txt_sep)
132
+ pool.close()
133
+ pool.join()
134
+ for d in result:
135
+ txt_dict.update(d)
136
+ # print(txt_dict)
137
+ # for txt in all_txt[:10]:
138
+ # fuzzy_match(txt)
139
+ if with_prob:
140
+ df['pred_list_prob'] = df['pred_list_prob'].apply(lambda x: txt_map_prob(x, txt_dict))
141
+ print("fuzzy matching time: {}".format(time.time() - t0))
142
+ df['pred_list_go_prob'] = df['pred_list_prob'].apply(lambda x: go_map_prob(x, GO_dict))
143
+ n0 = df.shape[0]
144
+ df['len'] = df['pred_list_go_prob'].apply(lambda x: len(x))
145
+ df = df[df['len'] > 0]
146
+ df = df.drop('len', axis=1)
147
+ df = df.dropna()
148
+ print('{}条数据,不为空的预测有{}条'.format(n0, df.shape[0]))
149
+ else:
150
+ df['pred_list'] = df['pred_list'].apply(lambda x: txt_map(x, txt_dict))
151
+ df['pred_list'] = df['pred_list'].apply(lambda x: [i.lower() for i in list(set(x))])
152
+ print("fuzzy matching time: {}".format(time.time() - t0))
153
+ df['pred_list_go'] = df['pred_list'].apply(lambda x: go_map(x, GO_dict))
154
+
155
+ n0 = df.shape[0]
156
+ df['len'] = df['pred_list_go'].apply(lambda x: len(x))
157
+ df = df[df['len'] > 0]
158
+ df = df.drop('len', axis=1)
159
+ df = df.dropna()
160
+ print('{}条数据,不为空的预测有{}条'.format(n0, df.shape[0]))
161
+ return df
162
+
163
+
164
+ def cal_f1(df):
165
+ df['label_list_go'] = df['label'].apply(lambda x: [i.strip() for i in x.split(';')])
166
+ df['pred_list_go'] = df['pred_list'].apply(lambda x: [i.strip() for i in x.split(';')])
167
+
168
+ labels = []
169
+ pred_labels = []
170
+ for l in df['label_list_go']:
171
+ labels.extend(l)
172
+
173
+ label_count = {}
174
+ for x in labels:
175
+ if x not in label_count:
176
+ label_count[x] = 1
177
+ else:
178
+ label_count[x] += 1
179
+
180
+ labels = list(set(labels))
181
+ total = len(labels)
182
+ tp_dict, fp_dict, fn_dict = dict(zip(labels, [0] * len(labels))), dict(zip(labels, [0] * len(labels))), dict(
183
+ zip(labels, [0] * len(labels)))
184
+ for preds, label in zip(df['pred_list_go'], df['label_list_go']):
185
+ for t in label:
186
+ # supgo = godb.get_anchestors(t)
187
+ # if supgo.intersection(set(preds)):
188
+ if t in preds:
189
+ tp_dict[t] += 1
190
+ else:
191
+ fn_dict[t] += 1
192
+ for p in preds:
193
+ # supgo = godb.get_anchestors(p)
194
+ # if not supgo.intersection(set(label)):
195
+ if p not in label:
196
+ if p in fp_dict:
197
+ fp_dict[p] += 1
198
+ else:
199
+ fp_dict[p] = 1
200
+ pred_labels.extend(preds)
201
+ p_total = len(set(pred_labels))
202
+ recall, pr = 0., 0.
203
+ for x in labels:
204
+ recall += tp_dict[x] / (1.0 * (tp_dict[x] + fn_dict[x] + 1e-8))
205
+ pr += tp_dict[x] / (1.0 * (tp_dict[x] + fp_dict[x] + 1e-8))
206
+ r = recall / total
207
+ p = pr / p_total
208
+ f1 = 2 * p * r / (p + r)
209
+
210
+ print("preds not in labels: {}".format(len(list(fp_dict.keys())) - total))
211
+ print("recall:{}; percision:{}; f1 score: {}".format(r, p, f1))
212
+
213
+
214
+ def cat_go(x):
215
+ try:
216
+ cat = godb.get_namespace(x)
217
+ except:
218
+ print("{} not found".format(x))
219
+ return
220
+ if cat == NAMESPACES['mf']:
221
+ return 'mf'
222
+ elif cat == NAMESPACES['bp']:
223
+ return 'bp'
224
+ elif cat == NAMESPACES['cc']:
225
+ return 'cc'
226
+ return
227
+
228
+
229
+ def remove_root(x):
230
+ if 'molecular_function' in x:
231
+ x.remove('molecular_function')
232
+ if 'biological_process' in x:
233
+ x.remove('biological_process')
234
+ if 'cellular_component' in x:
235
+ x.remove('cellular_component')
236
+ return x
237
+
238
+ if __name__ == "__main__":
239
+ NAMESPACES = {
240
+ 'cc': 'cellular_component',
241
+ 'mf': 'molecular_function',
242
+ 'bp': 'biological_process'
243
+ }
244
+ #if not os.path.exists('/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/terms.pkl'):
245
+ if 1==1:
246
+ data = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/swissprot_domain_and_train_exp_prompt_new.csv', sep='|')
247
+ print('数据规模:{}'.format(data.shape[0]))
248
+ # data['function'] = data['function'].apply(lambda x: re.sub('[FPC]:', '', x))
249
+ # data.to_csv('swissprot_domain_and_train_exp.csv', sep='|', index=False)
250
+
251
+ godb = Ontology(f'/cluster/home/wenkai/LAVIS/data/go1.4-basic.obo', with_rels=True)
252
+ go_des = pd.read_csv('/cluster/home/wenkai/LAVIS/data/go_descriptions1.4.txt', sep='|', header=None)
253
+ go_des.columns = ['id', 'text']
254
+ go_des = go_des.dropna()
255
+ go_des['id'] = go_des['id'].apply(lambda x: re.sub('_', ':', x))
256
+ go_des['ont'] = go_des['id'].apply(lambda x: cat_go(x))
257
+ go_des = go_des.dropna()
258
+ go_obo_set = set(go_des['id'].tolist())
259
+ go_des['text'] = go_des['text'].apply(lambda x: x.lower())
260
+
261
+ data['GO_label'] = data['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
262
+ data = prop(data)
263
+
264
+ # 加入父节点,得到完整的terms,映射表等等
265
+ go_dict = {}
266
+ for x_list in data['prop_annotations']:
267
+ for goid in x_list:
268
+ if goid in go_dict:
269
+ go_dict[goid] += 1
270
+ else:
271
+ go_dict[goid] = 1
272
+ df_stat = pd.DataFrame({'id': list(go_dict.keys()), 'count': list(go_dict.values())})
273
+ data_gos = set(df_stat['id'].tolist())
274
+ go_des = go_des[go_des['id'].isin(data_gos)]
275
+ filter_go = data_gos.intersection(go_obo_set)
276
+ print(f"包括父节点的GO有{len(data_gos)}个,其中在go1.4.obo中出现的GO有{len(filter_go)}个")
277
+
278
+ go_des.to_pickle('/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/go_des.pkl')
279
+ id2text_dict = dict(zip(go_des['id'], go_des['text']))
280
+ GO_dict = dict(zip(go_des['text'], go_des['id']))
281
+
282
+ choices_mf = list(set(go_des[go_des['ont'] == 'mf']['text']))
283
+ choices_bp = list(set(go_des[go_des['ont'] == 'bp']['text']))
284
+ choices_cc = list(set(go_des[go_des['ont'] == 'cc']['text']))
285
+
286
+ choices_mf = {x.lower(): x for x in choices_mf}
287
+ choices_bp = {x.lower(): x for x in choices_bp}
288
+ choices_cc = {x.lower(): x for x in choices_cc}
289
+
290
+ data['GO_label'] = data['GO_label'].apply(lambda x: filter(x))
291
+ data = data[data['GO_label'] != '']
292
+ data['function'] = data['GO_label'].apply(lambda x: [id2text_dict[i.strip()] for i in x.split(';')])
293
+ data['function'] = data['function'].apply(lambda x: '; '.join(x))
294
+
295
+ terms = pd.DataFrame({'gos': list(filter_go)})
296
+ terms.to_pickle('/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/terms.pkl')
297
+ terms.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/terms.pkl')
298
+
299
+ terms_mf = pd.DataFrame({'gos': list(set(go_des[go_des['ont'] == 'mf']['id']))})
300
+ terms_mf.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/mf/terms.pkl')
301
+ terms_mf.to_pickle('/cluster/home/wenkai/deepgo2/data/mf/terms.pkl')
302
+ terms_bp = pd.DataFrame({'gos': list(set(go_des[go_des['ont'] == 'bp']['id']))})
303
+ terms_bp.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/bp/terms.pkl')
304
+ terms_bp.to_pickle('/cluster/home/wenkai/deepgo2/data/bp/terms.pkl')
305
+ terms_cc = pd.DataFrame({'gos': list(set(go_des[go_des['ont'] == 'cc']['id']))})
306
+ terms_cc.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/cc/terms.pkl')
307
+ terms_cc.to_pickle('/cluster/home/wenkai/deepgo2/data/cc/terms.pkl')
308
+ else:
309
+ godb = Ontology(f'/cluster/home/wenkai/LAVIS/data/go1.4-basic.obo', with_rels=True)
310
+ terms = pd.read_pickle('/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/terms.pkl')
311
+ filter_go = set(terms['gos'].tolist())
312
+
313
+ terms_mf = pd.read_pickle('/cluster/home/wenkai/deepgo2/data/mf/terms.pkl')
314
+ terms_bp = pd.read_pickle('/cluster/home/wenkai/deepgo2/data/bp/terms.pkl')
315
+ terms_cc = pd.read_pickle('/cluster/home/wenkai/deepgo2/data/cc/terms.pkl')
316
+
317
+ choices_mf = {x.lower(): x for x in terms_mf['gos'].tolist()}
318
+ choices_bp = {x.lower(): x for x in terms_bp['gos'].tolist()}
319
+ choices_cc = {x.lower(): x for x in terms_cc['gos'].tolist()}
320
+
321
+ go_des = pd.read_pickle('/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/go_des.pkl')
322
+ id2text_dict = dict(zip(go_des['id'], go_des['text']))
323
+ GO_dict = dict(zip(go_des['text'], go_des['id']))
324
+
325
+ # 对于预测文件,进行GO筛选,并用相似度算法匹配到filter_go;对于train test val 文件,进行GO筛选、加入祖先、加入interPro特征
326
+ # 加入interpro特征
327
+ df_interpro = pd.read_csv('/cluster/home/wenkai/LAVIS/data/uniprot_sprot_blip2_func_data.txt', sep='|',
328
+ nrows=546389,
329
+ header=None)
330
+ df_interpro.columns = ['name', 'seq', 'go', 'text', 'evi', 'ipr']
331
+ df_interpro = df_interpro[df_interpro['ipr'].notnull()]
332
+ df_interpro['ipr'] = df_interpro['ipr'].apply(lambda x: [i.strip() for i in x.split(';')])
333
+
334
+ iprs = []
335
+ for x in df_interpro['ipr'].tolist():
336
+ if len(x) > 0:
337
+ iprs.extend(x)
338
+ iprs = list(set(iprs))
339
+ print("ipr个数:{}".format(len(iprs)))
340
+ df_ipr = pd.DataFrame({'interpros': iprs})
341
+ df_ipr.to_pickle('/cluster/home/wenkai/LAVIS/data/interpros.pkl')
342
+ df_ipr.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/interpros.pkl')
343
+
344
+
+     '''
+     # test cases
+     df_real = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/test_2000.csv', sep='|')
+     df_real[col] = df_real[col].apply(lambda x: [i.strip() for i in x.split(';')])
+     #df_real[col] = df_real[col].apply(lambda x: filter(x))
+     df_real = df_real[df_real[col] != '']
+     print(df_real.shape)
+     #df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [id2text_dict[i] for i in x])
+     #df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [GO_dict[i] for i in x])
+     df_real = prop(df_real)
+     #df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: [id2text_dict[i] for i in x])
+     #df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: remove_root(x))
+     #df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: list(set([GO_dict[i] for i in x])))
+     for ont in ['mf', 'bp', 'cc']:
+         file_name = 'output_{}_test_2000'.format(ont)
+         if ont == 'mf':
+             choices = choices_mf
+         elif ont == 'bp':
+             choices = choices_bp
+         elif ont == 'cc':
+             choices = choices_cc
+         print("Standardizing the predicted text for {}...".format(file_name))
+         df_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/output/{}.txt'.format(file_name), sep='|', header=None, on_bad_lines='skip')
+         df_pred.columns = ['name', 'pred', 'label']
+         n0 = df_pred.shape[0]
+         df_pred = pred_text_to_go(df_pred, with_prob=True)
+         print("{}: {} rows could not be matched to a sufficiently similar GO description".format(file_name, n0 - df_pred.shape[0]))
+         #df_pred['pred_list'] = df_pred['pred_list'].apply(lambda x: '; '.join(x))
+         #cal_f1(df_pred)
+         df_pred[['name', 'pred_list_prob', 'label']].to_csv('/cluster/home/wenkai/LAVIS/output/{}_standard.csv'.format(file_name), sep='|', index=False)
+ 
+         df_pred = pd.merge(df_pred[['name', 'pred_list_go_prob']], df_interpro[['name', 'ipr']], on='name', how='left')
+         df_pred['ipr'] = df_pred['ipr'].fillna("").apply(list)
+         ipr_and_pred = []
+         for x, y in zip(df_pred['ipr'], df_pred['pred_list_go_prob']):
+             try:
+                 ipr_and_pred.append(x + y)
+             except:
+                 ipr_and_pred.append(y)
+         df_pred['ipr_and_pred'] = ipr_and_pred
+         print(df_real.isnull().sum())
+         df_pred = pd.merge(df_pred, df_real[['name', 'protein', 'prop_annotations']], on='name', how='left')
+         #df_pred = df_pred.dropna()
+         print(df_pred.shape)
+         df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
+             '/cluster/home/wenkai/deepgozero/data/blip2/pretrain/{}/test_2000_data.pkl'.format(ont))
+     '''
+ 
+     '''
+     df_real = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/nextprot_mf.csv', sep='|')
+     df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
+     df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [id2text_dict[i] for i in x])
+     df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [GO_dict[i] for i in x])
+     df_real = prop(df_real)
+     df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: [id2text_dict[i] for i in x])
+     df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: remove_root(x))
+     df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: list(set([GO_dict[i] for i in x])))
+ 
+     file = 'output_nextprot'
+     choices = choices_mf
+     df_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/output/{}.txt'.format(file), sep='|', header=None, on_bad_lines='skip')
+     df_pred.columns = ['name', 'pred', 'label']
+     df_pred = pred_text_to_go(df_pred, with_prob=True)
+     df_pred[['name', 'pred_list_prob', 'label']].to_csv('/cluster/home/wenkai/LAVIS/output/{}_standard.csv'.format(file), sep='|', index=False)
+ 
+     df_pred = pd.merge(df_pred, df_real[['name', 'protein', 'prop_annotations']], on='name', how='left')
+     df_pred['ipr'] = [[] for _ in range(df_pred.shape[0])]
+     df_pred['ipr_and_pred'] = df_pred['pred_list_go_prob']
+     df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
+         '/cluster/home/wenkai/deepgozero/data/blip2/pretrain/mf/nextprot_data.pkl')
+     '''
+ 
+     # '''
+     cat_id = {'mf': '445772', 'bp': '496359', 'cc': '505955'}
+     col = 'GO_label'
+     for ont in ['mf', 'bp', 'cc']:
+     # for ont in ['mf']:
+         if ont == 'mf':
+             choices = choices_mf
+         elif ont == 'bp':
+             choices = choices_bp
+         elif ont == 'cc':
+             choices = choices_cc
+         for split in ['train', 'val', 'test']:
+         # for split in ['test']:
+             df_real = pd.read_csv(f'/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/{split}_exp_{ont}_new.csv',
+                                   sep='|')
+             df_real[col] = df_real[col].apply(lambda x: [i.strip() for i in x.split(';')])
+             df_real[col] = df_real[col].apply(lambda x: filter(x))
+             df_real = df_real[df_real[col] != '']
+             print(df_real.shape)
+             df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
+             df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [id2text_dict[i] for i in x])
+             df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [GO_dict[i] for i in x])
+             df_real = prop(df_real)
+             df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: [id2text_dict[i] for i in x])
+             df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: remove_root(x))
+             df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: list(set([GO_dict[i] for i in x])))
+ 
+             # Map the predicted text back to GO ids
+             df_pred = pd.read_csv(
+                 f'/cluster/home/wenkai/LAVIS/output/mf_bp_cc/output_{split}_{ont}_exp_{cat_id[ont]}.txt', sep='|',
+                 header=None, on_bad_lines='skip')
+             df_pred.columns = ['name', 'pred', 'label']
+             n0 = df_pred.shape[0]
+             df_pred = pred_text_to_go(df_pred, with_prob=True)
+             print("{}: {} rows could not be matched to a sufficiently similar GO description".format(ont, n0 - df_pred.shape[0]))
+             df_pred[['name', 'pred_list_prob', 'label']].to_csv(
+                 f'/cluster/home/wenkai/LAVIS/output/mf_bp_cc/output_{split}_{ont}_{cat_id[ont]}_standard.csv', sep='|',
+                 index=False)
+ 
+             df_pred = pd.merge(df_pred[['name', 'pred_list_go_prob']], df_interpro[['name', 'ipr']], on='name', how='left')
+             df_pred['ipr'] = df_pred['ipr'].fillna("").apply(list)
+             ipr_and_pred = []
+             for x, y in zip(df_pred['ipr'], df_pred['pred_list_go_prob']):
+                 try:
+                     ipr_and_pred.append(x + y)
+                 except:
+                     ipr_and_pred.append(y)
+             df_pred['ipr_and_pred'] = ipr_and_pred
+ 
+             df_pred = pd.merge(df_pred, df_real[['name', 'protein', 'prop_annotations']], on='name', how='left')
+             df_pred = df_pred.dropna()
+             df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
+                 f'/cluster/home/wenkai/deepgozero/data/blip2/pretrain/{ont}/{split}_data_{cat_id[ont]}.pkl')
+             df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
+                 f'/cluster/home/wenkai/deepgo2/data/{ont}/{split}_data_{cat_id[ont]}.pkl')
+             if split == 'val':
+                 df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
+                     f'/cluster/home/wenkai/deepgozero/data/blip2/pretrain/{ont}/valid_data_{cat_id[ont]}.pkl')
+                 df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
+                     f'/cluster/home/wenkai/deepgo2/data/{ont}/valid_data_{cat_id[ont]}.pkl')
+             print(f"{ont} {split} deepgozero propagation data completed")
+     # '''
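+     # Each pickle written above carries the columns
+     # [name, protein, ipr, pred_list_go_prob, ipr_and_pred, prop_annotations];
+     # judging by the output paths, this is presumably the input format consumed
+     # by the downstream deepgozero/deepgo2 training scripts outside this repo.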
data/evaluate_data/process_case.py ADDED
@@ -0,0 +1,50 @@
+ import pandas as pd
+ from utils import Ontology
+ 
+ 
+ def prop(df):
+     prop_annotations = []
+     for i, row in df.iterrows():
+         # Propagate annotations
+         annot_set = set()
+         annots = row['GO_label']
+         for go_id in annots:
+             annot_set |= godb.get_anchestors(go_id)
+         annots = list(annot_set)
+         prop_annotations.append(annots)
+     df['prop_annotations'] = prop_annotations
+     return df
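+ 
+ # prop() replaces each row's GO annotations with their full ancestor closure
+ # (Ontology.get_anchestors), the same propagation step applied to the
+ # train/val/test splits in evaluate_cases.py.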
+ 
+ godb = Ontology(f'/cluster/home/wenkai/LAVIS/data/go1.4-basic.obo', with_rels=True)
+ 
+ case_mf = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/cases_mf.csv', sep='|')
+ 
+ # bp cases, including the capsaicin receptor
+ case_bp = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/cases_bp.csv', sep='|')
+ case_bp['GO_label'] = case_bp['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
+ case_bp = prop(case_bp)
+ case_bp['GO_label'] = case_bp['GO_label'].apply(lambda x: '; '.join(x))
+ case_bp['prop_annotations'] = case_bp['prop_annotations'].apply(lambda x: '; '.join(x))
+ case_bp[['name', 'protein', 'function', 'GO_label', 'id', 'prompt', 'prop_annotations']].to_pickle('/cluster/home/wenkai/deepgo2/data/bp/cases_data.pkl')
+ 
+ case_mf['GO_label'] = case_mf['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
+ case_mf = prop(case_mf)
+ case_mf['GO_label'] = case_mf['GO_label'].apply(lambda x: '; '.join(x))
+ case_mf['prop_annotations'] = case_mf['prop_annotations'].apply(lambda x: '; '.join(x))
+ 
+ case_bp['GO_label'] = case_bp['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
+ case_bp = prop(case_bp)
+ case_mf[['name', 'protein', 'function', 'GO_label', 'id', 'prompt', 'prop_annotations']].to_pickle('/cluster/home/wenkai/deepgo2/data/mf/cases_data_445772.pkl')
data/evaluate_data/utils.py ADDED
@@ -0,0 +1,280 @@
+ from collections import deque, Counter
+ import warnings
+ import pandas as pd
+ import numpy as np
+ from xml.etree import ElementTree as ET
+ import math
+ 
+ BIOLOGICAL_PROCESS = 'GO:0008150'
+ MOLECULAR_FUNCTION = 'GO:0003674'
+ CELLULAR_COMPONENT = 'GO:0005575'
+ FUNC_DICT = {
+     'cc': CELLULAR_COMPONENT,
+     'mf': MOLECULAR_FUNCTION,
+     'bp': BIOLOGICAL_PROCESS}
+ 
+ NAMESPACES = {
+     'cc': 'cellular_component',
+     'mf': 'molecular_function',
+     'bp': 'biological_process'
+ }
+ 
+ EXP_CODES = set([
+     'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC',
+     'HTP', 'HDA', 'HMP', 'HGI', 'HEP'])
+ 
+ # CAFA4 Targets
+ CAFA_TARGETS = set([
+     '287', '3702', '4577', '6239', '7227', '7955', '9606', '9823', '10090',
+     '10116', '44689', '83333', '99287', '226900', '243273', '284812', '559292'])
+ 
+ 
+ def is_cafa_target(org):
+     return org in CAFA_TARGETS
+ 
+ 
+ def is_exp_code(code):
+     return code in EXP_CODES
+ 
+ 
+ def get_goplus_defs(filename='data/definitions.txt'):
+     plus_defs = {}
+     with open(filename) as f:
+         for line in f:
+             line = line.strip()
+             go_id, definition = line.split(': ')
+             go_id = go_id.replace('_', ':')
+             definition = definition.replace('_', ':')
+             plus_defs[go_id] = set(definition.split(' and '))
+     return plus_defs
+ 
+ 
+ class Ontology(object):
+ 
+     def __init__(self, filename='data/go.obo', with_rels=False):
+         self.ont = self.load(filename, with_rels)
+         self.ic = None
+         self.ic_norm = 0.0
+ 
+     def has_term(self, term_id):
+         return term_id in self.ont
+ 
+     def get_term(self, term_id):
+         if self.has_term(term_id):
+             return self.ont[term_id]
+         return None
+ 
+     def calculate_ic(self, annots):
+         cnt = Counter()
+         for x in annots:
+             cnt.update(x)
+         self.ic = {}
+         for go_id, n in cnt.items():
+             parents = self.get_parents(go_id)
+             if len(parents) == 0:
+                 min_n = n
+             else:
+                 min_n = min([cnt[x] for x in parents])
+ 
+             self.ic[go_id] = math.log(min_n / n, 2)
+             self.ic_norm = max(self.ic_norm, self.ic[go_id])
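+         # calculate_ic implements annotation-based information content:
+         # IC(t) = log2(min_parent_count / count(t)), so a term much rarer than its
+         # least-frequent parent scores high; ic_norm tracks the running maximum
+         # used by get_norm_ic for normalization.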
+ 
+     def get_ic(self, go_id):
+         if self.ic is None:
+             raise Exception('Not yet calculated')
+         if go_id not in self.ic:
+             return 0.0
+         return self.ic[go_id]
+ 
+     def get_norm_ic(self, go_id):
+         return self.get_ic(go_id) / self.ic_norm
+ 
+     def load(self, filename, with_rels):
+         ont = dict()
+         obj = None
+         with open(filename, 'r') as f:
+             for line in f:
+                 line = line.strip()
+                 if not line:
+                     continue
+                 if line == '[Term]':
+                     if obj is not None:
+                         ont[obj['id']] = obj
+                     obj = dict()
+                     obj['is_a'] = list()
+                     obj['part_of'] = list()
+                     obj['regulates'] = list()
+                     obj['alt_ids'] = list()
+                     obj['is_obsolete'] = False
+                     continue
+                 elif line == '[Typedef]':
+                     if obj is not None:
+                         ont[obj['id']] = obj
+                     obj = None
+                 else:
+                     if obj is None:
+                         continue
+                     l = line.split(": ")
+                     if l[0] == 'id':
+                         obj['id'] = l[1]
+                     elif l[0] == 'alt_id':
+                         obj['alt_ids'].append(l[1])
+                     elif l[0] == 'namespace':
+                         obj['namespace'] = l[1]
+                     elif l[0] == 'is_a':
+                         obj['is_a'].append(l[1].split(' ! ')[0])
+                     elif with_rels and l[0] == 'relationship':
+                         it = l[1].split()
+                         # add all types of relationships
+                         obj['is_a'].append(it[1])
+                     elif l[0] == 'name':
+                         obj['name'] = l[1]
+                     elif l[0] == 'is_obsolete' and l[1] == 'true':
+                         obj['is_obsolete'] = True
+         if obj is not None:
+             ont[obj['id']] = obj
+         for term_id in list(ont.keys()):
+             for t_id in ont[term_id]['alt_ids']:
+                 ont[t_id] = ont[term_id]
+             if ont[term_id]['is_obsolete']:
+                 del ont[term_id]
+         for term_id, val in ont.items():
+             if 'children' not in val:
+                 val['children'] = set()
+             for p_id in val['is_a']:
+                 if p_id in ont:
+                     if 'children' not in ont[p_id]:
+                         ont[p_id]['children'] = set()
+                     ont[p_id]['children'].add(term_id)
+ 
+         return ont
+ 
+     def get_anchestors(self, term_id):
+         if term_id not in self.ont:
+             return set()
+         term_set = set()
+         q = deque()
+         q.append(term_id)
+         while (len(q) > 0):
+             t_id = q.popleft()
+             if t_id not in term_set:
+                 term_set.add(t_id)
+                 for parent_id in self.ont[t_id]['is_a']:
+                     if parent_id in self.ont:
+                         q.append(parent_id)
+         return term_set
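+     # Note: get_anchestors returns the query term itself plus all of its
+     # ancestors, via a breadth-first walk over 'is_a' edges; when the ontology
+     # is loaded with with_rels=True, other relationship types (e.g. part_of)
+     # are folded into 'is_a' by load() above.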
+ 
+     def get_prop_terms(self, terms):
+         prop_terms = set()
+ 
+         for term_id in terms:
+             prop_terms |= self.get_anchestors(term_id)
+         return prop_terms
+ 
+     def get_parents(self, term_id):
+         if term_id not in self.ont:
+             return set()
+         term_set = set()
+         for parent_id in self.ont[term_id]['is_a']:
+             if parent_id in self.ont:
+                 term_set.add(parent_id)
+         return term_set
+ 
+     def get_namespace_terms(self, namespace):
+         terms = set()
+         for go_id, obj in self.ont.items():
+             if obj['namespace'] == namespace:
+                 terms.add(go_id)
+         return terms
+ 
+     def get_namespace(self, term_id):
+         return self.ont[term_id]['namespace']
+ 
+     def get_term_set(self, term_id):
+         if term_id not in self.ont:
+             return set()
+         term_set = set()
+         q = deque()
+         q.append(term_id)
+         while len(q) > 0:
+             t_id = q.popleft()
+             if t_id not in term_set:
+                 term_set.add(t_id)
+                 for ch_id in self.ont[t_id]['children']:
+                     q.append(ch_id)
+         return term_set
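+ 
+     # A minimal usage sketch (illustrative GO id; adjust the .obo path as needed):
+     #   go = Ontology('data/go1.4-basic.obo', with_rels=True)
+     #   go.get_parents('GO:0008137')      # direct parents
+     #   go.get_anchestors('GO:0008137')   # term + all ancestors
+     #   go.get_term_set('GO:0008137')     # term + all descendants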
+ 
+ 
+ def read_fasta(filename):
+     seqs = list()
+     info = list()
+     seq = ''
+     inf = ''
+     with open(filename, 'r') as f:
+         for line in f:
+             line = line.strip()
+             if line.startswith('>'):
+                 if seq != '':
+                     seqs.append(seq)
+                     info.append(inf)
+                     seq = ''
+                 inf = line[1:].split()[0]
+             else:
+                 seq += line
+     seqs.append(seq)
+     info.append(inf)
+     return info, seqs
+ 
+ 
+ class DataGenerator(object):
+ 
+     def __init__(self, batch_size, is_sparse=False):
+         self.batch_size = batch_size
+         self.is_sparse = is_sparse
+ 
+     def fit(self, inputs, targets=None):
+         self.start = 0
+         self.inputs = inputs
+         self.targets = targets
+         if isinstance(self.inputs, tuple) or isinstance(self.inputs, list):
+             self.size = self.inputs[0].shape[0]
+         else:
+             self.size = self.inputs.shape[0]
+         self.has_targets = targets is not None
+ 
+     def __next__(self):
+         return self.next()
+ 
+     def reset(self):
+         self.start = 0
+ 
+     def next(self):
+         if self.start < self.size:
+             batch_index = np.arange(
+                 self.start, min(self.size, self.start + self.batch_size))
+             if isinstance(self.inputs, tuple) or isinstance(self.inputs, list):
+                 res_inputs = []
+                 for inp in self.inputs:
+                     if self.is_sparse:
+                         res_inputs.append(
+                             inp[batch_index, :].toarray())
+                     else:
+                         res_inputs.append(inp[batch_index, :])
+             else:
+                 if self.is_sparse:
+                     res_inputs = self.inputs[batch_index, :].toarray()
+                 else:
+                     res_inputs = self.inputs[batch_index, :]
+             self.start += self.batch_size
+             if self.has_targets:
+                 if self.is_sparse:
+                     labels = self.targets[batch_index, :].toarray()
+                 else:
+                     labels = self.targets[batch_index, :]
+                 return (res_inputs, labels)
+             return res_inputs
+         else:
+             self.reset()
+             return self.next()
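+ 
+ # Note: DataGenerator.next() wraps around to the first batch once the data is
+ # exhausted (reset + recurse), so it never raises StopIteration; callers must
+ # bound the number of batches per epoch themselves.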
data/fasta/example.fasta ADDED
@@ -0,0 +1,2 @@
+ >P18281
+ MNPELQSAIGQGAALKHAETVDKSAPQIENVTVKKVDRSSFLEEVAKPHELKHAETVDKSGPAIPEDVHVKKVDRGAFLSEIEKAAKQ
data/fasta/prepare_custom_fasta.py ADDED
@@ -0,0 +1,7 @@
+ # prepare fasta data
+ name_list = ['P18281']
+ sequence_list = ['MNPELQSAIGQGAALKHAETVDKSAPQIENVTVKKVDRSSFLEEVAKPHELKHAETVDKSGPAIPEDVHVKKVDRGAFLSEIEKAAKQ']
+ with open('example.fasta', 'w') as f:
+     for i, j in zip(name_list, sequence_list):
+         f.write('>{}\n'.format(i))
+         f.write('{}\n'.format(j))
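+ # Extending name_list/sequence_list with more (id, sequence) pairs writes one
+ # two-line FASTA record per protein, in the same format as data/fasta/example.fasta.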
data/go1.4-basic.obo ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3da20cc774d666b4338446bc81341eaf536885dc10ccb667480a79f6b964aa3c
+ size 31134256
data/go_descriptions1.4.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/swissprot_exp/test_exp_prompt_bp_new.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/swissprot_exp/test_exp_prompt_cc_new.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/swissprot_exp/test_exp_prompt_mf_new.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/swissprot_exp/train_exp_prompt_bp_new.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12359211ab95f1ce1962b69f033b55e9f502a7527f49414792d1c117ec50b0be
+ size 28503657
data/swissprot_exp/train_exp_prompt_cc_new.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01c6144b0e338d3ce8ce98adfd4f9d09f56dc58cd347f4fbaafb6782d694ffd1
+ size 23292609
data/swissprot_exp/train_exp_prompt_mf_new.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca3eee941dfc0ee37f59adec6abf8a7276441f04c484a9275274c7003ef4145e
+ size 18791760
data/swissprot_exp/val_exp_prompt_bp_new.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/swissprot_exp/val_exp_prompt_cc_new.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/swissprot_exp/val_exp_prompt_mf_new.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/terms/bp_terms.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4952f3551e4fe205640b81f9a1816c15c14cc889bbe55f57d378fb3c6d57f2f7
+ size 274892
data/terms/cc_terms.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:20992c211336c4f876c920c2995ae85c1422e8742b7094c997aa70ddec7fc8fd
+ size 39440
data/terms/mf_terms.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:192861bad821ef3523ab2dcdd1db5eac093364e9b9b4869f75587d656864d29b
+ size 107802