wenkai committed
Commit: 3daa625
Parent(s): 4e11d5e

Update app.py

Files changed (1):
  1. app.py +68 -4
app.py CHANGED
@@ -10,20 +10,84 @@ import gradio as gr
 from esm_scripts.extract import run_demo
 from esm import pretrained, FastaBatchedDataset
 
-# from transformers import EsmTokenizer, EsmModel
-
 
 # Load the model
 model = Blip2ProteinMistral(config=FAPMConfig(), esm_size='3b')
 model.load_checkpoint("model/checkpoint_mf2.pth")
 model.to('cuda')
 
+model_esm, alphabet = pretrained.load_model_and_alphabet('esm2_t36_3B_UR50D')
+model_esm.to('cuda')
+model_esm.eval()
+
 
 @spaces.GPU
 def generate_caption(protein, prompt):
+    # Process the image and the prompt
+    # with open('/home/user/app/example.fasta', 'w') as f:
+    #     f.write('>{}\n'.format("protein_name"))
+    #     f.write('{}\n'.format(protein.strip()))
+    # os.system("python esm_scripts/extract.py esm2_t36_3B_UR50D /home/user/app/example.fasta /home/user/app --repr_layers 36 --truncation_seq_length 1024 --include per_tok")
+    # esm_emb = run_demo(protein_name='protein_name', protein_seq=protein,
+    #                    model=model_esm, alphabet=alphabet,
+    #                    include='per_tok', repr_layers=[36], truncation_seq_length=1024)
+
+    protein_name = 'protein_name'
+    protein_seq = protein
+    include = 'per_tok'
+    repr_layers = [36]
+    truncation_seq_length = 1024
+    toks_per_batch = 4096
+    print("start")
+    dataset = FastaBatchedDataset([protein_name], [protein_seq])
+    print("dataset prepared")
+    batches = dataset.get_batch_indices(toks_per_batch, extra_toks_per_seq=1)
+    print("batches prepared")
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset, collate_fn=alphabet.get_batch_converter(truncation_seq_length), batch_sampler=batches
+    )
+    print(f"Read sequences")
+    return_contacts = "contacts" in include
+
+    assert all(-(model_esm.num_layers + 1) <= i <= model_esm.num_layers for i in repr_layers)
+    repr_layers = [(i + model_esm.num_layers + 1) % (model_esm.num_layers + 1) for i in repr_layers]
 
-    esm_emb = torch.load('data/emb_esm2_3b/P18281.pt')['representations'][36]
-    torch.save(esm_emb, 'data/emb_esm2_3b/example.pt')
+    with torch.no_grad():
+        for batch_idx, (labels, strs, toks) in enumerate(data_loader):
+            print(
+                f"Processing {batch_idx + 1} of {len(batches)} batches ({toks.size(0)} sequences)"
+            )
+            if torch.cuda.is_available():
+                toks = toks.to(device="cuda", non_blocking=True)
+            out = model_esm(toks, repr_layers=repr_layers, return_contacts=return_contacts)
+            representations = {
+                layer: t.to(device="cpu") for layer, t in out["representations"].items()
+            }
+            if return_contacts:
+                contacts = out["contacts"].to(device="cpu")
+            for i, label in enumerate(labels):
+                result = {"label": label}
+                truncate_len = min(truncation_seq_length, len(strs[i]))
+                # Call clone on tensors to ensure tensors are not views into a larger representation
+                # See https://github.com/pytorch/pytorch/issues/1995
+                if "per_tok" in include:
+                    result["representations"] = {
+                        layer: t[i, 1: truncate_len + 1].clone()
+                        for layer, t in representations.items()
+                    }
+                if "mean" in include:
+                    result["mean_representations"] = {
+                        layer: t[i, 1: truncate_len + 1].mean(0).clone()
+                        for layer, t in representations.items()
+                    }
+                if "bos" in include:
+                    result["bos_representations"] = {
+                        layer: t[i, 0].clone() for layer, t in representations.items()
+                    }
+                if return_contacts:
+                    result["contacts"] = contacts[i, : truncate_len, : truncate_len].clone()
+    esm_emb = result['representations'][36]
     '''
     inputs = tokenizer([protein], return_tensors="pt", padding=True, truncation=True).to('cuda')
     with torch.no_grad():
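
The diff replaces a precomputed embedding loaded from data/emb_esm2_3b/P18281.pt with on-the-fly extraction: esm2_t36_3B_UR50D is loaded once at module import, the single input sequence is wrapped in a FastaBatchedDataset, the model is run with repr_layers=[36], and the BOS/EOS positions are sliced off to get per-residue embeddings. The sketch below is not part of the commit; it shows the same per-token extraction for one sequence using fair-esm's batch converter directly, without the DataLoader machinery. The example sequence is a made-up placeholder, and it runs on CPU unless you move the model and tokens to CUDA as the app does.

# Minimal sketch (illustrative, not from the commit): single-sequence ESM-2
# embedding extraction equivalent to the loop the diff adds to generate_caption.
import torch
from esm import pretrained

model_esm, alphabet = pretrained.load_model_and_alphabet('esm2_t36_3B_UR50D')
model_esm.eval()

batch_converter = alphabet.get_batch_converter()
# Placeholder label and sequence; the app uses the user-supplied protein string.
labels, strs, toks = batch_converter([('protein_name', 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ')])

with torch.no_grad():
    out = model_esm(toks, repr_layers=[36], return_contacts=False)

# Positions 0 and len+1 are BOS/EOS tokens; keep one embedding per residue,
# matching the 'per_tok' slicing in the diff.
esm_emb = out['representations'][36][0, 1:len(strs[0]) + 1]
print(esm_emb.shape)  # torch.Size([sequence_length, embedding_dim])

Loading the 3B ESM-2 checkpoint at module level, as the diff does, pays the load cost once at startup; only the per-sequence forward pass runs inside the @spaces.GPU-decorated function on each request.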