# Spaces: Running
import torch | |
from Bio.PDB import PDBParser | |
def read_pdb(file_path):
    """Parse the PDB file at ``file_path`` and return the parsed structure.

    A fresh ``PDBParser`` is created with ``QUIET=True`` for every call, and
    the structure is always registered under the fixed id ``'protein'``.

    Args:
        file_path: Location of the PDB file, passed straight to
            ``PDBParser.get_structure``.

    Returns:
        The structure object produced by ``PDBParser.get_structure``.
    """
    return PDBParser(QUIET=True).get_structure('protein', file_path)
def extract_sequence(structure):
    """Return the one-letter amino-acid string for all residues of ``structure``.

    The structure is walked model by model and chain by chain, in iteration
    order, and every residue contributes exactly one character. The letters of
    all models and chains are concatenated without any separator.

    Args:
        structure: Iterable of models; each model is an iterable of chains and
            each chain an iterable of residues exposing ``get_resname()``.

    Returns:
        A string with one character per residue. Residues whose name is not
        one of the 20 standard three-letter codes listed below are written
        as ``'X'``.
    """
    code_by_resname = {
        'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
        'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
        'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
        'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
    }
    letters = []
    for mdl in structure:
        for chn in mdl:
            # Anything outside the table (non-standard names) falls back to 'X'.
            letters.extend(
                code_by_resname.get(res.get_resname(), 'X') for res in chn
            )
    return ''.join(letters)
def get_features(sequence, tokenizer, model, modal='sequence'):
    """Tokenize ``sequence``, run ``model`` without gradients, return one output.

    Args:
        sequence: Input passed to ``tokenizer`` as its first argument.
        tokenizer: Callable invoked as
            ``tokenizer(sequence, return_tensors="pt", add_special_tokens=False)``;
            its result is unpacked as keyword arguments into ``model``.
        model: Callable whose result exposes ``last_hidden_state`` and/or
            ``pooler_output``.
        modal: ``'sequence'`` selects ``last_hidden_state``; any other value
            selects ``pooler_output``.

    Returns:
        Element ``[0]`` of the selected output attribute (presumably the
        single batch entry -- confirm against the model in use).
    """
    # The two modes differ only in which output attribute is read.
    output_field = 'last_hidden_state' if modal == 'sequence' else 'pooler_output'

    encoded = tokenizer(sequence, return_tensors="pt", add_special_tokens=False)
    with torch.no_grad():
        result = model(**encoded)
    return getattr(result, output_field)[0]
def get_emb_dim(esm_version):
    """Return the embedding width of the ESM checkpoint named in ``esm_version``.

    Matching is by substring, so both a bare checkpoint name and a longer
    identifier that contains it (for example a hub path) are accepted.

    Args:
        esm_version: String containing an ESM checkpoint name.

    Returns:
        The embedding dimension as an ``int``, or ``None`` when no known
        checkpoint name occurs in ``esm_version``.
    """
    # Widths follow the published ESM model table; none of these names is a
    # substring of another, so the lookup order does not affect the result.
    known_dims = (
        ('esm2_t6_8M_UR50D', 320),
        ('esm2_t12_35M_UR50D', 480),
        ('esm2_t30_150M_UR50D', 640),
        ('esm2_t33_650M_UR50D', 1280),
        ('esm2_t36_3B_UR50D', 2560),
        ('esm2_t48_15B_UR50D', 5120),
        ('esm1b_t33_650M_UR50S', 1280),
    )
    for checkpoint, dim in known_dims:
        if checkpoint in esm_version:
            return dim
    # Unknown checkpoints keep the historical "no answer" result.
    return None
def _format_runs(ordered_values):
    """Collapse strictly increasing integers into ``"a-b"`` / ``"a"`` strings."""
    runs = []
    if not ordered_values:
        return runs
    start = end = ordered_values[0]
    for value in ordered_values[1:]:
        if value == end + 1:
            end = value
            continue
        runs.append(f"{start}-{end}" if start != end else str(start))
        start = end = value
    # Flush the run that was still open when the input ended.
    runs.append(f"{start}-{end}" if start != end else str(start))
    return runs


def merge_ranges(data, max_value=200):
    """Summarise integer lists as consecutive-run strings, plus the leftovers.

    For every key of ``data`` the associated integers are sorted, de-duplicated
    and collapsed so that each run of consecutive values becomes ``"start-end"``
    and each isolated value becomes ``"value"``. An additional key ``"b"`` holds
    the same summary for every integer in ``0..max_value`` (inclusive) that does
    not occur under any key.

    The input containers are not modified.

    Args:
        data: Mapping from key to an iterable of integers (may be empty).
        max_value: Upper bound, inclusive, of the range used to compute the
            unused values stored under ``"b"``.

    Returns:
        A dict with one list of range strings per input key and an extra
        ``"b"`` entry. A ``"b"`` key present in ``data`` is overwritten by
        the unused-values entry.
    """
    result = {}
    used_values = set()
    for key, values in data.items():
        # sorted(set(...)) leaves the caller's list untouched and removes
        # duplicates, which would otherwise yield overlapping entries such as
        # ["1", "1-2"] for [1, 1, 2].
        unique_sorted = sorted(set(values))
        used_values.update(unique_sorted)
        result[key] = _format_runs(unique_sorted)

    # Unused values within 0..max_value.
    remaining_values = sorted(set(range(max_value + 1)) - used_values)
    result["b"] = _format_runs(remaining_values)
    return result