import os import re import json import shelve from Bio import PDB from typing import Optional, Tuple, List from dataclasses import dataclass, field @dataclass class EvalTask: in_path: str ref_path: str info: dict structure: str name: str method: str cdr: str ab_chains: List residue_first: Optional[Tuple] = None residue_last: Optional[Tuple] = None scores: dict = field(default_factory=dict) def get_gen_biopython_model(self): parser = PDB.PDBParser(QUIET=True) return parser.get_structure(self.in_path, self.in_path)[0] def get_ref_biopython_model(self): parser = PDB.PDBParser(QUIET=True) return parser.get_structure(self.ref_path, self.ref_path)[0] def save_to_db(self, db: shelve.Shelf): db[self.in_path] = self def to_report_dict(self): return { 'method': self.method, 'structure': self.structure, 'cdr': self.cdr, 'filename': os.path.basename(self.in_path), **self.scores } class TaskScanner: def __init__(self, root, postfix=None, db: Optional[shelve.Shelf]=None): super().__init__() self.root = root self.postfix = postfix self.visited = set() self.db = db if db is not None: for k in db.keys(): self.visited.add(k) def _get_metadata(self, fpath): json_path = os.path.join( os.path.dirname(os.path.dirname(fpath)), 'metadata.json' ) tag_name = os.path.basename(os.path.dirname(fpath)) method_name = os.path.basename( os.path.dirname(os.path.dirname(os.path.dirname(fpath))) ) try: antibody_chains = set() info = None with open(json_path, 'r') as f: metadata = json.load(f) for item in metadata['items']: if item['tag'] == tag_name: info = item antibody_chains.add(item['residue_first'][0]) if info is not None: info['antibody_chains'] = list(antibody_chains) info['structure'] = metadata['identifier'] info['method'] = method_name return info except (json.JSONDecodeError, FileNotFoundError) as e: return None def scan(self) -> List[EvalTask]: tasks = [] if self.postfix is None or not self.postfix: input_fname_pattern = '^\d+\.pdb$' ref_fname = 'REF1.pdb' else: input_fname_pattern = f'^\d+\_{self.postfix}\.pdb$' ref_fname = f'REF1_{self.postfix}.pdb' for parent, _, files in os.walk(self.root): for fname in files: fpath = os.path.join(parent, fname) if not re.match(input_fname_pattern, fname): continue if os.path.getsize(fpath) == 0: continue if fpath in self.visited: continue # Path to the reference structure ref_path = os.path.join(parent, ref_fname) if not os.path.exists(ref_path): continue # CDR information info = self._get_metadata(fpath) if info is None: continue tasks.append(EvalTask( in_path = fpath, ref_path = ref_path, info = info, structure = info['structure'], name = info['name'], method = info['method'], cdr = info['tag'], ab_chains = info['antibody_chains'], residue_first = info.get('residue_first', None), residue_last = info.get('residue_last', None), )) self.visited.add(fpath) return tasks