"""Helpers for converting SMILES strings and structure files for downstream use.

Provides SMILES extraction from free text, SMILES -> SDF conversion via RDKit,
and SEQRES sequence extraction from PDB files via Biopython.
"""

import logging
import re

from Bio import SeqIO
from rdkit import Chem
from rdkit.Chem import MolFromSmiles, SDWriter

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Default location process_smiles() writes to when no path is supplied.
_DEFAULT_SDF_PATH = "/tmp/output.sdf"

# Candidate SMILES: one arbitrary character other than "J", followed by at
# least six characters drawn from a set of element letters, digits and SMILES
# punctuation.
# NOTE(review): the leading ``[^J]`` matches *any* character except "J", so the
# captured text can begin with a character that is not part of the SMILES
# (e.g. the ":" in "smiles:CCCCCCO"). Only whitespace is cleaned up by
# extract_smiles(); confirm the intended meaning before tightening the pattern.
_SMILES_PATTERN = re.compile(
    r"([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})"
)


def process_smiles(smiles: str, output_path: str = _DEFAULT_SDF_PATH) -> str:
    """Parse *smiles* with RDKit and write the molecule to an SDF file.

    Args:
        smiles: SMILES string to convert.
        output_path: Destination SDF file. Defaults to ``/tmp/output.sdf``;
            pass a distinct path when several conversions must not overwrite
            each other's output.

    Returns:
        The path of the SDF file that was written.

    Raises:
        ValueError: If RDKit cannot parse *smiles*.
    """
    mol = MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles}")

    writer = SDWriter(output_path)
    try:
        writer.write(mol)
    finally:
        # Always release the file handle, even if writing fails.
        writer.close()
    return output_path


def process_pdb(file_path: str) -> str:
    """Return the SEQRES sequences of a PDB file joined by single spaces.

    Args:
        file_path: Path to the PDB file to read.

    Returns:
        The sequence of every ``pdb-seqres`` record, in file order, separated
        by a space. An empty string if the parser yields no records.
    """
    with open(file_path, "r") as handle:
        return " ".join(
            str(record.seq) for record in SeqIO.parse(handle, "pdb-seqres")
        )


def process_sdf(file_path: str) -> str:
    """Return *file_path* unchanged; SDF input needs no conversion here."""
    return file_path


def extract_smiles(text: str) -> str:
    """Return the first SMILES-like substring found in *text*.

    The match is purely pattern based; use :func:`is_valid_smiles` to check
    that the result is chemically parseable.

    Args:
        text: Free text to search.

    Returns:
        The first candidate with surrounding whitespace removed, or ``""``
        when nothing in *text* matches the pattern.
    """
    match = _SMILES_PATTERN.search(text)
    if match is None:
        return ""
    # The pattern's leading wildcard may have captured the whitespace that
    # precedes the candidate; drop it so callers get a clean string.
    return match.group(1).strip()


def is_valid_smiles(smiles: str) -> bool:
    """Return ``True`` if RDKit can parse *smiles* into a molecule."""
    return MolFromSmiles(smiles) is not None


def extract_and_convert_to_sdf(text: str) -> str:
    """Extract the first SMILES candidate from *text* and write it as SDF.

    Args:
        text: Free text expected to contain a SMILES string.

    Returns:
        The path of the SDF file produced by :func:`process_smiles`.

    Raises:
        ValueError: If no candidate is found or the candidate is not a valid
            SMILES string.
    """
    smiles = extract_smiles(text)
    if smiles and is_valid_smiles(smiles):
        return process_smiles(smiles)
    raise ValueError("No valid SMILES string found in the text.")