File size: 1,483 Bytes
4321e7e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import os
import random
import argparse
import glob
import pandas as pd
import multiprocessing as mp
from foldseek_util import get_struc_seq
def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) lets argparse read
            ``sys.argv[1:]``, which is what the function did before this
            parameter existed.

    Returns:
        argparse.Namespace with three attributes: ``pdb_dir`` (str),
        ``num_processes`` (int) and ``output_dir`` (str).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pdb_dir",
        type=str,
        default="./pdb_files",
        help="Directory containing PDB files.",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=2,
        help="Number of processes to use for multiprocessing. Default is 2.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./data",
        help="Output directory.",
    )
    # Passing None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
def get_foldseek_seq(pdb_path, foldseek_bin="bin/foldseek", chain="A"):
    """Return what ``get_struc_seq`` reports for one chain of a PDB file.

    Args:
        pdb_path: Path of the PDB file to process.
        foldseek_bin: Path of the Foldseek executable passed to
            ``get_struc_seq``. Defaults to ``"bin/foldseek"``, the value that
            used to be hard-coded.
        chain: Identifier of the chain to extract. Defaults to ``"A"``, the
            value that used to be hard-coded.

    Returns:
        The entry that ``get_struc_seq`` stores under ``chain``. The script's
        entry point unpacks each such entry into three values
        (aa, foldseek, aa_foldseek).

    Raises:
        KeyError: If the result of ``get_struc_seq`` has no entry for
            ``chain``. The message names the chain and the file so a failure
            inside a worker pool can be traced back to its input.
    """
    seqs_by_chain = get_struc_seq(
        foldseek_bin,
        pdb_path,
        [chain],
        # NOTE(review): a random id per call, as before; presumably it keeps
        # the scratch files of concurrent workers apart -- confirm against
        # foldseek_util.get_struc_seq.
        process_id=random.randint(0, 10000000),
    )
    try:
        return seqs_by_chain[chain]
    except KeyError:
        raise KeyError(
            f"chain {chain!r} not found in result for {pdb_path!r}"
        ) from None
def build_result_frame(pdb_files, sequences):
    """Assemble the per-file result table written by this script.

    Args:
        pdb_files: Paths of the processed PDB files.
        sequences: One ``(aa, foldseek, aa_foldseek)`` triple per entry of
            ``pdb_files``, in the same order.

    Returns:
        pandas.DataFrame with the columns ``file``, ``aa``, ``foldseek`` and
        ``aa_foldseek`` (in that order), one row per input file.
    """
    frame = pd.DataFrame(
        list(sequences),
        columns=["aa", "foldseek", "aa_foldseek"],
    )
    frame.insert(0, "file", list(pdb_files))
    return frame


def main():
    """Run the extraction over every ``*.pdb`` file and write the CSV.

    Reads the options via ``parse_args``, maps ``get_foldseek_seq`` over the
    PDB files found in ``pdb_dir`` using a process pool, and writes
    ``foldseek_result.csv`` into ``output_dir``.

    Raises:
        SystemExit: If ``pdb_dir`` contains no ``*.pdb`` file.
    """
    config = parse_args()

    # Sorted so the row order of the CSV is reproducible; glob itself returns
    # paths in arbitrary order.
    pdb_files = sorted(glob.glob(os.path.join(config.pdb_dir, "*.pdb")))
    if not pdb_files:
        # Without this guard the empty result cannot be unpacked into three
        # columns and the script dies with an unrelated-looking ValueError.
        raise SystemExit(f"No .pdb files found in {config.pdb_dir!r}")

    # Create the destination before the expensive work so an unwritable or
    # missing output directory fails fast instead of after processing.
    os.makedirs(config.output_dir, exist_ok=True)

    with mp.Pool(config.num_processes) as pool:
        output = pool.map(get_foldseek_seq, pdb_files)

    df = build_result_frame(pdb_files, output)
    df.to_csv(os.path.join(config.output_dir, "foldseek_result.csv"), index=False)


if __name__ == "__main__":
    main()
|