File size: 1,483 Bytes
4321e7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import random
import argparse
import glob
import pandas as pd
import multiprocessing as mp
from foldseek_util import get_struc_seq


def parse_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace: holds ``pdb_dir`` (str), ``num_processes`` (int)
        and ``output_dir`` (str), each falling back to its default when the
        flag is not given.
    """
    # (flag, type, default, help) for every supported option.
    option_specs = (
        ("--pdb_dir", str, "./pdb_files", "Directory containing PDB files."),
        (
            "--num_processes",
            int,
            2,
            "Number of processes to use for multiprocessing. Default is 2.",
        ),
        ("--output_dir", str, "./data", "Output directory."),
    )

    cli = argparse.ArgumentParser()
    for flag, value_type, fallback, description in option_specs:
        cli.add_argument(
            flag,
            type=value_type,
            default=fallback,
            help=description,
        )
    return cli.parse_args()


def get_foldseek_seq(pdb_path):
    """Return the ``get_struc_seq`` result for chain "A" of one PDB file.

    Args:
        pdb_path: Path of the PDB file passed on to ``get_struc_seq``.

    Returns:
        The value ``get_struc_seq`` stores under key ``"A"``. The script's
        main section unpacks it into three values (aa, foldseek,
        aa_foldseek).
    """
    # A fresh random id is passed as process_id on every call.
    # NOTE(review): presumably this keeps concurrent workers from sharing
    # scratch files -- confirm against foldseek_util.
    worker_tag = random.randint(0, 10000000)
    chain_ids = ["A"]
    per_chain = get_struc_seq(
        "bin/foldseek",
        pdb_path,
        chain_ids,
        process_id=worker_tag,
    )
    return per_chain["A"]


def build_result_frame(pdb_files, seq_records):
    """Assemble the per-file result table.

    Args:
        pdb_files: Paths of the processed PDB files, one per record.
        seq_records: One ``(aa, foldseek, aa_foldseek)`` triple per entry of
            ``pdb_files``, in the same order.

    Returns:
        pandas.DataFrame with the columns ``file``, ``aa``, ``foldseek`` and
        ``aa_foldseek``. With no records the frame is empty but still carries
        all four columns.
    """
    if seq_records:
        aa, foldseek, aa_foldseek = zip(*seq_records)
    else:
        # zip(*[]) yields nothing, so the three-way unpack would raise
        # ValueError; fall back to empty columns instead.
        aa, foldseek, aa_foldseek = (), (), ()

    return pd.DataFrame(
        {
            "file": list(pdb_files),
            "aa": list(aa),
            "foldseek": list(foldseek),
            "aa_foldseek": list(aa_foldseek),
        }
    )


if __name__ == "__main__":
    config = parse_args()

    # glob returns matches in arbitrary order; sort so that the CSV rows are
    # reproducible between runs and machines.
    pdb_files = sorted(glob.glob(os.path.join(config.pdb_dir, "*.pdb")))

    # pool.map keeps the input order, so output[i] belongs to pdb_files[i].
    with mp.Pool(config.num_processes) as pool:
        output = pool.map(get_foldseek_seq, pdb_files)

    df = build_result_frame(pdb_files, output)

    # Create the output directory if needed. An empty --output_dir means the
    # current directory, for which there is nothing to create.
    if config.output_dir:
        os.makedirs(config.output_dir, exist_ok=True)

    df.to_csv(os.path.join(config.output_dir, "foldseek_result.csv"), index=False)