"""Fetch amino-acid sequences from UniProt for IDs listed in a CSV or Excel file."""

import argparse
import multiprocessing as mp
import os
from functools import partial
from io import StringIO

import numpy as np
import pandas as pd
import requests as r
from Bio import SeqIO


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--file_path",
        type=str,
        required=True,
        help="Path to the file that has a column containing UniProt ID information.",
    )
    parser.add_argument(
        "--sheet_name",
        type=str,
        default="Sheet1",
        help="Name of the sheet to read. Default is Sheet1.",
    )
    parser.add_argument(
        "--uniprotid_column",
        type=str,
        help="Name of the column that holds a single UniProt ID per row. Default is None.",
    )
    parser.add_argument(
        "--uniprotids_column",
        type=str,
        help=(
            "Name of the column that holds multiple UniProt IDs per row. Default is None. "
            "The IDs are expected to be separated by semicolons, and the first ID is used."
        ),
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=2,
        help="Number of processes to use.",
    )
    return parser.parse_args()
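
# Example invocation (hypothetical script, file, and column names):
#   python fetch_uniprot_sequences.py --file_path proteins.xlsx \
#       --sheet_name Sheet1 --uniprotids_column "Protein IDs" --num_processes 4
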
def fetch_sequence(row, cfg):
    """Download the FASTA record for one UniProt ID and return its sequence, or None on failure."""
    try:
        # UniProt serves the FASTA record for an accession at <base_url><id>.fasta.
        base_url = "https://www.uniprot.org/uniprot/"
        uniprot_id = row[cfg.uniprotid_column]
        url = base_url + uniprot_id + ".fasta"
        response = r.get(url, timeout=30)
        response.raise_for_status()
        records = list(SeqIO.parse(StringIO(response.text), "fasta"))
        return str(records[0].seq)
    except Exception:
        # Network errors, unknown IDs, and empty FASTA responses all map to None.
        return None
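
# A quick single-row check (hypothetical ID and Namespace), e.g. in an interactive session:
#   cfg = argparse.Namespace(uniprotid_column="uniprotid")
#   print(fetch_sequence(pd.Series({"uniprotid": "P69905"}), cfg))
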
def process_rows(df_chunk, cfg):
    # Fetch a sequence for every row in this chunk, preserving row order.
    return [fetch_sequence(row, cfg) for _, row in df_chunk.iterrows()]
if __name__ == "__main__":
    config = parse_args()

    if config.file_path.endswith((".xls", ".xlsx")):
        df = pd.read_excel(
            config.file_path,
            sheet_name=config.sheet_name,
        )
    else:
        df = pd.read_csv(config.file_path)

    if config.uniprotid_column is None and config.uniprotids_column is None:
        raise ValueError(
            "Either uniprotid_column or uniprotids_column should be provided."
        )

    if config.uniprotids_column is not None:
        # Keep rows that have at least one ID, then take the first ID and strip any isoform suffix.
        df = df.dropna(subset=[config.uniprotids_column]).reset_index(drop=True)
        df["uniprotid"] = df[config.uniprotids_column].apply(
            lambda x: x.split(";")[0].split("-")[0]
        )
        config.uniprotid_column = "uniprotid"

    # Split the rows into one chunk per worker; partial() gives each worker the shared
    # config while keeping the callable picklable for pool.map.
    df_split = np.array_split(df, config.num_processes)
    with mp.Pool(processes=config.num_processes) as pool:
        results = pool.map(partial(process_rows, cfg=config), df_split)

    # Flatten the per-chunk results back into one list aligned with the original rows.
    aas = [seq for result in results for seq in result]
    df["aa"] = aas
    df.to_csv(f"{os.path.splitext(config.file_path)[0]}_with_aa.csv", index=False)
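
# Expected shape of the data (hypothetical example): an input "proteins.xlsx" with a
# "Protein IDs" column such as "P69905;P69905-2" yields "proteins_with_aa.csv", i.e. the
# original columns plus an "aa" column holding each fetched sequence (empty on failure).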