Spaces:

geored
/

gtmio

Running

File size: 13,238 Bytes

fe41391

#!/usr/bin/python
#####################################################################################
# rust_predict_profiles, Correlation between observed and predicted profiles from CDS start + 120 to CDS stop - 60
# Copyright (C) 2015 Patrick O'Connor

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#####################################################################################

import os, pysam, sys, numpy, argparse, re
from RUST.methods import *


def rank(lsit1):
    lsit2 = []
    lsit1s = lsit1[:]
    lsit1s.sort()
    dict_ranks = {}
    for value_i, value in enumerate(lsit1s):
        dict_ranks.setdefault(value, []).append(value_i)

    for value in lsit1:
        lsit2.append(mean_value(dict_ranks[value]))
    return lsit2


def main(args):

    RUST_file = open(args.rustfile)  # file output of RUST_script.py
    RUST_file.readline()
    codon_rust_dict = {}
    for line in RUST_file:
        linesplit = line.split(",")
        if len(linesplit) == 1:
            break
        codon = linesplit[0]
        if len(codon) != 3 or len(set(codon) - set(["A", "T", "G", "C"])) != 0:
            stop_err("Codon metafootprint file not correct, check input file")
        codon_rust_dict[codon] = {}
        rust_values = list(map(float, linesplit[1:]))
        expected = rust_values[0]
        rust_metafootprint = [ro_value / expected for ro_value in rust_values[1:]]
        for n in range(34, 46):
            codon_rust_dict[codon][n - 40] = rust_metafootprint[
                n
            ]  # for 12 codons positions near A-site
    RUST_file.close()

    mRNA_sequences = args.transcriptome  # path to fastq file of transcripts
    in_seq_handle = open(mRNA_sequences)
    cds_start_dict = {}
    cds_end_dict = {}
    seq_dict = {}
    for line in in_seq_handle:
        if line[0] != ">":
            seq_dict.setdefault(transcript, "")
            seq_dict[transcript] += line[:-1]
            continue
        try:
            transcript_split = line[:-1].split("\t")
            transcript = transcript_split[0][1:]
            cds_start_dict[transcript] = int(transcript_split[1])
            cds_end_dict[transcript] = int(transcript_split[2])
        except:
            pass
    in_seq_handle.close()

    offset = args.offset
    readlen_range = args.lengths
    readlen_rangesplit = readlen_range.split(":")
    if len(readlen_rangesplit) == 1:
        accepted_read_lengths = [int(readlen_rangesplit[0])]
        length_values = "%s" % int(readlen_rangesplit[0])
    elif len(readlen_rangesplit) == 2:
        accepted_read_lengths = [
            readlen
            for readlen in range(
                int(readlen_rangesplit[0]), int(readlen_rangesplit[1]) + 1
            )
        ]
        length_values = "%s_%s" % (
            int(readlen_rangesplit[0]),
            int(readlen_rangesplit[1]),
        )
    else:
        stop_err(
            "Lengths of footprints parameter not in correct format, it should be either colon seperated with the second value greater or equal to the first, (28:32) or a single interger (31)"
        )
    if len(accepted_read_lengths) == 0:
        stop_err(
            "Lengths of footprints parameter not in correct format, it should be either colon seperated with the second value greater or equal to the first, (28:32) or a single interger (31)"
        )

    amino_acids = [
        "A",
        "C",
        "E",
        "D",
        "G",
        "F",
        "I",
        "H",
        "K",
        "M",
        "L",
        "N",
        "Q",
        "P",
        "S",
        "R",
        "T",
        "W",
        "V",
        "Y",
    ]
    aligments_A1 = pysam.Samfile(args.alignment, "rb")

    if not os.path.exists(args.Path):
        os.mkdir(args.Path)
    if args.profiles:
        if not os.path.exists("%s/rust_profile_predictions" % args.Path):
            os.mkdir("%s/rust_profile_predictions" % args.Path)

    if "/" in args.rustfile:
        rustfile_split = args.rustfile.split("/")[-1]
    # elif "\\" in args.rustfile:
    # rustfile_split= args.rustfile.split("\\")[-1]
    else:
        rustfile_split = args.rustfile

    if "RUST_codon_file_" in rustfile_split:
        alignment_filename = rustfile_split[16:]
    else:
        alignment_filename = rustfile_split

    correlations_file = open(
        "%s/predict_profiles_%s_%s_%s"
        % (args.Path, alignment_filename, args.offset, length_values),
        "w",
    )
    correlations_file.write(
        "transcript,average read density,Spearman's coefficient,Pearson's coefficient\n"
    )

    list_transcripts = seq_dict.keys()
    number_transcripts = 0
    list_10_percentile = []
    for value in range(1, 10):
        list_10_percentile.append((len(list_transcripts) * value) / 10)
    for transcript in list_transcripts:
        number_transcripts += 1
        if number_transcripts in list_10_percentile:
            sys.stdout.write(
                "%s percent\n"
                % ((list_10_percentile.index(number_transcripts) + 1) * 10)
            )

        try:
            cds_start = cds_start_dict[transcript]
            cds_end = cds_end_dict[transcript]
            if cds_end < cds_start:
                raise Exception
        except Exception:
            transcript_seq = seq_dict[transcript]
            cds_start = -1
            start_post = []
            end_post = []
            for match in re.finditer(r"(?=(%s))" % re.escape("ATG"), transcript_seq):
                start_post.append(match.start())
            for match in re.finditer(r"(?=(%s))" % re.escape("TAG"), transcript_seq):
                end_post.append(match.start())
            for match in re.finditer(r"(?=(%s))" % re.escape("TAA"), transcript_seq):
                end_post.append(match.start())
            for match in re.finditer(r"(?=(%s))" % re.escape("TGA"), transcript_seq):
                end_post.append(match.start())

            end_post.sort()
            len_max_orf = 0
            for value in start_post:
                for value2 in end_post:
                    if value < value2:
                        if value % 3 == value2 % 3:
                            len_orf = value2 - value
                            if len_orf > len_max_orf:
                                cds_start = value
                                cds_end = value2 + 3
                                len_max_orf = len_orf
                            break
            if cds_start == -1:
                # sys.stdout.write( '%s, AUG codon not found\n'%transcript  )
                continue

        elongation_region_all = seq_dict[transcript][cds_start:cds_end]

        if (
            len(elongation_region_all) % 3 != 0
        ):  # genes with codon region not divisible by 3 skipped
            # sys.stdout.write( '%s, CDS not divisible by 3\n'%transcript  )
            continue

        profile_expect = []
        for n in range(
            0, len(elongation_region_all[120:-60]), 3
        ):  # predicts profile from 120 nts after start to 60 before stop
            minus6_plus5_footprint = elongation_region_all[
                120 + n - 18 : 120 + n + 19
            ]  # contains sequence of region used to predict profile
            value = 1.0
            amino_loc = -6
            for number in range(0, len(minus6_plus5_footprint) - 2, 3):
                codon = minus6_plus5_footprint[number : number + 3]
                if len(set(codon) - set(["A", "T", "G", "C"])) != 0 or codon in [
                    "TAG",
                    "TGA",
                    "TAA",
                ]:
                    amino_loc += 1
                    continue
                value = value * codon_rust_dict[codon][amino_loc]
                amino_loc += 1
            profile_expect.append(value)
        profile_expect_sum = sum(profile_expect)
        profile_expect_probablility = [
            float(value) / profile_expect_sum for value in profile_expect
        ]

        profile_list = [
            0.0 for n in range(cds_start + 120, cds_end - 60)
        ]  # records ribo-seq profile
        if len(profile_list) < 50:
            # sys.stdout.write( '%s, ORF too short\n'%transcript  )
            continue
        all_reads = aligments_A1.fetch(transcript)

        len_elongation_region = len(profile_list)
        for read in all_reads:
            readlen = read.qlen
            if readlen not in accepted_read_lengths:
                continue  # selection of read of acceptable length
            A_site = read.pos + offset - cds_start - 120  # addition of offset
            if len_elongation_region > A_site > -1:
                profile_list[A_site] += 1

        average_gene_density = float(sum(profile_list)) / len(
            profile_list
        )  # average gene density calculated
        if average_gene_density > 0:
            profiles_control_codon = [
                profile_list[codon_ind]
                + profile_list[codon_ind + 1]
                + profile_list[codon_ind + 2]
                for codon_ind in range(0, len(profile_list), 3)
            ]
            spearmanr_value = numpy.corrcoef(
                rank(profiles_control_codon), rank(profile_expect)
            )[0, 1]
            pearsonr_value = numpy.corrcoef(profiles_control_codon, profile_expect)[
                0, 1
            ]
            correlations_file.write(
                "%s,%s,%s,%s\n"
                % (transcript, average_gene_density, spearmanr_value, pearsonr_value)
            )
            if args.profiles:
                open_file = open(
                    "%s/rust_profile_predictions/observed_predicted_%s_%s_%s_%s.csv"
                    % (
                        args.Path,
                        transcript,
                        alignment_filename,
                        args.offset,
                        length_values,
                    ),
                    "w",
                )
                profile_expect_probablility_index = 0
                open_file.write("%s\n" % transcript)
                open_file.write("codon, predicted probability, alignments\n")
                for coordinate_index in range(
                    0, len(elongation_region_all[120:-60]), 3
                ):
                    codon = elongation_region_all[
                        120 + coordinate_index : 120 + coordinate_index + 3
                    ]
                    open_file.write("%s, " % (codon))
                    open_file.write(
                        "%s, "
                        % (
                            profile_expect_probablility[
                                profile_expect_probablility_index
                            ]
                        )
                    )
                    open_file.write(
                        "%s\n"
                        % (profiles_control_codon[profile_expect_probablility_index])
                    )
                    profile_expect_probablility_index += 1
                open_file.close()
    correlations_file.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Correlation between observed and predicted profiles from CDS start + 120 to CDS stop - 60"
    )
    parser.add_argument(
        "-t",
        "--transcriptome",
        help="fasta file of transcripts, CDS start and end may be provided on description line using tab separation e.g. >NM_0001  10  5000, otherwise it searches for longest ORF"
        ", required=True",
    )
    parser.add_argument(
        "-a",
        "--alignment",
        help="sorted bam file of transcriptome alignments",
        required=True,
    )
    parser.add_argument("-o", "--offset", help="nucleotide offset to A-site", type=int)
    parser.add_argument(
        "-l",
        "--lengths",
        help="lengths of footprints included, for example 28:32 is 28,29,30,31,32",
    )
    parser.add_argument(
        "-P",
        "--Path",
        help='path to outputfile, default is "amino"',
        default="predict_profiles",
    )
    parser.add_argument("-r", "--rustfile", help="path to rust file produced by codon")
    parser.add_argument(
        "-o",
        metavar="outfile directory",
        help='path to outputfile, default is "predict_profiles"',
        default="predict_profiles",
    )
    parser.add_argument(
        "-p",
        action="store_true",
        help="writes all profiles in csv files, may produce >10,000 files",
        default=False,
    )
    parser.add_argument("--version", action="version", version="%(prog)s 1.2")
    args = parser.parse_args(None)
    main(args)