Spaces:

geored
/

gtmio

Running

File size: 20,604 Bytes

fe41391

#!/usr/bin/env python
#####################################################################################
# rust_codon, Produces RUST metagene profile of codons
# Copyright (C) 2015 Patrick O'Connor

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#####################################################################################


import os, re, pysam, sys, math, argparse
from RUST.methods import *

try:
    import matplotlib as mpl

    mpl.use("Agg")
    import matplotlib.pyplot as plt
    from pylab import MaxNLocator
except:
    pass


def RUST_metagene_plot(infileopen36, ax36):
    infileopen36.seek(0)
    infileopen36.readline()
    while 1:
        line = infileopen36.readline()
        linesplit = line.split(",")
        if len(linesplit) == 1:
            break
        codon = linesplit[0]
        coverage = list(map(float, linesplit[1:]))
        coverage_a = coverage[0]
        if coverage_a == 0:
            continue
        coverage_n = [n / coverage_a for n in coverage[1:]]
        log2_values = [math.log(n, 2) if n != 0 else float("-inf") for n in coverage_n]
        ax36.plot(log2_values, color="gray")

    line = infileopen36.readline()
    linesplit = line.split(",")
    if "NA" not in line:
        coverage = map(float, linesplit[2:])
        ax2 = ax36.twinx()
        ax2.plot(coverage, color="blue")
        for tl in ax2.get_yticklabels():
            tl.set_color("blue")
            tl.set_rotation(0)

        ax2.yaxis.set_major_locator(MaxNLocator(3))
        ax2.set_ylim(0, 1.0)
        ax2.set_ylim(-2, 1.0)
        ax2.set_yticks([0, 1], minor=False)
        ax2.set_yticklabels(["0", "1"])
        ax2.set_ylabel("Kullback-Leibler divergence", color="blue")

    ax36.set_xticks([5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55])
    ax36.set_xticklabels([-35, -30, -25, -20, -15, -10, -5, 0, 5, 10, 15])
    ax36.set_xlabel("distance from A-site [codon]")
    ax36.set_ylabel("Codon RUST ratio (observed/expected), log2")
    ax36.axvline(40, color="red")


def A_site_plot(infileopen35, dict_codon75, axis_Asite53, loc2):
    codon_to_amino_dict = {}
    amino_to_codons_dict = {}
    for amino_acid, codons in dict_codon75.items():
        for codon in codons:
            codon_to_amino_dict[codon] = amino_acid
            amino_to_codons_dict.setdefault(amino_acid, []).append(codon)

    list1 = []
    list2 = []
    infileopen35.seek(0)
    infileopen35.readline()
    dict_amino_value = {}
    for line in infileopen35:
        linesplit = line.split(",")
        if len(linesplit) == 1:
            break
        codon = linesplit[0]
        if codon in ["TAA", "TAG", "TGA"]:
            continue
        list1.append(linesplit[0])
        coverage = list(map(float, linesplit[1:]))
        coverage_a = coverage[0]
        coverage_n = [n / coverage_a for n in coverage[1:]]
        list2.append(float(coverage_n[loc2]))

        amino = codon_to_amino_dict[linesplit[0]]
        if amino in dict_amino_value:
            dict_amino_value[codon_to_amino_dict[linesplit[0]]].append(
                float(coverage_n[loc2])
            )
        else:
            dict_amino_value[codon_to_amino_dict[linesplit[0]]] = [
                float(coverage_n[loc2])
            ]

    list_amino_sorted = []
    for key, value in dict_amino_value.items():
        list_amino_sorted.append((mean_value(value), key))
    list_amino_sorted.sort()

    A_site_value_norm = [n / min(list2) for n in list2]
    list3 = list(zip(A_site_value_norm, list1))
    list3.sort()
    A_site_value_norm_dict = {}
    for tupel in list3:
        A_site_value_norm_dict[tupel[1]] = tupel[0]

    used_codons = []
    xloc = []
    xtick_label = []
    n1 = 0

    for _, amino_acid in list_amino_sorted:
        if amino_acid in used_codons:
            continue
        used_codons.append(amino_acid)
        n1 += 1  # len(dict_list_codon[amino_acid])

        xloc.append(n1)
        for amino_acid_codon in amino_to_codons_dict[amino_acid]:
            axis_Asite53.scatter(
                n1,
                A_site_value_norm_dict[amino_acid_codon],
                color="gray",
                s=50,
                edgecolor="gray",
            )
        xtick_label.append(amino_acid)

    axis_Asite53.set_xticks(xloc)
    axis_Asite53.set_xticklabels(xtick_label, rotation=90)
    for tick in axis_Asite53.get_xticklabels():
        if tick.get_text() in ["Phe", "Tyr", "Trp"]:
            a2 = tick.set_backgroundcolor("lightgreen")  # (dict(facecolor = "red"))
            # tick.set_color("white")
        if tick.get_text() in ["Val", "Ala", "Leu", "Met", "Ile"]:
            tick.set_backgroundcolor("lightgrey")
            # tick.set_color("white")
        if tick.get_text() in ["Ser", "Asn", "Thr", "Gln"]:
            tick.set_backgroundcolor("ForestGreen")
            tick.set_color("white")

        if tick.get_text() in ["His", "Lys", "Arg"]:
            tick.set_backgroundcolor("blue")
            tick.set_color("white")
        if tick.get_text() in ["Glu", "Asp"]:
            tick.set_backgroundcolor("red")
            tick.set_color("white")
    axis_Asite53.set_xlim(0, n1 + 1)
    axis_Asite53.set_ylabel("A-site codon RUST ratio")

    red = mpl.patches.Rectangle((0, 0), 1, 1, fc="r")
    blue = mpl.patches.Rectangle((0, 0), 1, 1, fc="b")
    fgreen = mpl.patches.Rectangle((0, 0), 1, 1, fc="ForestGreen")
    lgreen = mpl.patches.Rectangle((0, 0), 1, 1, fc="lightGreen")
    grey = mpl.patches.Rectangle((0, 0), 1, 1, fc="lightgrey")

    axis_Asite53.legend(
        [red, grey, lgreen, blue, fgreen],
        ["acidic", "aliphatic", "aromatic", "basic", "polar\nuncharged"],
        bbox_to_anchor=(0, 0, 0.8, 1.12),
        ncol=3,
    )


def main(args):

    universal_code = {
        "Ala": ["GCT", "GCC", "GCG", "GCA"],
        "Gly": ["GGT", "GGC", "GGG", "GGA"],
        "Pro": ["CCT", "CCC", "CCG", "CCA"],
        "Thr": ["ACT", "ACC", "ACG", "ACA"],
        "Val": ["GTT", "GTC", "GTG", "GTA"],
        "Ser": ["TCT", "TCC", "TCG", "TCA", "AGT", "AGC"],
        "Arg": ["CGT", "CGC", "CGG", "CGA", "AGG", "AGA"],
        "Leu": ["CTT", "CTC", "CTG", "CTA", "TTG", "TTA"],
        "Phe": ["TTT", "TTC"],
        "Asn": ["AAT", "AAC"],
        "Lys": ["AAG", "AAA"],
        "Asp": ["GAT", "GAC"],
        "Glu": ["GAG", "GAA"],
        "His": ["CAT", "CAC"],
        "Gln": ["CAG", "CAA"],
        "Ile": ["ATT", "ATC", "ATA"],
        "Met": ["ATG"],
        "Tyr": ["TAT", "TAC"],
        "Cys": ["TGT", "TGC"],
        "Trp": ["TGG"],
        "Stop": ["TGA", "TAG", "TAA"],
    }

    mRNA_sequences = args.transcriptome  # path to fastq file of transcripts
    in_seq_handle = open(mRNA_sequences)
    cds_start_dict = {}
    cds_end_dict = {}
    seq_dict = {}
    for line in in_seq_handle:
        if line[0] != ">":
            seq_dict.setdefault(transcript, "")
            seq_dict[transcript] += line[:-1]
            continue
        try:
            transcript_split = line[:-1].split("\t")
            transcript = transcript_split[0][1:]
            cds_start_dict[transcript] = int(transcript_split[1])
            cds_end_dict[transcript] = int(transcript_split[2])
        except:
            pass
    in_seq_handle.close()

    offset = args.offset
    readlen_range = args.lengths

    readlen_rangesplit = readlen_range.split(":")
    if len(readlen_rangesplit) == 1:
        accepted_read_lengths = [int(readlen_rangesplit[0])]
        length_values = "%s" % int(readlen_rangesplit[0])
    elif len(readlen_rangesplit) == 2:
        accepted_read_lengths = [
            readlen
            for readlen in range(
                int(readlen_rangesplit[0]), int(readlen_rangesplit[1]) + 1
            )
        ]
        length_values = "%s_%s" % (
            int(readlen_rangesplit[0]),
            int(readlen_rangesplit[1]),
        )
    else:
        stop_err(
            "Lengths of footprints parameter not in correct format, it should be either colon seperated with the second value greater or equal to the first, (28:32) or a single interger (31)"
        )
    if len(accepted_read_lengths) == 0:
        stop_err(
            "Lengths of footprints parameter not in correct format, it should be either colon seperated with the second value greater or equal to the first, (28:32) or a single interger (31)"
        )

    nts = ["A", "G", "C", "T"]
    aligments_A1 = pysam.Samfile(
        args.alignment, "rb"
    )  # path to aligments in bam format

    codon_enrichment_dict = {}
    codon_enrichment_expected_dict = {}
    for nt in nts:
        for nt2 in nts:
            for nt3 in nts:
                codon = "%s%s%s" % (nt, nt2, nt3)
                codon_enrichment_dict[codon] = {}
                codon_enrichment_expected_dict[codon] = []
                for number in range(0, 60, 1):
                    codon_enrichment_dict[codon][number] = [0.0, 0.0]

    list_transcripts = seq_dict.keys()
    number_transcripts = 0
    list_10_percentile = []
    for value in range(1, 10):
        list_10_percentile.append((len(list_transcripts) * value) / 10)
    for transcript in list_transcripts:
        number_transcripts += 1
        if number_transcripts in list_10_percentile:
            sys.stdout.write(
                "%s percent\n"
                % ((list_10_percentile.index(number_transcripts) + 1) * 10)
            )

        try:  # use supplied CDS annotation
            cds_start = cds_start_dict[transcript]
            cds_end = cds_end_dict[transcript]
            if cds_end < cds_start:
                raise Exception
        except Exception:  # find longest ORF
            transcript_seq = seq_dict[transcript]
            cds_start = -1
            start_post = []
            end_post = []
            for match in re.finditer(r"(?=(%s))" % re.escape("ATG"), transcript_seq):
                start_post.append(match.start())
            for match in re.finditer(r"(?=(%s))" % re.escape("TAG"), transcript_seq):
                end_post.append(match.start())
            for match in re.finditer(r"(?=(%s))" % re.escape("TAA"), transcript_seq):
                end_post.append(match.start())
            for match in re.finditer(r"(?=(%s))" % re.escape("TGA"), transcript_seq):
                end_post.append(match.start())

            end_post.sort()
            len_max_orf = 0
            for value in start_post:
                for value2 in end_post:
                    if value < value2:
                        if value % 3 == value2 % 3:
                            len_orf = value2 - value
                            if len_orf > len_max_orf:
                                cds_start = value
                                cds_end = value2 + 3
                                len_max_orf = len_orf
                            break
            if cds_start == -1:
                # sys.stdout.write( '%s, AUG codon not found\n'%transcript  )
                continue

        elongation_region_all = seq_dict[transcript][cds_start:cds_end]
        elongation_region_part = elongation_region_all[
            120:-60
        ]  # first 120 and last 60 nt are not used
        # peptide_sequence        = elongation_region_all.translate()

        if len(elongation_region_part) % 3 != 0:
            # sys.stdout.write( '%s, CDS not divisible by 3\n'%transcript  )
            continue

        profile_list = [
            0.0 for n in range(cds_start + 120, cds_end - 60)
        ]  # records ribo-seq profile
        if len(profile_list) < 50:
            # sys.stdout.write( '%s, ORF too short\n'%transcript  )
            continue
        all_reads = aligments_A1.fetch(transcript)

        len_elongation_region = len(profile_list)
        for read in all_reads:
            readlen = read.qlen
            if readlen not in accepted_read_lengths:
                continue  # selection of read of acceptable length
            A_site = read.pos + offset - cds_start - 120  # addition of offset
            if len_elongation_region > A_site > -1:
                profile_list[A_site] += 1

        average_gene_density = float(sum(profile_list)) / len(
            profile_list
        )  # average gene density calculated
        if average_gene_density != 0:
            num_codon = len(
                [
                    1
                    for number88 in range(0, len(profile_list), 3)
                    if (
                        (
                            profile_list[number88]
                            + profile_list[number88 + 1]
                            + profile_list[number88 + 2]
                        )
                        / 3
                    )
                    > average_gene_density
                ]
            )
            # number of codons that exceed average gene density
            expected_codon_density = float(num_codon) / (
                len(profile_list) / 3
            )  # expected enrichment value

            codon_start = 0
            for sliding_w_n in range(
                0, len(elongation_region_part), 3
            ):  # sliding window using increments of 3 nts
                codon_window = str(
                    elongation_region_all[codon_start : codon_start + 180]
                )  # 60 codon window,
                if len(set(codon_window) - set(["A", "T", "G", "C"])) != 0:
                    codon_start += 3
                    continue

                if (
                    profile_list[sliding_w_n]
                    + profile_list[sliding_w_n + 1]
                    + profile_list[sliding_w_n + 2]
                ) / 3 > average_gene_density:
                    for number in range(0, 60):
                        codon = codon_window[number * 3 : (number + 1) * 3]
                        codon_enrichment_dict[codon][number][0] += 1
                        codon_enrichment_dict[codon][number][1] += 1
                else:
                    for number in range(0, 60):
                        codon = codon_window[number * 3 : (number + 1) * 3]
                        codon_enrichment_dict[codon][number][0] += 1

                codon = codon_window[120:123]  # corresponds to A-site codon
                codon_enrichment_expected_dict[codon].append(expected_codon_density)
                codon_start += 3

    if not os.path.exists(args.Path):
        os.mkdir(args.Path)
    alignment_filename = args.alignment.split("/")[-1]
    outfile = open(
        "%s/RUST_codon_file_%s_%s_%s"
        % (args.Path, alignment_filename, args.offset, length_values),
        "w",
    )
    outfile.write("codon, expected value")
    for number106 in range(-40, 20):
        outfile.write(", %s" % number106)
    outfile.write("\n")

    list_codons = []
    codons = list(codon_enrichment_dict)
    codons.sort()
    rust_expected = []
    rust_observed_metafootprint = []
    for codon in codons:
        if codon in list_codons:
            continue
        if codon in ["TAA", "TGA", "TAG"]:
            continue
        list_codons.append(codon)
        outfile.write("%s" % codon)
        if codon_enrichment_expected_dict[codon] != []:
            outfile.write(", %s" % mean_value(codon_enrichment_expected_dict[codon]))
        list_data = []
        for number in range(0, 60):
            if codon_enrichment_dict[codon][number][0] != 0:
                outfile.write(
                    ", %s"
                    % (
                        codon_enrichment_dict[codon][number][1]
                        / codon_enrichment_dict[codon][number][0]
                    )
                )
                list_data.append(
                    codon_enrichment_dict[codon][number][1]
                    / codon_enrichment_dict[codon][number][0]
                )
            else:
                outfile.write(", 0")
                list_data.append(0)
        outfile.write("\n")
        rust_expected.append(mean_value(codon_enrichment_expected_dict[codon]))
        rust_observed_metafootprint.append(list_data)

    rust_expected_sum = sum(rust_expected)
    q_values = [n / rust_expected_sum for n in rust_expected]

    shannon_values = []
    for loc_i in range(60):
        rust_observed = [n[loc_i] for n in rust_observed_metafootprint]
        rust_observed_sum = sum(rust_observed)
        rust_observed_min = min(rust_observed)
        if rust_observed_min == 0:
            shannon_values.append("NA")
        else:
            p_values = [n / rust_observed_sum for n in rust_observed]
            shannon = []
            list_normalised = []  ####
            for p_value, q_value in zip(p_values, q_values):
                shannon.append(abs(p_value * math.log((p_value / q_value), 2)))
                list_normalised.append(p_value / q_value)  ####
            shannon_values.append(sum(shannon))

    outfile.write("\nKullback Leibler divergence,")
    for value in shannon_values:
        outfile.write(", %s" % value)
    outfile.close()

    try:
        mpl.rcParams["xtick.direction"] = "out"
        mpl.rcParams["ytick.direction"] = "out"
        mpl.rcParams["legend.fontsize"] = 10
        mpl.rcParams["ytick.labelsize"] = 10
        mpl.rcParams["xtick.labelsize"] = 10
        mpl.rcParams["font.size"] = 10
        mpl.rcParams["axes.titlesize"] = 10
        mpl.rcParams["legend.frameon"] = 0
        mpl.rcParams["axes.axisbelow"] = False
        mpl.rcParams["xtick.major.pad"] = 2.0
        mpl.rcParams["ytick.major.pad"] = 2
        mpl.rcParams["xtick.major.size"] = 2.0
        mpl.rcParams["ytick.major.size"] = 2
        mpl.rcParams["axes.linewidth"] = 0.5
        mpl.rcParams["ytick.major.width"] = 0.25
        mpl.rcParams["xtick.major.width"] = 0.25
        mpl.rcParams["lines.linewidth"] = 1
        mpl.rcParams["legend.borderpad"] = 0.01
        mpl.rcParams["legend.labelspacing"] = 0.05
        mpl.rcParams["legend.columnspacing"] = 0.5
        mpl.rcParams["legend.borderaxespad"] = 0.15
        mpl.rcParams["legend.handlelength"] = 1

        fig = plt.figure(figsize=(6.69, 6.0))
        infileopen = open(
            "%s/RUST_codon_file_%s_%s_%s"
            % (args.Path, alignment_filename, args.offset, length_values)
        )
        ax1_metafootprint = fig.add_subplot(111)
        RUST_metagene_plot(infileopen, ax1_metafootprint)
        plt.savefig(
            "%s/RUST_codon_metafootprint_%s_%s_%s.png"
            % (args.Path, alignment_filename, args.offset, length_values)
        )
        plt.clf()

        infileopen = open(
            "%s/RUST_codon_file_%s_%s_%s"
            % (args.Path, alignment_filename, args.offset, length_values)
        )
        ax1codon_Asite = fig.add_subplot(111)
        A_site_plot(infileopen, universal_code, ax1codon_Asite, 40)
        plt.savefig(
            "%s/A_site_%s_%s_%s.png"
            % (args.Path, alignment_filename, args.offset, length_values)
        )

    except:
        sys.stdout.write("Error producing images\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Produces RUST metagene profile of codons"
    )
    parser.add_argument("--version", action="version", version="%(prog)s 1.2")

    parser.add_argument(
        "-t",
        "--transcriptome",
        help="fasta file of transcripts, CDS start and end may be provided on description line using tab separation e.g. >NM_0001  10  5000, otherwise it searches for longest ORF"
        ", required=True",
    )
    parser.add_argument(
        "-a",
        "--alignment",
        help="sorted bam file of transcriptome alignments",
        required=True,
    )
    parser.add_argument("-o", "--offset", help="nucleotide offset to A-site", type=int)
    parser.add_argument(
        "-l",
        "--lengths",
        help="lengths of footprints included, for example 28:32 is 28,29,30,31,32",
    )
    parser.add_argument(
        "-P", "--Path", help='path to outputfile, default is "codon"', default="codon"
    )
    args = parser.parse_args(None)

    main(args)