|
# cython: embedsignature=True |
|
# cython: profile=True |
|
############################################################################### |
|
############################################################################### |
|
# Cython wrapper for FASTA/FASTQ files based on htslib
|
############################################################################### |
|
# The principal classes defined in this module are: |
|
# |
|
# class FastaFile   random read access to faidx indexed files
|
# class FastxFile streamed read/write access to fasta/fastq files |
|
# |
|
# Additionally this module defines several additional classes that are part |
|
# of the internal API. These are: |
|
# |
|
# class FastqProxy |
|
# class FastxRecord |
|
# |
|
# For backwards compatibility, the following classes are also defined: |
|
# |
|
# class Fastafile equivalent to FastaFile |
|
# class FastqFile equivalent to FastxFile |
|
# |
|
############################################################################### |
|
# |
|
# The MIT License |
|
# |
|
# Copyright (c) 2015 Andreas Heger |
|
# |
|
# Permission is hereby granted, free of charge, to any person obtaining a |
|
# copy of this software and associated documentation files (the "Software"), |
|
# to deal in the Software without restriction, including without limitation |
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|
# and/or sell copies of the Software, and to permit persons to whom the |
|
# Software is furnished to do so, subject to the following conditions: |
|
# |
|
# The above copyright notice and this permission notice shall be included in |
|
# all copies or substantial portions of the Software. |
|
# |
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
|
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|
# DEALINGS IN THE SOFTWARE. |
|
# |
|
############################################################################### |
|
import sys |
|
import os |
|
import re |
|
|
|
|
|
from libc.errno cimport errno |
|
from libc.string cimport strerror |
|
|
|
from cpython cimport array |
|
|
|
from cpython cimport PyErr_SetString, \ |
|
PyBytes_Check, \ |
|
PyUnicode_Check, \ |
|
PyBytes_FromStringAndSize |
|
|
|
from pysam.libchtslib cimport \ |
|
faidx_nseq, fai_load, fai_load3, fai_destroy, fai_fetch, \ |
|
faidx_seq_len, faidx_iseq, faidx_seq_len, \ |
|
faidx_fetch_seq, hisremote, \ |
|
bgzf_open, bgzf_close |
|
|
|
from pysam.libcutils cimport force_bytes, force_str, charptr_to_str |
|
from pysam.libcutils cimport encode_filename, from_string_and_size |
|
from pysam.libcutils cimport qualitystring_to_array, parse_region |
|
|
|
cdef class FastqProxy

cdef makeFastqProxy(kseq_t * src):
    '''Return a new FastqProxy that refers to the kseq_t record *src*.

    Only the pointer is stored; the record itself is not copied.
    '''
    # FastqProxy.__init__ always raises, so the instance is created via
    # __new__, which does not run __init__.
    cdef FastqProxy proxy = FastqProxy.__new__(FastqProxy)
    proxy._delegate = src
    return proxy
|
|
|
## TODO: |
|
## add automatic indexing. |
|
## add function to get sequence names. |
|
cdef class FastaFile:
    """Random access to fasta formatted files that
    have been indexed by :term:`faidx`.

    The file is automatically opened. The index file of file
    ``<filename>`` is expected to be called ``<filename>.fai``.

    Parameters
    ----------

    filename : string
        Filename of fasta file to be opened.

    filepath_index : string
        Optional, filename of the index. By default this is
        the filename + ".fai".

    filepath_index_compressed : string
        Optional, filename of the compression index if the fasta file
        is compressed. By default this is the filename + ".gzi".
        Only used when `filepath_index` is given as well.

    Raises
    ------

    IOError
        if the fasta file or an explicitly given index file does not
        exist, or if the file could not be opened

    """

    def __cinit__(self, *args, **kwargs):
        self.fastafile = NULL
        self._filename = None
        self._references = None
        self._lengths = None
        self.reference2length = None
        self._open(*args, **kwargs)

    def is_open(self):
        '''return true if the fasta file has been opened.'''
        return self.fastafile != NULL

    def __len__(self):
        '''return the number of sequences in the index.

        Raises ValueError if the file has been closed.
        '''
        if self.fastafile == NULL:
            raise ValueError("calling len() on closed file")

        return faidx_nseq(self.fastafile)

    def _open(self, filename, filepath_index=None, filepath_index_compressed=None):
        '''open an indexed fasta file.

        This method expects an indexed fasta file. A previously opened
        file is closed first. After loading, the sequence names and
        lengths found in the index are cached on the object.

        Raises
        ------

        IOError
            if `filename` (unless it is ``-`` or remote) or one of the
            explicitly given index files does not exist, or if htslib
            fails to load the file.
        '''

        # close a previously opened file
        if self.fastafile != NULL:
            self.close()

        self._filename = encode_filename(filename)
        cdef char *cfilename = self._filename
        cdef char *cindexname = NULL
        cdef char *cindexname_compressed = NULL
        self.is_remote = hisremote(cfilename)

        # open file for reading
        if (self._filename != b"-"
                and not self.is_remote
                and not os.path.exists(filename)):
            raise IOError("file `%s` not found" % filename)

        # 3 modes to open:
        # compressed fa: fai_load3 with filename, index_fai and index_gzi
        # uncompressed fa: fai_load3 with filename and index_fai
        # uncompressed fa: fai_load with default index name
        if filepath_index:
            # when opening, set flags to 0 - do not automatically
            # build index if it does not exist.
            if not os.path.exists(filepath_index):
                raise IOError("filename {} does not exist".format(filepath_index))
            # the bytes objects are bound to local names so that they
            # stay alive while the char pointers are in use
            bindex_filename = encode_filename(filepath_index)
            cindexname = bindex_filename

            if filepath_index_compressed:
                if not os.path.exists(filepath_index_compressed):
                    raise IOError("filename {} does not exist".format(filepath_index_compressed))
                bindex_filename_compressed = encode_filename(filepath_index_compressed)
                cindexname_compressed = bindex_filename_compressed

            # cindexname_compressed is still NULL for an uncompressed file
            with nogil:
                self.fastafile = fai_load3(cfilename, cindexname, cindexname_compressed, 0)
        else:
            with nogil:
                self.fastafile = fai_load(cfilename)

        if self.fastafile == NULL:
            raise IOError("error when opening file `%s`" % filename)

        cdef int nreferences = faidx_nseq(self.fastafile)
        cdef int x
        cdef const char * s
        self._references = []
        self._lengths = []
        for x in range(nreferences):
            s = faidx_iseq(self.fastafile, x)
            self._references.append(force_str(s))
            self._lengths.append(faidx_seq_len(self.fastafile, s))
        self.reference2length = dict(zip(self._references, self._lengths))

    def close(self):
        """close the file."""
        if self.fastafile != NULL:
            fai_destroy(self.fastafile)
            self.fastafile = NULL

    def __dealloc__(self):
        if self.fastafile != NULL:
            fai_destroy(self.fastafile)
            self.fastafile = NULL

    # context manager interface
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # do not suppress exceptions raised inside the with-block
        return False

    property closed:
        """bool indicating the current state of the file object.
        This is a read-only attribute; the close() method changes the value.
        """
        def __get__(self):
            return not self.is_open()

    property filename:
        """filename associated with this object. This is a read-only attribute."""
        def __get__(self):
            return self._filename

    property references:
        '''list with the names of :term:`reference` sequences.'''
        def __get__(self):
            return self._references

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.
        This is a read-only attribute."""
        def __get__(self):
            # test against None rather than truthiness so that a file
            # without any sequences reports 0
            if self._references is None:
                return None
            return len(self._references)

    property lengths:
        """list with the lengths of :term:`reference` sequences."""
        def __get__(self):
            return self._lengths

    def fetch(self,
              reference=None,
              start=None,
              end=None,
              region=None):
        """fetch sequences in a :term:`region`.

        A region can
        either be specified by :term:`reference`, `start` and
        `end`. `start` and `end` denote 0-based, half-open
        intervals.

        Alternatively, a samtools :term:`region` string can be
        supplied.

        If any of the coordinates are missing they will be replaced by the
        minimum (`start`) or maximum (`end`) coordinate.

        Note that region strings are 1-based, while `start` and `end` denote
        an interval in python coordinates.

        Returns
        -------

        string : a string with the sequence specified by the region.
            An empty string is returned if the region is empty or
            starts beyond the end of the sequence.

        Raises
        ------

        IndexError
            if the coordinates are out of range

        ValueError
            if the region is invalid, no region was supplied, the file
            is closed or the sequence could not be retrieved

        KeyError
            if the sequence is not present in the file

        IOError
            if retrieval failed and errno is set

        """

        if not self.is_open():
            raise ValueError("I/O operation on closed file")

        cdef int length
        cdef char *seq
        cdef char *ref
        cdef int rstart, rend

        contig, rstart, rend = parse_region(reference, start, end, region)

        if contig is None:
            raise ValueError("no sequence/region supplied.")

        if rstart == rend:
            return ""

        # contig_b must stay referenced while ``ref`` points into it
        contig_b = force_bytes(contig)
        ref = contig_b
        with nogil:
            length = faidx_seq_len(self.fastafile, ref)
        if length == -1:
            raise KeyError("sequence '%s' not present" % contig)
        if rstart >= length:
            return ""

        # raises IOError/ValueError if no sequence could be retrieved
        seq = self._fetch(ref, rstart, rend, &length)

        # the returned buffer has to be released by the caller
        try:
            return charptr_to_str(seq)
        finally:
            free(seq)

    cdef char *_fetch(self, char *reference, int start, int end, int *length) except? NULL:
        '''fetch sequence for reference, start and end

        `start` and `end` denote a half-open interval; the closed end
        coordinate ``end - 1`` is handed to faidx_fetch_seq. The length
        reported by htslib is stored in `length`. The returned buffer
        must be released by the caller with free().

        Raises IOError if the fetch failed and errno is set, ValueError
        otherwise.
        '''

        cdef char *seq
        with nogil:
            seq = faidx_fetch_seq(self.fastafile,
                                  reference,
                                  start,
                                  end-1,
                                  length)

        if not seq:
            if errno:
                raise IOError(errno, strerror(errno))
            else:
                # convert explicitly: formatting the char pointer directly
                # would put a bytes repr into the message
                raise ValueError("failure when retrieving sequence on '%s'" %
                                 force_str(reference))

        return seq

    def get_reference_length(self, reference):
        '''return the length of reference.

        Raises KeyError if `reference` is not in the file.
        '''
        return self.reference2length[reference]

    def __getitem__(self, reference):
        '''return the complete sequence of `reference`.'''
        return self.fetch(reference)

    def __contains__(self, reference):
        '''return true if reference in fasta file.'''
        return reference in self.reference2length
|
|
|
|
|
cdef class FastqProxy:
    """A single entry in a fastq file.

    Instances are created by the module itself and read their values
    from the record stored in ``_delegate``; they can not be
    instantiated directly.
    """
    def __init__(self):
        raise ValueError("do not instantiate FastqProxy directly")

    property name:
        """The name of each entry in the fastq file."""
        def __get__(self):
            return charptr_to_str(self._delegate.name.s)

    property sequence:
        """The sequence of each entry in the fastq file."""
        def __get__(self):
            return charptr_to_str(self._delegate.seq.s)

    property comment:
        """The comment of the entry, or None if the comment is empty."""
        def __get__(self):
            if not self._delegate.comment.l:
                return None
            return charptr_to_str(self._delegate.comment.s)

    property quality:
        """The quality score of each entry in the fastq file, represented as a string."""
        def __get__(self):
            if not self._delegate.qual.l:
                return None
            return charptr_to_str(self._delegate.qual.s)

    cdef cython.str to_string(self):
        """Format the entry as fasta (no quality) or fastq (with quality) text."""
        # the comment, if any, follows the name separated by a blank
        suffix = "" if self.comment is None else " %s" % self.comment

        if self.quality is not None:
            return "@%s%s\n%s\n+\n%s" % (self.name, suffix,
                                         self.sequence, self.quality)
        return ">%s%s\n%s" % (self.name, suffix, self.sequence)

    cdef cython.str tostring(self):
        """deprecated : use :meth:`to_string`"""
        return self.to_string()

    def __str__(self):
        return self.to_string()

    cpdef array.array get_quality_array(self, int offset=33):
        '''return quality values as integer array after subtracting offset.

        Returns None if the entry has no quality string.
        '''
        if self.quality is not None:
            return qualitystring_to_array(force_bytes(self.quality),
                                          offset=offset)
        return None
|
|
|
cdef class FastxRecord:
    """A fasta/fastq record.

    A record must contain a name and a sequence. If either of them are
    None, a ValueError is raised on writing.

    The values are either given explicitly or, if `proxy` is supplied,
    taken from that :class:`FastqProxy`.

    """
    def __init__(self,
                 name=None,
                 comment=None,
                 sequence=None,
                 quality=None,
                 FastqProxy proxy=None):
        if proxy is None:
            self.comment = comment
            self.quality = quality
            self.sequence = sequence
            self.name = name
        else:
            # a proxy takes precedence over the individual arguments
            self.comment = proxy.comment
            self.quality = proxy.quality
            self.sequence = proxy.sequence
            self.name = proxy.name

    def __copy__(self):
        return FastxRecord(name=self.name,
                           comment=self.comment,
                           sequence=self.sequence,
                           quality=self.quality)

    def __deepcopy__(self, memo):
        return FastxRecord(name=self.name,
                           comment=self.comment,
                           sequence=self.sequence,
                           quality=self.quality)

    cdef cython.str to_string(self):
        """Format the record as fasta (no quality) or fastq (with quality) text.

        Raises ValueError if the name or the sequence is None.
        """
        if self.name is None:
            raise ValueError("can not write record without name")

        if self.sequence is None:
            raise ValueError("can not write record without a sequence")

        # the comment, if any, follows the name separated by a blank
        suffix = "" if self.comment is None else " %s" % self.comment

        if self.quality is not None:
            return "@%s%s\n%s\n+\n%s" % (self.name, suffix,
                                         self.sequence, self.quality)
        return ">%s%s\n%s" % (self.name, suffix, self.sequence)

    cdef cython.str tostring(self):
        """deprecated : use :meth:`to_string`"""
        return self.to_string()

    def set_name(self, name):
        """set the name of this record; None is rejected with a ValueError."""
        if name is None:
            raise ValueError("FastxRecord must have a name and not None")
        self.name = name

    def set_comment(self, comment):
        """set the comment of this record."""
        self.comment = comment

    def set_sequence(self, sequence, quality=None):
        """set sequence of this record.

        If `quality` is given it must have the same length as
        `sequence`, otherwise a ValueError is raised. Without
        `quality` any existing quality string is removed.

        """
        self.sequence = sequence
        if quality is None:
            self.quality = None
            return

        if len(sequence) != len(quality):
            raise ValueError("sequence and quality length do not match: {} vs {}".format(
                len(sequence), len(quality)))

        self.quality = quality

    def __str__(self):
        return self.to_string()

    cpdef array.array get_quality_array(self, int offset=33):
        '''return quality values as array after subtracting offset.

        Returns None if the record has no quality string.
        '''
        if self.quality is not None:
            return qualitystring_to_array(force_bytes(self.quality),
                                          offset=offset)
        return None
|
|
|
|
|
cdef class FastxFile:
    r"""Stream access to :term:`fasta` or :term:`fastq` formatted files.

    The file is automatically opened.

    Entries in the file can be both fastq or fasta formatted or even a
    mixture of the two.

    This file object permits iterating over all entries in the
    file. Random access is not implemented. The iteration returns
    objects of type :class:`FastxRecord` or, if `persist` is False,
    of type :class:`FastqProxy`.

    Parameters
    ----------

    filename : string
        Filename of fasta/fastq file to be opened.

    persist : bool

        If True (default) make a copy of the entry in the file during
        iteration. If set to False, no copy will be made. This will
        permit much faster iteration, but an entry will not persist
        when the iteration continues and an entry is read-only.

    Notes
    -----
    Prior to version 0.8.2, this class was called FastqFile.

    Raises
    ------

    IOError
        if file could not be opened


    Examples
    --------
    >>> with pysam.FastxFile(filename) as fh:
    ...    for entry in fh:
    ...        print(entry.name)
    ...        print(entry.sequence)
    ...        print(entry.comment)
    ...        print(entry.quality)
    >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout:
    ...    for entry in fin:
    ...        fout.write(str(entry) + '\n')

    """
    def __cinit__(self, *args, **kwargs):
        # self.fastqfile = <gzFile*>NULL
        self._filename = None
        self.entry = NULL
        self._open(*args, **kwargs)

    def is_open(self):
        '''return true if the fasta/fastq file has been opened.'''
        return self.entry != NULL

    def _open(self, filename, persist=True):
        '''open a fastq/fasta file in *filename*

        Parameters
        ----------

        persist : bool

            if True return a copy of the underlying data (default
            True).  The copy will persist even if the iteration
            on the file continues.

        Raises
        ------

        IOError
            if the file does not exist or could not be opened

        '''
        if self.fastqfile != NULL:
            self.close()

        self._filename = encode_filename(filename)
        cdef char *cfilename = self._filename
        self.is_remote = hisremote(cfilename)

        # open file for reading
        if (self._filename != b"-"
                and not self.is_remote
                and not os.path.exists(filename)):
            raise IOError("file `%s` not found" % filename)

        self.persist = persist

        with nogil:
            self.fastqfile = bgzf_open(cfilename, "r")

        # bgzf_open signals failure with a NULL handle; do not hand
        # that on to kseq_init
        if self.fastqfile == NULL:
            raise IOError("error when opening file `%s`" % filename)

        self.entry = kseq_init(self.fastqfile)
        # the filename property reports the name as given by the caller
        self._filename = filename

    def close(self):
        '''close the file.'''
        if self.fastqfile != NULL:
            bgzf_close(self.fastqfile)
            self.fastqfile = NULL
        if self.entry != NULL:
            kseq_destroy(self.entry)
            self.entry = NULL

    def __dealloc__(self):
        if self.fastqfile != NULL:
            bgzf_close(self.fastqfile)
        if self.entry:
            kseq_destroy(self.entry)

    # context manager interface
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # do not suppress exceptions raised inside the with-block
        return False

    property closed:
        """bool indicating the current state of the file object.
        This is a read-only attribute; the close() method changes the value.
        """
        def __get__(self):
            return not self.is_open()

    property filename:
        """string with the filename associated with this object."""
        def __get__(self):
            return self._filename

    def __iter__(self):
        if not self.is_open():
            raise ValueError("I/O operation on closed file")
        return self

    cdef kseq_t * getCurrent(self):
        '''return the pointer to the current record.'''
        return self.entry

    cdef int cnext(self):
        '''C version of iterator

        Returns the result of kseq_read unchanged.
        '''
        with nogil:
            return kseq_read(self.entry)

    def __next__(self):
        """
        python version of next().

        Returns a :class:`FastxRecord` if `persist` is set, otherwise a
        :class:`FastqProxy`.

        Raises ValueError if the file is closed or an entry could not
        be parsed.
        """
        cdef int l

        # an iterator may outlive close(); kseq_read must not be
        # called without a record
        if self.entry == NULL:
            raise ValueError("I/O operation on closed file")

        with nogil:
            l = kseq_read(self.entry)
        if (l >= 0):
            if self.persist:
                return FastxRecord(proxy=makeFastqProxy(self.entry))
            return makeFastqProxy(self.entry)
        elif (l == -1):
            raise StopIteration
        elif (l == -2):
            raise ValueError('truncated quality string in {0}'
                             .format(self._filename))
        else:
            raise ValueError('unknown problem parsing {0}'
                             .format(self._filename))
|
|
|
# Compatibility Layer for pysam 0.8.1 |
|
cdef class FastqFile(FastxFile):
    """FastqFile is deprecated: use FastxFile instead

    This subclass adds nothing to :class:`FastxFile`; it only keeps the
    old class name available.
    """
|
|
|
# Compatibility Layer for pysam < 0.8 |
|
cdef class Fastafile(FastaFile):
    """Fastafile is deprecated: use FastaFile instead

    This subclass adds nothing to :class:`FastaFile`; it only keeps the
    old class name available.
    """
|
|
|
# Names exported by a star-import of this module.
__all__ = [
    "FastaFile",
    "FastqFile",
    "FastxFile",
    "Fastafile",
    "FastxRecord",
    "FastqProxy",
]
|
|