geored's picture
Upload folder using huggingface_hub
fe41391 verified
raw
history blame
146 kB
# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
###############################################################################
###############################################################################
## Cython wrapper for htslib VCF/BCF reader/writer
###############################################################################
#
# NOTICE: This code is incomplete and preliminary. It offers a nearly
# complete Pythonic interface to VCF/BCF metadata and data with
# reading and writing capability. Documentation and a unit test suite
# are in the works. The code is best tested under Python 2, but
# should also work with Python 3. Please report any remaining
# str/bytes issues on the github site when using Python 3 and I'll
# fix them promptly.
#
# Here is a minimal example of how to use the API:
#
# $ cat bcfview.py
# import sys
# from pysam import VariantFile
#
# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
# bcf_out = VariantFile('-', 'w', header=bcf_in.header)
#
# for rec in bcf_in:
# bcf_out.write(rec)
#
# Performance is fairly close to that of bcftools view. Here is an example
# using some 1k Genomes data:
#
# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
# 1103799
#
# real 0m56.114s
# user 1m4.489s
# sys 0m3.102s
#
# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
# 1103800 # bcftools adds an extra header
#
# real 0m55.126s
# user 1m3.502s
# sys 0m3.459s
#
###############################################################################
#
# TODO list:
#
# * more genotype methods
# * unit test suite (perhaps py.test based)
# * documentation
# * pickle support
# * left/right locus normalization
# * fix reopen to re-use fd
#
###############################################################################
#
# The MIT License
#
# Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
from __future__ import division, print_function
import os
import sys
from libc.errno cimport errno, EPIPE
from libc.string cimport strcmp, strpbrk, strerror
from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
cimport cython
from cpython.object cimport PyObject
from cpython.ref cimport Py_INCREF
from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
from cpython.bytes cimport PyBytes_FromStringAndSize
from cpython.unicode cimport PyUnicode_DecodeUTF8
from pysam.libchtslib cimport HTSFile, hisremote
from pysam.utils import unquoted_str
__all__ = ['VariantFile',
'VariantHeader',
'VariantHeaderRecord',
'VariantHeaderRecords',
'VariantMetadata',
'VariantHeaderMetadata',
'VariantContig',
'VariantHeaderContigs',
'VariantHeaderSamples',
'VariantRecordFilter',
'VariantRecordFormat',
'VariantRecordInfo',
'VariantRecordSamples',
'VariantRecord',
'VariantRecordSample',
'BaseIndex',
'BCFIndex',
'TabixIndex',
'BaseIterator',
'BCFIterator',
'TabixIterator',
'VariantRecord']
########################################################################
########################################################################
## Constants
########################################################################
cdef int MAX_POS = (1 << 31) - 1
cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
########################################################################
########################################################################
## Python 3 compatibility functions
########################################################################
from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes
########################################################################
########################################################################
## Sentinel object
########################################################################
cdef object _nothing = object()
########################################################################
########################################################################
## VCF/BCF string intern system
########################################################################
cdef dict bcf_str_cache = {}
cdef inline bcf_str_cache_get_charptr(const char* s):
if s == NULL:
return None
cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
if pystr:
return <object>pystr
val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
PyDict_SetItemString(bcf_str_cache, s, val)
return val
########################################################################
########################################################################
## Genotype math
########################################################################
cdef int comb(int n, int k) except -1:
"""Return binomial coefficient: n choose k
>>> comb(5, 1)
5
>>> comb(5, 2)
10
>>> comb(2, 2)
1
>>> comb(100, 2)
4950
"""
if k > n:
return 0
elif k == n:
return 1
elif k > n // 2:
k = n - k
cdef d, result
d = result = n - k + 1
for i in range(2, k + 1):
d += 1
result *= d
result //= i
return result
cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
"""Return the count of genotypes expected for the given ploidy and number of alleles.
>>> bcf_geno_combinations(1, 2)
2
>>> bcf_geno_combinations(2, 2)
3
>>> bcf_geno_combinations(2, 3)
6
>>> bcf_geno_combinations(3, 2)
4
"""
return comb(alleles + ploidy - 1, ploidy)
########################################################################
########################################################################
## Low level type conversion helpers
########################################################################
cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0
cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
if sample < 0:
raise ValueError('genotype is only valid as a format field')
cdef int32_t *gt_arr = NULL
cdef int ngt = 0
ngt = bcf_get_genotypes(hdr, rec, &gt_arr, &ngt)
if ngt <= 0 or not gt_arr:
return 0
assert ngt % rec.n_sample == 0
cdef int max_ploidy = ngt // rec.n_sample
cdef int32_t *gt = gt_arr + sample * max_ploidy
cdef int ploidy = 0
while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
gt += 1
ploidy += 1
free(<void*>gt_arr)
return bcf_geno_combinations(ploidy, rec.n_allele)
cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
if not a:
return None
try:
return tuple(charptr_to_str(a[i]) for i in range(n))
finally:
if free_after and a:
free(a)
cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
cdef char *datac
cdef int8_t *data8
cdef int16_t *data16
cdef int32_t *data32
cdef float *dataf
cdef int i
cdef bytes b
if not data or n <= 0:
return None
if type == BCF_BT_CHAR:
datac = <char *>data
if not n:
value = ()
else:
# Check if at least one null terminator is present
if datac[n-1] == bcf_str_vector_end:
# If so, create a string up to the first null terminator
b = datac
else:
# Otherwise, copy the entire block
b = datac[:n]
value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
else:
value = []
if type == BCF_BT_INT8:
data8 = <int8_t *>data
for i in range(n):
if data8[i] == bcf_int8_vector_end:
break
value.append(data8[i] if data8[i] != bcf_int8_missing else None)
elif type == BCF_BT_INT16:
data16 = <int16_t *>data
for i in range(n):
if data16[i] == bcf_int16_vector_end:
break
value.append(data16[i] if data16[i] != bcf_int16_missing else None)
elif type == BCF_BT_INT32:
data32 = <int32_t *>data
for i in range(n):
if data32[i] == bcf_int32_vector_end:
break
value.append(data32[i] if data32[i] != bcf_int32_missing else None)
elif type == BCF_BT_FLOAT:
dataf = <float *>data
for i in range(n):
if bcf_float_is_vector_end(dataf[i]):
break
value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
else:
raise TypeError('unsupported info type code')
# FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
if not value:
if scalar:
value = None
elif count <= 0:
value = ()
else:
value = (None,)*count
elif scalar and len(value) == 1:
value = value[0]
else:
value = tuple(value)
return value
cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
cdef char *datac
cdef int8_t *data8
cdef int16_t *data16
cdef int32_t *data32
cdef float *dataf
cdef ssize_t i, value_count = len(values)
assert value_count <= n
if bt_type == BCF_BT_CHAR:
if not isinstance(values, (str, bytes)):
values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
value_count = len(values)
assert value_count <= n
datac = <char *>data
memcpy(datac, <char *>values, value_count)
for i in range(value_count, n):
datac[i] = 0
elif bt_type == BCF_BT_INT8:
datai8 = <int8_t *>data
for i in range(value_count):
val = values[i]
datai8[i] = val if val is not None else bcf_int8_missing
for i in range(value_count, n):
datai8[i] = bcf_int8_vector_end
elif bt_type == BCF_BT_INT16:
datai16 = <int16_t *>data
for i in range(value_count):
val = values[i]
datai16[i] = val if val is not None else bcf_int16_missing
for i in range(value_count, n):
datai16[i] = bcf_int16_vector_end
elif bt_type == BCF_BT_INT32:
datai32 = <int32_t *>data
for i in range(value_count):
val = values[i]
datai32[i] = val if val is not None else bcf_int32_missing
for i in range(value_count, n):
datai32[i] = bcf_int32_vector_end
elif bt_type == BCF_BT_FLOAT:
dataf = <float *>data
for i in range(value_count):
val = values[i]
if val is None:
bcf_float_set(dataf + i, bcf_float_missing)
else:
dataf[i] = val
for i in range(value_count, n):
bcf_float_set(dataf + i, bcf_float_vector_end)
else:
raise TypeError('unsupported type')
cdef bcf_empty_array(int type, ssize_t n, int vlen):
cdef char *datac
cdef int32_t *data32
cdef float *dataf
cdef int i
if n <= 0:
raise ValueError('Cannot create empty array')
if type == BCF_HT_STR:
value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
datac = <char *>value
for i in range(n):
datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
elif type == BCF_HT_INT:
value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
data32 = <int32_t *><char *>value
for i in range(n):
data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
elif type == BCF_HT_REAL:
value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
dataf = <float *><char *>value
for i in range(n):
bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
else:
raise TypeError('unsupported header type code')
return value
cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
void *dst_data, int dst_type, size_t dst_values,
int vlen):
"""copy data from src to dest where the size of the elements (src_type/dst_type) differ
as well as the number of elements (src_values/dst_values).
"""
cdef char *src_datac
cdef char *dst_datac
cdef int8_t *src_datai8
cdef int16_t *src_datai16
cdef int32_t *src_datai32
cdef int32_t *dst_datai
cdef float *src_dataf
cdef float *dst_dataf
cdef ssize_t src_size, dst_size, i, j
cdef int val
if src_values > dst_values:
raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
if src_type == dst_type == BCF_BT_CHAR:
src_datac = <char *>src_data
dst_datac = <char *>dst_data
memcpy(dst_datac, src_datac, src_values)
for i in range(src_values, dst_values):
dst_datac[i] = 0
elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
src_datai8 = <int8_t *>src_data
dst_datai = <int32_t *>dst_data
for i in range(src_values):
val = src_datai8[i]
if val == bcf_int8_missing:
val = bcf_int32_missing
elif val == bcf_int8_vector_end:
val = bcf_int32_vector_end
dst_datai[i] = val
for i in range(src_values, dst_values):
dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
src_datai16 = <int16_t *>src_data
dst_datai = <int32_t *>dst_data
for i in range(src_values):
val = src_datai16[i]
if val == bcf_int16_missing:
val = bcf_int32_missing
elif val == bcf_int16_vector_end:
val = bcf_int32_vector_end
dst_datai[i] = val
for i in range(src_values, dst_values):
dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
src_datai32 = <int32_t *>src_data
dst_datai = <int32_t *>dst_data
for i in range(src_values):
dst_datai[i] = src_datai32[i]
for i in range(src_values, dst_values):
dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
src_dataf = <float *>src_data
dst_dataf = <float *>dst_data
for i in range(src_values):
dst_dataf[i] = src_dataf[i]
for i in range(src_values, dst_values):
bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
else:
raise TypeError('unsupported types')
cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
if record is None:
raise ValueError('record must not be None')
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf1_t *r = record.ptr
if not check_header_id(hdr, hl_type, id):
raise ValueError('Invalid header')
cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
scalar[0] = 0
if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
count[0] = number
elif length == BCF_VL_FIXED:
if number == 1:
scalar[0] = 1
count[0] = number
elif length == BCF_VL_R:
count[0] = r.n_allele
elif length == BCF_VL_A:
count[0] = r.n_allele - 1
elif length == BCF_VL_G:
count[0] = bcf_genotype_count(hdr, r, sample)
elif length == BCF_VL_VAR:
count[0] = -1
else:
raise ValueError('Unknown format length')
cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
if record is None:
raise ValueError('record must not be None')
cdef bcf_hdr_t *hdr = record.header.ptr
cdef char *s
cdef ssize_t count
cdef int scalar
bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)
if z.len == 0:
if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
value = True
elif scalar:
value = None
else:
value = ()
elif z.len == 1:
if z.type == BCF_BT_INT8:
value = z.v1.i if z.v1.i != bcf_int8_missing else None
elif z.type == BCF_BT_INT16:
value = z.v1.i if z.v1.i != bcf_int16_missing else None
elif z.type == BCF_BT_INT32:
value = z.v1.i if z.v1.i != bcf_int32_missing else None
elif z.type == BCF_BT_FLOAT:
value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
elif z.type == BCF_BT_CHAR:
value = force_str(chr(z.v1.i))
else:
raise TypeError('unsupported info type code')
if not scalar and value != ():
value = (value,)
else:
value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
return value
cdef object bcf_check_values(VariantRecord record, value, int sample,
int hl_type, int ht_type,
int id, int bt_type, ssize_t bt_len,
ssize_t *value_count, int *scalar, int *realloc):
if record is None:
raise ValueError('record must not be None')
bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)
# Validate values now that we know the type and size
values = (value,) if not isinstance(value, (list, tuple)) else value
# Validate values now that we know the type and size
if ht_type == BCF_HT_FLAG:
value_count[0] = 1
elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
# KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
value_count[0] = -1
cdef int given = len(values)
if value_count[0] != -1 and value_count[0] != given:
if scalar[0]:
raise TypeError('value expected to be scalar, given len={}'.format(given))
else:
raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
if ht_type == BCF_HT_REAL:
for v in values:
if not(v is None or isinstance(v, (float, int))):
raise TypeError('invalid value for Float format')
elif ht_type == BCF_HT_INT:
for v in values:
if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
raise TypeError('invalid value for Integer format')
for v in values:
if not(v is None or bcf_int32_missing < v <= INT32_MAX):
raise ValueError('Integer value too small/large to store in VCF/BCF')
elif ht_type == BCF_HT_STR:
values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
elif ht_type == BCF_HT_FLAG:
if values[0] not in (True, False, None, 1, 0):
raise ValueError('Flag values must be: True, False, None, 1, 0')
else:
raise TypeError('unsupported type')
realloc[0] = 0
if len(values) <= 1 and hl_type == BCF_HL_INFO:
realloc[0] = 0
elif len(values) > bt_len:
realloc[0] = 1
elif bt_type == BCF_BT_INT8:
for v in values:
if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
realloc[0] = 1
break
elif bt_type == BCF_BT_INT16:
for v in values:
if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
realloc[0] = 1
break
return values
cdef bcf_encode_alleles(VariantRecord record, values):
if record is None:
raise ValueError('record must not be None')
cdef bcf1_t *r = record.ptr
cdef int32_t nalleles = r.n_allele
cdef list gt_values = []
cdef char *s
cdef int i
if values is None:
return ()
if not isinstance(values, (list, tuple)):
values = (values,)
for value in values:
if value is None:
gt_values.append(bcf_gt_missing)
elif isinstance(value, (str, bytes)):
bvalue = force_bytes(value)
s = bvalue
for i in range(r.n_allele):
if strcmp(r.d.allele[i], s) != 0:
gt_values.append(bcf_gt_unphased(i))
break
else:
raise ValueError('Unknown allele')
else:
i = value
if not (0 <= i < nalleles):
raise ValueError('Invalid allele index')
gt_values.append(bcf_gt_unphased(i))
return gt_values
cdef bcf_info_set_value(VariantRecord record, key, value):
if record is None:
raise ValueError('record must not be None')
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf1_t *r = record.ptr
cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bytes bkey = force_bytes(key)
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
if info:
info_id = info.key
else:
info_id = bcf_header_get_info_id(hdr, bkey)
if info_id < 0:
raise KeyError('unknown INFO: {}'.format(key))
if not check_header_id(hdr, BCF_HL_INFO, info_id):
raise ValueError('Invalid header')
info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
values = bcf_check_values(record, value, -1,
BCF_HL_INFO, info_type, info_id,
info.type if info else -1,
info.len if info else -1,
&value_count, &scalar, &realloc)
if info_type == BCF_HT_FLAG:
if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
raise ValueError('Unable to update INFO values')
return
vlen = value_count < 0
value_count = len(values)
# DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
# If we can, write updated values to existing allocated storage
if 0 and info and not realloc:
r.d.shared_dirty |= BCF1_DIRTY_INF
if value_count == 0:
info.len = 0
if not info.vptr:
info.vptr = <uint8_t *>&info.v1.i
elif value_count == 1:
# FIXME: Check if need to free vptr if info.len > 0?
if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
elif info.type == BCF_BT_FLOAT:
bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
else:
raise TypeError('unsupported info type code')
info.len = 1
if not info.vptr:
info.vptr = <uint8_t *>&info.v1.i
else:
bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
return
alloc_len = max(1, value_count)
if info and info.len > alloc_len:
alloc_len = info.len
new_values = bcf_empty_array(info_type, alloc_len, vlen)
cdef char *valp = <char *>new_values
if info_type == BCF_HT_INT:
dst_type = BCF_BT_INT32
elif info_type == BCF_HT_REAL:
dst_type = BCF_BT_FLOAT
elif info_type == BCF_HT_STR:
dst_type = BCF_BT_CHAR
else:
raise ValueError('Unsupported INFO type')
bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
raise ValueError('Unable to update INFO values')
cdef bcf_info_del_value(VariantRecord record, key):
if record is None:
raise ValueError('record must not be None')
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf1_t *r = record.ptr
cdef ssize_t value_count
cdef int scalar
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bytes bkey = force_bytes(key)
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
if not info:
raise KeyError(key)
bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)
if value_count <= 0:
null_value = ()
elif scalar:
null_value = None
else:
null_value = (None,)*value_count
bcf_info_set_value(record, bkey, null_value)
cdef bcf_format_get_value(VariantRecordSample sample, key):
if sample is None:
raise ValueError('sample must not be None')
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef ssize_t count
cdef int scalar
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
raise KeyError('invalid FORMAT: {}'.format(key))
if is_gt_fmt(hdr, fmt.id):
return bcf_format_get_allele_indices(sample)
bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)
if fmt.p and fmt.n and fmt.size:
return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
elif scalar:
return None
elif count <= 0:
return ()
else:
return (None,)*count
cdef bcf_format_set_value(VariantRecordSample sample, key, value):
if sample is None:
raise ValueError('sample must not be None')
if key == 'phased':
sample.phased = bool(value)
return
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int fmt_id
cdef vdict_t *d
cdef khiter_t k
cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if fmt:
fmt_id = fmt.id
else:
d = <vdict_t *>hdr.dict[BCF_DT_ID]
k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
raise KeyError('unknown format: {}'.format(key))
fmt_id = kh_val_vdict(d, k).id
if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
raise ValueError('Invalid header')
fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
if fmt_type == BCF_HT_FLAG:
raise ValueError('Flag types are not allowed on FORMATs')
if is_gt_fmt(hdr, fmt_id):
value = bcf_encode_alleles(sample.record, value)
# KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
fmt_type = BCF_HT_INT
values = bcf_check_values(sample.record, value, sample.index,
BCF_HL_FMT, fmt_type, fmt_id,
fmt.type if fmt else -1,
fmt.n if fmt else -1,
&value_count, &scalar, &realloc)
vlen = value_count < 0
value_count = len(values)
# If we can, write updated values to existing allocated storage.
if fmt and not realloc:
r.d.indiv_dirty = 1
bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
return
alloc_len = max(1, value_count)
if fmt and fmt.n > alloc_len:
alloc_len = fmt.n
nsamples = r.n_sample
new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
cdef char *new_values_p = <char *>new_values
if fmt_type == BCF_HT_INT:
dst_type = BCF_BT_INT32
dst_size = sizeof(int32_t) * alloc_len
elif fmt_type == BCF_HT_REAL:
dst_type = BCF_BT_FLOAT
dst_size = sizeof(float) * alloc_len
elif fmt_type == BCF_HT_STR:
dst_type = BCF_BT_CHAR
dst_size = sizeof(char) * alloc_len
else:
raise ValueError('Unsupported FORMAT type')
if fmt and nsamples > 1:
for i in range(nsamples):
bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
new_values_p + i * dst_size, dst_type, alloc_len,
vlen)
bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
raise ValueError('Unable to update format values')
cdef bcf_format_del_value(VariantRecordSample sample, key):
if sample is None:
raise ValueError('sample must not be None')
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef ssize_t value_count
cdef int scalar
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
raise KeyError(key)
bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)
if value_count <= 0:
null_value = ()
elif scalar:
null_value = None
else:
null_value = (None,)*value_count
bcf_format_set_value(sample, bkey, null_value)
cdef bcf_format_get_allele_indices(VariantRecordSample sample):
if sample is None:
raise ValueError('sample must not be None')
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int32_t n = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
if sample.index < 0 or sample.index >= n or not r.n_fmt:
return ()
cdef bcf_fmt_t *fmt0 = r.d.fmt
cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
if not gt0 or not fmt0.n:
return ()
cdef int8_t *data8
cdef int16_t *data16
cdef int32_t *data32
cdef int32_t a, nalleles = r.n_allele
cdef list alleles = []
if fmt0.type == BCF_BT_INT8:
data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data8[i] == bcf_int8_vector_end:
break
elif data8[i] == bcf_gt_missing:
a = -1
else:
a = bcf_gt_allele(data8[i])
alleles.append(a if 0 <= a < nalleles else None)
elif fmt0.type == BCF_BT_INT16:
data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data16[i] == bcf_int16_vector_end:
break
elif data16[i] == bcf_gt_missing:
a = -1
else:
a = bcf_gt_allele(data16[i])
alleles.append(a if 0 <= a < nalleles else None)
elif fmt0.type == BCF_BT_INT32:
data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data32[i] == bcf_int32_vector_end:
break
elif data32[i] == bcf_gt_missing:
a = -1
else:
a = bcf_gt_allele(data32[i])
alleles.append(a if 0 <= a < nalleles else None)
return tuple(alleles)
cdef bcf_format_get_alleles(VariantRecordSample sample):
if sample is None:
raise ValueError('sample must not be None')
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int32_t nsamples = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef int32_t nalleles = r.n_allele
if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
return ()
cdef bcf_fmt_t *fmt0 = r.d.fmt
cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
if not gt0 or not fmt0.n:
return ()
cdef int32_t a
cdef int8_t *data8
cdef int16_t *data16
cdef int32_t *data32
alleles = []
if fmt0.type == BCF_BT_INT8:
data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data8[i] == bcf_int8_vector_end:
break
a = bcf_gt_allele(data8[i])
alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
elif fmt0.type == BCF_BT_INT16:
data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data16[i] == bcf_int16_vector_end:
break
a = bcf_gt_allele(data16[i])
alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
elif fmt0.type == BCF_BT_INT32:
data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data32[i] == bcf_int32_vector_end:
break
a = bcf_gt_allele(data32[i])
alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
return tuple(alleles)
cdef bint bcf_sample_get_phased(VariantRecordSample sample):
if sample is None:
raise ValueError('sample must not be None')
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int32_t n = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
if sample.index < 0 or sample.index >= n or not r.n_fmt:
return False
cdef bcf_fmt_t *fmt0 = r.d.fmt
cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
if not gt0 or not fmt0.n:
return False
cdef int8_t *data8
cdef int16_t *data16
cdef int32_t *data32
cdef bint phased = False
if fmt0.type == BCF_BT_INT8:
data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data8[i] == bcf_int8_vector_end:
break
elif data8[i] == bcf_int8_missing:
continue
elif i and not bcf_gt_is_phased(data8[i]):
return False
else:
phased = True
elif fmt0.type == BCF_BT_INT16:
data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data16[i] == bcf_int16_vector_end:
break
elif data16[i] == bcf_int16_missing:
continue
elif i and not bcf_gt_is_phased(data16[i]):
return False
else:
phased = True
elif fmt0.type == BCF_BT_INT32:
data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data32[i] == bcf_int32_vector_end:
break
elif data32[i] == bcf_int32_missing:
continue
elif i and not bcf_gt_is_phased(data32[i]):
return False
else:
phased = True
return phased
cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
if sample is None:
raise ValueError('sample must not be None')
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int32_t n = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
if sample.index < 0 or sample.index >= n or not r.n_fmt:
return
cdef bcf_fmt_t *fmt0 = r.d.fmt
cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
if not gt0 or not fmt0.n:
raise ValueError('Cannot set phased before genotype is set')
cdef int8_t *data8
cdef int16_t *data16
cdef int32_t *data32
if fmt0.type == BCF_BT_INT8:
data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data8[i] == bcf_int8_vector_end:
break
elif data8[i] == bcf_int8_missing:
continue
elif i:
data8[i] = (data8[i] & 0xFE) | phased
elif fmt0.type == BCF_BT_INT16:
data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data16[i] == bcf_int16_vector_end:
break
elif data16[i] == bcf_int16_missing:
continue
elif i:
data16[i] = (data16[i] & 0xFFFE) | phased
elif fmt0.type == BCF_BT_INT32:
data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
for i in range(fmt0.n):
if data32[i] == bcf_int32_vector_end:
break
elif data32[i] == bcf_int32_missing:
continue
elif i:
data32[i] = (data32[i] & 0xFFFFFFFE) | phased
cdef inline bcf_sync_end(VariantRecord record):
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf_info_t *info
cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
cdef int ref_len
# allow missing ref when instantiating a new record
if record.ref is not None:
ref_len = len(record.ref)
else:
ref_len = 0
# Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
# Always keep END for symbolic alleles
if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len):
# If INFO/END is not defined in the header, it doesn't exist in the record
if end_id >= 0:
info = bcf_get_info(hdr, record.ptr, b'END')
if info and info.vptr:
if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
raise ValueError('Unable to delete END')
else:
# Create END header, if not present
if end_id < 0:
record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')
# Update to reflect stop position
bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
cdef inline int has_symbolic_allele(VariantRecord record):
"""Return index of first symbolic allele. 0 if no symbolic alleles."""
for i in range(1, record.ptr.n_allele):
alt = record.ptr.d.allele[i]
if alt[0] == b'<' and alt[len(alt) - 1] == b'>':
return i
return 0
########################################################################
########################################################################
## Variant Header objects
########################################################################
cdef bcf_header_remove_hrec(VariantHeader header, int i):
if header is None:
raise ValueError('header must not be None')
cdef bcf_hdr_t *hdr = header.ptr
if i < 0 or i >= hdr.nhrec:
raise ValueError('Invalid header record index')
cdef bcf_hrec_t *hrec = hdr.hrec[i]
hdr.nhrec -= 1
if i < hdr.nhrec:
memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
bcf_hrec_destroy(hrec)
hdr.hrec[hdr.nhrec] = NULL
hdr.dirty = 1
#FIXME: implement a full mapping interface
#FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
# object lifetime.
cdef class VariantHeaderRecord(object):
"""header record from a :class:`VariantHeader` object"""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
@property
def type(self):
"""header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
cdef bcf_hrec_t *r = self.ptr
if not r:
return None
return METADATA_TYPES[r.type]
@property
def key(self):
"""header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
cdef bcf_hrec_t *r = self.ptr
return bcf_str_cache_get_charptr(r.key) if r and r.key else None
@property
def value(self):
"""header value. Set only for generic lines, None for FILTER/INFO, etc."""
cdef bcf_hrec_t *r = self.ptr
return charptr_to_str(r.value) if r and r.value else None
@property
def attrs(self):
"""sequence of additional header attributes"""
cdef bcf_hrec_t *r = self.ptr
if not r:
return ()
cdef int i
return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
charptr_to_str(r.vals[i]) if r.vals[i] else None)
for i in range(r.nkeys))
def __len__(self):
cdef bcf_hrec_t *r = self.ptr
return r.nkeys if r else 0
def __bool__(self):
cdef bcf_hrec_t *r = self.ptr
return r != NULL and r.nkeys != 0
def __getitem__(self, key):
"""get attribute value"""
cdef bcf_hrec_t *r = self.ptr
cdef int i
if r:
bkey = force_bytes(key)
for i in range(r.nkeys):
if r.keys[i] and r.keys[i] == bkey:
return charptr_to_str(r.vals[i]) if r.vals[i] else None
raise KeyError('cannot find metadata key')
def __iter__(self):
cdef bcf_hrec_t *r = self.ptr
if not r:
return
cdef int i
for i in range(r.nkeys):
if r.keys[i]:
yield bcf_str_cache_get_charptr(r.keys[i])
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
try:
return self[key]
except KeyError:
return default
def __contains__(self, key):
try:
self[key]
except KeyError:
return False
else:
return True
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
return iter(self)
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
cdef bcf_hrec_t *r = self.ptr
if not r:
return
cdef int i
for i in range(r.nkeys):
if r.keys[i]:
yield charptr_to_str(r.vals[i]) if r.vals[i] else None
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
cdef bcf_hrec_t *r = self.ptr
if not r:
return
cdef int i
for i in range(r.nkeys):
if r.keys[i]:
yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
def keys(self):
"""D.keys() -> list of D's keys"""
return list(self)
def items(self):
"""D.items() -> list of D's (key, value) pairs, as 2-tuples"""
return list(self.iteritems())
def values(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
def update(self, items=None, **kwargs):
"""D.update([E, ]**F) -> None.
Update D from dict/iterable E and F.
"""
for k, v in items.items():
self[k] = v
if kwargs:
for k, v in kwargs.items():
self[k] = v
def pop(self, key, default=_nothing):
try:
value = self[key]
del self[key]
return value
except KeyError:
if default is not _nothing:
return default
raise
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
#TODO: implement __richcmp__
def __str__(self):
cdef bcf_hrec_t *r = self.ptr
if not r:
raise ValueError('cannot convert deleted record to str')
cdef kstring_t hrec_str
hrec_str.l = hrec_str.m = 0
hrec_str.s = NULL
bcf_hrec_format(r, &hrec_str)
ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
if hrec_str.m:
free(hrec_str.s)
return ret
# FIXME: Not safe -- causes trivial segfaults at the moment
def remove(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef bcf_hrec_t *r = self.ptr
if not r:
return
assert r.key
cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
bcf_hdr_remove(hdr, r.type, key)
self.ptr = NULL
cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
if not header:
raise ValueError('invalid VariantHeader')
if not hdr:
return None
cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
record.header = header
record.ptr = hdr
return record
cdef class VariantHeaderRecords(object):
"""sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
return self.header.ptr.nhrec
def __bool__(self):
return self.header.ptr.nhrec != 0
def __getitem__(self, index):
cdef int32_t i = index
if i < 0 or i >= self.header.ptr.nhrec:
raise IndexError('invalid header record index')
return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
def __iter__(self):
cdef int32_t i
for i in range(self.header.ptr.nhrec):
yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
__hash__ = None
cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
if not header:
raise ValueError('invalid VariantHeader')
cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
records.header = header
return records
cdef class VariantMetadata(object):
"""filter, info or format metadata record from a :class:`VariantHeader` object"""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
@property
def name(self):
"""metadata name"""
cdef bcf_hdr_t *hdr = self.header.ptr
return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
# Q: Should this be exposed?
@property
def id(self):
"""metadata internal header id number"""
return self.id
@property
def number(self):
"""metadata number (i.e. cardinality)"""
cdef bcf_hdr_t *hdr = self.header.ptr
if not check_header_id(hdr, self.type, self.id):
raise ValueError('Invalid header id')
if self.type == BCF_HL_FLT:
return None
cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
if l == BCF_VL_FIXED:
return bcf_hdr_id2number(hdr, self.type, self.id)
elif l == BCF_VL_VAR:
return '.'
else:
return METADATA_LENGTHS[l]
@property
def type(self):
"""metadata value type"""
cdef bcf_hdr_t *hdr = self.header.ptr
if not check_header_id(hdr, self.type, self.id):
raise ValueError('Invalid header id')
if self.type == BCF_HL_FLT:
return None
return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
@property
def description(self):
"""metadata description (or None if not set)"""
descr = self.record.get('Description')
if descr:
descr = descr.strip('"')
return force_str(descr)
@property
def record(self):
""":class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
cdef bcf_hdr_t *hdr = self.header.ptr
if not check_header_id(hdr, self.type, self.id):
raise ValueError('Invalid header id')
cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
if not hrec:
return None
return makeVariantHeaderRecord(self.header, hrec)
def remove_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
bcf_hdr_remove(hdr, self.type, key)
cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
if not header:
raise ValueError('invalid VariantHeader')
if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
raise ValueError('invalid metadata type')
if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
raise ValueError('invalid metadata id')
cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
meta.header = header
meta.type = type
meta.id = id
return meta
cdef class VariantHeaderMetadata(object):
"""mapping from filter, info or format name to :class:`VariantMetadata` object"""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
def add(self, id, number, type, description, **kwargs):
"""Add a new filter, info or format record"""
if id in self:
raise ValueError('Header already exists for id={}'.format(id))
if self.type == BCF_HL_FLT:
if number is not None:
raise ValueError('Number must be None when adding a filter')
if type is not None:
raise ValueError('Type must be None when adding a filter')
items = [('ID', unquoted_str(id)), ('Description', description)]
else:
if type not in VALUE_TYPES:
raise ValueError('unknown type specified: {}'.format(type))
if number is None:
number = '.'
items = [('ID', unquoted_str(id)),
('Number', unquoted_str(number)),
('Type', unquoted_str(type)),
('Description', description)]
items += kwargs.items()
self.header.add_meta(METADATA_TYPES[self.type], items=items)
def __len__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef bcf_idpair_t *idpair
cdef int32_t i, n = 0
for i in range(hdr.n[BCF_DT_ID]):
idpair = hdr.id[BCF_DT_ID] + i
if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
n += 1
return n
def __bool__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef bcf_idpair_t *idpair
cdef int32_t i
for i in range(hdr.n[BCF_DT_ID]):
idpair = hdr.id[BCF_DT_ID] + i
if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
return True
return False
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
raise KeyError('invalid key: {}'.format(key))
return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
def remove_header(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
raise KeyError('invalid key: {}'.format(key))
bcf_hdr_remove(hdr, self.type, bkey)
#bcf_hdr_sync(hdr)
def clear_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
bcf_hdr_remove(hdr, self.type, NULL)
#bcf_hdr_sync(hdr)
def __iter__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef bcf_idpair_t *idpair
cdef int32_t i
for i in range(hdr.n[BCF_DT_ID]):
idpair = hdr.id[BCF_DT_ID] + i
if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
yield bcf_str_cache_get_charptr(idpair.key)
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
try:
return self[key]
except KeyError:
return default
def __contains__(self, key):
try:
self[key]
except KeyError:
return False
else:
return True
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
return iter(self)
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
for key in self:
yield self[key]
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
for key in self:
yield (key, self[key])
def keys(self):
"""D.keys() -> list of D's keys"""
return list(self)
def items(self):
"""D.items() -> list of D's (key, value) pairs, as 2-tuples"""
return list(self.iteritems())
def values(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
#TODO: implement __richcmp__
cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
if not header:
raise ValueError('invalid VariantHeader')
cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
meta.header = header
meta.type = type
return meta
cdef class VariantContig(object):
"""contig metadata from a :class:`VariantHeader`"""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
@property
def name(self):
"""contig name"""
cdef bcf_hdr_t *hdr = self.header.ptr
return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
@property
def id(self):
"""contig internal id number"""
return self.id
@property
def length(self):
"""contig length or None if not available"""
cdef bcf_hdr_t *hdr = self.header.ptr
cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
return length if length else None
@property
def header_record(self):
""":class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
cdef bcf_hdr_t *hdr = self.header.ptr
cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
return makeVariantHeaderRecord(self.header, hrec)
def remove_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key
bcf_hdr_remove(hdr, BCF_HL_CTG, key)
cdef VariantContig makeVariantContig(VariantHeader header, int id):
if not header:
raise ValueError('invalid VariantHeader')
if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
raise ValueError('invalid contig id')
cdef VariantContig contig = VariantContig.__new__(VariantContig)
contig.header = header
contig.id = id
return contig
cdef class VariantHeaderContigs(object):
"""mapping from contig name or index to :class:`VariantContig` object."""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
return hdr.n[BCF_DT_CTG]
def __bool__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
return hdr.n[BCF_DT_CTG] != 0
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int index
if isinstance(key, int):
index = key
if index < 0 or index >= hdr.n[BCF_DT_CTG]:
raise IndexError('invalid contig index')
return makeVariantContig(self.header, index)
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d):
raise KeyError('invalid contig: {}'.format(key))
cdef int id = kh_val_vdict(d, k).id
return makeVariantContig(self.header, id)
def remove_header(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int index
cdef const char *ckey
cdef vdict_t *d
cdef khiter_t k
if isinstance(key, int):
index = key
if index < 0 or index >= hdr.n[BCF_DT_CTG]:
raise IndexError('invalid contig index')
ckey = hdr.id[BCF_DT_CTG][self.id].key
else:
d = <vdict_t *>hdr.dict[BCF_DT_CTG]
key = force_bytes(key)
if kh_get_vdict(d, key) == kh_end(d):
raise KeyError('invalid contig: {}'.format(key))
ckey = key
bcf_hdr_remove(hdr, BCF_HL_CTG, ckey)
def clear_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
#bcf_hdr_sync(hdr)
def __iter__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
cdef uint32_t n = kh_size(d)
assert n == hdr.n[BCF_DT_CTG]
for i in range(n):
yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
try:
return self[key]
except KeyError:
return default
def __contains__(self, key):
try:
self[key]
except KeyError:
return False
else:
return True
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
return iter(self)
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
for key in self:
yield self[key]
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
for key in self:
yield (key, self[key])
def keys(self):
"""D.keys() -> list of D's keys"""
return list(self)
def items(self):
"""D.items() -> list of D's (key, value) pairs, as 2-tuples"""
return list(self.iteritems())
def values(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
#TODO: implement __richcmp__
def add(self, id, length=None, **kwargs):
"""Add a new contig record"""
if id in self:
raise ValueError('Header already exists for contig {}'.format(id))
items = [('ID', unquoted_str(id))]
if length is not None:
items.append(("length", unquoted_str(length)))
items += kwargs.items()
self.header.add_meta('contig', items=items)
cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
if not header:
raise ValueError('invalid VariantHeader')
cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
contigs.header = header
return contigs
cdef class VariantHeaderSamples(object):
"""sequence of sample names from a :class:`VariantHeader` object"""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
return bcf_hdr_nsamples(self.header.ptr)
def __bool__(self):
return bcf_hdr_nsamples(self.header.ptr) != 0
def __getitem__(self, index):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int32_t n = bcf_hdr_nsamples(hdr)
cdef int32_t i = index
if i < 0 or i >= n:
raise IndexError('invalid sample index')
return charptr_to_str(hdr.samples[i])
def __iter__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int32_t i, n = bcf_hdr_nsamples(hdr)
for i in range(n):
yield charptr_to_str(hdr.samples[i])
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
return k != kh_end(d)
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
#TODO: implement __richcmp__
def add(self, name):
"""Add a new sample"""
self.header.add_sample(name)
cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
if not header:
raise ValueError('invalid VariantHeader')
cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
samples.header = header
return samples
cdef class VariantHeader(object):
"""header information for a :class:`VariantFile` object"""
#FIXME: Add structured proxy
#FIXME: Add generic proxy
#FIXME: Add mutable methods
# See makeVariantHeader for C constructor
def __cinit__(self):
self.ptr = NULL
# Python constructor
def __init__(self):
self.ptr = bcf_hdr_init(b'w')
if not self.ptr:
raise ValueError('cannot create VariantHeader')
def __dealloc__(self):
if self.ptr:
bcf_hdr_destroy(self.ptr)
self.ptr = NULL
def __bool__(self):
return self.ptr != NULL
def copy(self):
return makeVariantHeader(bcf_hdr_dup(self.ptr))
def merge(self, VariantHeader header):
if header is None:
raise ValueError('header must not be None')
bcf_hdr_merge(self.ptr, header.ptr)
@property
def version(self):
"""VCF version"""
return force_str(bcf_hdr_get_version(self.ptr))
@property
def samples(self):
"""samples (:class:`VariantHeaderSamples`)"""
return makeVariantHeaderSamples(self)
@property
def records(self):
"""header records (:class:`VariantHeaderRecords`)"""
return makeVariantHeaderRecords(self)
@property
def contigs(self):
"""contig information (:class:`VariantHeaderContigs`)"""
return makeVariantHeaderContigs(self)
@property
def filters(self):
"""filter metadata (:class:`VariantHeaderMetadata`)"""
return makeVariantHeaderMetadata(self, BCF_HL_FLT)
@property
def info(self):
"""info metadata (:class:`VariantHeaderMetadata`)"""
return makeVariantHeaderMetadata(self, BCF_HL_INFO)
@property
def formats(self):
"""format metadata (:class:`VariantHeaderMetadata`)"""
return makeVariantHeaderMetadata(self, BCF_HL_FMT)
@property
def alts(self):
"""alt metadata (:class:`dict` ID->record).
The data returned just a snapshot of alt records, is created
every time the property is requested, and modifications will
not be reflected in the header metadata and vice versa.
i.e. it is just a dict that reflects the state of alt records
at the time it is created.
"""
return {record['ID']:record for record in self.records
if record.key.upper() == 'ALT' }
# only safe to do when opening an htsfile
cdef _subset_samples(self, include_samples):
keep_samples = set(self.samples)
include_samples = set(include_samples)
missing_samples = include_samples - keep_samples
keep_samples &= include_samples
if missing_samples:
# FIXME: add specialized exception with payload
raise ValueError(
'missing {:d} requested samples'.format(
len(missing_samples)))
keep_samples = force_bytes(','.join(keep_samples))
cdef char *keep = <char *>keep_samples if keep_samples else NULL
cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
if ret != 0:
raise ValueError(
'bcf_hdr_set_samples failed: ret = {}'.format(ret))
def __str__(self):
cdef int hlen
cdef kstring_t line
line.l = line.m = 0
line.s = NULL
if bcf_hdr_format(self.ptr, 0, &line) < 0:
if line.m:
free(line.s)
raise ValueError('bcf_hdr_format failed')
ret = charptr_to_str_w_len(line.s, line.l)
if line.m:
free(line.s)
return ret
def new_record(self, contig=None, start=0, stop=0, alleles=None,
id=None, qual=None, filter=None, info=None, samples=None,
**kwargs):
"""Create a new empty VariantRecord.
Arguments are currently experimental. Use with caution and expect
changes in upcoming releases.
"""
rec = makeVariantRecord(self, bcf_init())
if not rec:
raise MemoryError('unable to allocate BCF record')
rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
if contig is not None:
rec.contig = contig
rec.start = start
rec.stop = stop
rec.id = id
rec.qual = qual
if alleles is not None:
rec.alleles = alleles
if filter is not None:
if isinstance(filter, (list, tuple, VariantRecordFilter)):
for f in filter:
rec.filter.add(f)
else:
rec.filter.add(filter)
if info:
rec.info.update(info)
if kwargs:
if 'GT' in kwargs:
rec.samples[0]['GT'] = kwargs.pop('GT')
rec.samples[0].update(kwargs)
if samples:
for i, sample in enumerate(samples):
if 'GT' in sample:
rec.samples[i]['GT'] = sample.pop('GT')
rec.samples[i].update(sample)
return rec
def add_record(self, VariantHeaderRecord record):
"""Add an existing :class:`VariantHeaderRecord` to this header"""
if record is None:
raise ValueError('record must not be None')
cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
bcf_hdr_add_hrec(self.ptr, hrec)
self._hdr_sync()
def add_line(self, line):
"""Add a metadata line to this header"""
bline = force_bytes(line)
if bcf_hdr_append(self.ptr, bline) < 0:
raise ValueError('invalid header line')
self._hdr_sync()
def add_meta(self, key, value=None, items=None):
"""Add metadata to this header"""
if not ((value is not None) ^ (items is not None)):
raise ValueError('either value or items must be specified')
cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
cdef int quoted
try:
key = force_bytes(key)
hrec.key = strdup(key)
if value is not None:
hrec.value = strdup(force_bytes(value))
else:
for key, value in items:
quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type")
key = force_bytes(key)
bcf_hrec_add_key(hrec, key, <int>len(key))
value = force_bytes(str(value))
bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
except:
bcf_hrec_destroy(hrec)
raise
bcf_hdr_add_hrec(self.ptr, hrec)
self._hdr_sync()
cdef _add_sample(self, name):
bname = force_bytes(name)
if bcf_hdr_add_sample(self.ptr, bname) < 0:
raise ValueError('Duplicated sample name: {}'.format(name))
cdef _hdr_sync(self):
cdef bcf_hdr_t *hdr = self.ptr
if hdr.dirty:
if bcf_hdr_sync(hdr) < 0:
raise MemoryError('unable to reallocate VariantHeader')
def add_sample(self, name):
"""Add a new sample to this header"""
self._add_sample(name)
self._hdr_sync()
def add_samples(self, *args):
"""Add several new samples to this header.
This function takes multiple arguments, each of which may
be either a sample name or an iterable returning sample names
(e.g., a list of sample names).
"""
for arg in args:
if isinstance(arg, str):
self._add_sample(arg)
else:
for name in arg:
self._add_sample(name)
self._hdr_sync()
cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
if not hdr:
raise ValueError('cannot create VariantHeader')
cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
header.ptr = hdr
return header
cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
cdef vdict_t *d
cdef khiter_t k
cdef int info_id
if isinstance(key, str):
key = force_bytes(key)
d = <vdict_t *>hdr.dict[BCF_DT_ID]
k = kh_get_vdict(d, key)
if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
return -1
return kh_val_vdict(d, k).id
########################################################################
########################################################################
## Variant Record objects
########################################################################
cdef class VariantRecordFilter(object):
"""Filters set on a :class:`VariantRecord` object, presented as a mapping from
filter index or name to :class:`VariantMetadata` object"""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
return self.record.ptr.d.n_flt
def __bool__(self):
return self.record.ptr.d.n_flt != 0
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int index, id
cdef int n = r.d.n_flt
if isinstance(key, int):
index = key
if index < 0 or index >= n:
raise IndexError('invalid filter index')
id = r.d.flt[index]
else:
if key == '.':
key = 'PASS'
bkey = force_bytes(key)
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
raise KeyError('Invalid filter: {}'.format(key))
return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
def add(self, key):
"""Add a new filter"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int id
if key == '.':
key = 'PASS'
cdef bytes bkey = force_bytes(key)
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
if not check_header_id(hdr, BCF_HL_FLT, id):
raise KeyError('Invalid filter: {}'.format(key))
bcf_add_filter(hdr, r, id)
def __delitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int index, id
cdef int n = r.d.n_flt
if isinstance(key, int):
index = key
if index < 0 or index >= n:
raise IndexError('invalid filter index')
id = r.d.flt[index]
else:
if key == '.':
key = 'PASS'
bkey = force_bytes(key)
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
raise KeyError('Invalid filter: {}'.format(key))
bcf_remove_filter(hdr, r, id, 0)
def clear(self):
"""Clear all filters"""
cdef bcf1_t *r = self.record.ptr
r.d.shared_dirty |= BCF1_DIRTY_FLT
r.d.n_flt = 0
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int i
for i in range(r.d.n_flt):
yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
try:
return self[key]
except KeyError:
return default
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bytes bkey = force_bytes(key)
return bcf_has_filter(hdr, r, bkey) == 1
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
return iter(self)
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
for key in self:
yield self[key]
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
for key in self:
yield (key, self[key])
def keys(self):
"""D.keys() -> list of D's keys"""
return list(self)
def items(self):
"""D.items() -> list of D's (key, value) pairs, as 2-tuples"""
return list(self.iteritems())
def values(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op):
if op != 2 and op != 3:
return NotImplemented
cdef bcf1_t *s = self.record.ptr
cdef bcf1_t *o = other.record.ptr
cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other))
if op == 3:
cmp = not cmp
return cmp
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
#TODO: implement __richcmp__
cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
if not record:
raise ValueError('invalid VariantRecord')
cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
filter.record = record
return filter
cdef class VariantRecordFormat(object):
"""Format data present for each sample in a :class:`VariantRecord` object,
presented as mapping from format name to :class:`VariantMetadata` object."""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int i, n = 0
for i in range(r.n_fmt):
if r.d.fmt[i].p:
n += 1
return n
def __bool__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int i
for i in range(r.n_fmt):
if r.d.fmt[i].p:
return True
return False
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
raise KeyError('unknown format: {}'.format(key))
return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
def __delitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
raise KeyError('unknown format: {}'.format(key))
if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
raise ValueError('Unable to delete FORMAT')
def clear(self):
"""Clear all formats for all samples within the associated
:class:`VariantRecord` instance"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_fmt_t *fmt
cdef const char *key
cdef int i
for i in reversed(range(r.n_fmt)):
fmt = &r.d.fmt[i]
if fmt.p:
key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
raise ValueError('Unable to delete FORMAT')
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_fmt_t *fmt
cdef int i
for i in range(r.n_fmt):
fmt = &r.d.fmt[i]
if fmt.p:
yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
try:
return self[key]
except KeyError:
return default
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
return fmt != NULL and fmt.p != NULL
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
return iter(self)
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
for key in self:
yield self[key]
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
for key in self:
yield (key, self[key])
def keys(self):
"""D.keys() -> list of D's keys"""
return list(self)
def items(self):
"""D.items() -> list of D's (key, value) pairs, as 2-tuples"""
return list(self.iteritems())
def values(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
#TODO: implement __richcmp__
cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
if not record:
raise ValueError('invalid VariantRecord')
cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
format.record = record
return format
#TODO: Add a getmeta method to return the corresponding VariantMetadata?
cdef class VariantRecordInfo(object):
"""Info data stored in a :class:`VariantRecord` object, presented as a
mapping from info metadata name to value."""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
cdef const char *key
cdef int i, count = 0
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
for i in range(r.n_info):
info = &r.d.info[i]
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
count += 1
return count
def __bool__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
cdef const char *key
cdef int i
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
for i in range(r.n_info):
info = &r.d.info[i]
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
return True
return False
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bytes bkey = force_bytes(key)
if strcmp(bkey, b'END') == 0:
raise KeyError('END is a reserved attribute; access is via record.stop')
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
# Cannot stop here if info == NULL, since flags must return False
cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
if info_id < 0:
raise KeyError('Unknown INFO field: {}'.format(key))
if not check_header_id(hdr, BCF_HL_INFO, info_id):
raise ValueError('Invalid header')
# Handle type=Flag values
if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
return info != NULL and info.vptr != NULL
if not info or not info.vptr:
raise KeyError('Invalid INFO field: {}'.format(key))
return bcf_info_get_value(self.record, info)
def __setitem__(self, key, value):
cdef bytes bkey = force_bytes(key)
if strcmp(bkey, b'END') == 0:
raise KeyError('END is a reserved attribute; access is via record.stop')
if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
bcf_info_set_value(self.record, key, value)
def __delitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bytes bkey = force_bytes(key)
if strcmp(bkey, b'END') == 0:
raise KeyError('END is a reserved attribute; access is via record.stop')
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
# Cannot stop here if info == NULL, since flags must return False
cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
if info_id < 0:
raise KeyError('Unknown INFO field: {}'.format(key))
if not check_header_id(hdr, BCF_HL_INFO, info_id):
raise ValueError('Invalid header')
# Handle flags
if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
return
if not info or not info.vptr:
raise KeyError('Unknown INFO field: {}'.format(key))
if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
raise ValueError('Unable to delete INFO')
def clear(self):
"""Clear all info data"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
cdef const char *key
cdef int i
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
if strcmp(key, b'END') == 0:
continue
if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
raise ValueError('Unable to delete INFO')
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
cdef const char *key
cdef int i
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
if strcmp(key, b'END') != 0:
yield bcf_str_cache_get_charptr(key)
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bytes bkey = force_bytes(key)
if strcmp(bkey, b'END') == 0:
return default
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
# Cannot stop here if info == NULL, since flags must return False
cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
if not check_header_id(hdr, BCF_HL_INFO, info_id):
raise ValueError('Invalid header')
# Handle flags
if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
return info != NULL and info.vptr != NULL
if not info or not info.vptr:
return default
return bcf_info_get_value(self.record, info)
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bytes bkey = force_bytes(key)
if strcmp(bkey, b'END') == 0:
return False
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
return info != NULL and info.vptr != NULL
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
return iter(self)
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
cdef const char *key
cdef int i
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
if strcmp(key, b'END') != 0:
yield bcf_info_get_value(self.record, info)
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
cdef const char *key
cdef int i
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
if strcmp(key, b'END') != 0:
value = bcf_info_get_value(self.record, info)
yield bcf_str_cache_get_charptr(key), value
def keys(self):
"""D.keys() -> list of D's keys"""
return list(self)
def items(self):
"""D.items() -> list of D's (key, value) pairs, as 2-tuples"""
return list(self.iteritems())
def values(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
def update(self, items=None, **kwargs):
"""D.update([E, ]**F) -> None.
Update D from dict/iterable E and F.
"""
for k, v in items.items():
if k != 'END':
self[k] = v
if kwargs:
kwargs.pop('END', None)
for k, v in kwargs.items():
self[k] = v
def pop(self, key, default=_nothing):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bytes bkey = force_bytes(key)
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
# Cannot stop here if info == NULL, since flags must return False
cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
if info_id < 0:
if default is _nothing:
raise KeyError('Unknown INFO field: {}'.format(key))
return default
if not check_header_id(hdr, BCF_HL_INFO, info_id):
raise ValueError('Invalid header')
# Handle flags
if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
return
if not info or not info.vptr:
if default is _nothing:
raise KeyError('Unknown INFO field: {}'.format(key))
return default
value = bcf_info_get_value(self.record, info)
if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
raise ValueError('Unable to delete INFO')
return value
def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
if op != 2 and op != 3:
return NotImplemented
cdef bcf1_t *s = self.record.ptr
cdef bcf1_t *o = other.record.ptr
# Cannot use n_info as shortcut logic, since null values may remain
cdef bint cmp = dict(self) == dict(other)
if op == 3:
cmp = not cmp
return cmp
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
if not record:
raise ValueError('invalid VariantRecord')
cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
info.record = record
return info
cdef class VariantRecordSamples(object):
"""mapping from sample index or name to :class:`VariantRecordSample` object."""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr)
def __bool__(self):
return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int n = self.record.ptr.n_sample
cdef int sample_index
cdef vdict_t *d
cdef khiter_t k
if isinstance(key, int):
sample_index = key
else:
bkey = force_bytes(key)
sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
if sample_index < 0:
raise KeyError('invalid sample name: {}'.format(key))
if sample_index < 0 or sample_index >= n:
raise IndexError('invalid sample index')
return makeVariantRecordSample(self.record, sample_index)
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int32_t i, n = self.record.ptr.n_sample
for i in range(n):
yield charptr_to_str(hdr.samples[i])
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
try:
return self[key]
except KeyError:
return default
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int n = self.record.ptr.n_sample
cdef int sample_index
cdef vdict_t *d
cdef khiter_t k
if isinstance(key, int):
sample_index = key
else:
bkey = force_bytes(key)
sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
if sample_index < 0:
raise KeyError('invalid sample name: {}'.format(key))
return 0 <= sample_index < n
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
return iter(self)
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int32_t i, n = self.record.ptr.n_sample
for i in range(n):
yield makeVariantRecordSample(self.record, i)
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int32_t i, n = self.record.ptr.n_sample
for i in range(n):
yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
def keys(self):
"""D.keys() -> list of D's keys"""
return list(self)
def items(self):
"""D.items() -> list of D's (key, value) pairs, as 2-tuples"""
return list(self.iteritems())
def values(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
def update(self, items=None, **kwargs):
"""D.update([E, ]**F) -> None.
Update D from dict/iterable E and F.
"""
for k, v in items.items():
self[k] = v
if kwargs:
for k, v in kwargs.items():
self[k] = v
def pop(self, key, default=_nothing):
try:
value = self[key]
del self[key]
return value
except KeyError:
if default is not _nothing:
return default
raise
def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
if op != 2 and op != 3:
return NotImplemented
cdef bcf1_t *s = self.record.ptr
cdef bcf1_t *o = other.record.ptr
cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())
if op == 3:
cmp = not cmp
return cmp
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
if not record:
raise ValueError('invalid VariantRecord')
cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
VariantRecordSamples)
samples.record = record
return samples
cdef class VariantRecord(object):
"""Variant record"""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
def __dealloc__(self):
if self.ptr:
bcf_destroy1(self.ptr)
self.ptr = NULL
def copy(self):
"""return a copy of this VariantRecord object"""
return makeVariantRecord(self.header, bcf_dup(self.ptr))
def translate(self, VariantHeader dst_header):
if dst_header is None:
raise ValueError('dst_header must not be None')
cdef bcf_hdr_t *src_hdr = self.header.ptr
cdef bcf_hdr_t *dst_hdr = dst_header.ptr
if src_hdr != dst_hdr:
if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
bcf_translate(dst_hdr, src_hdr, self.ptr)
self.header = dst_header
@property
def rid(self):
"""internal reference id number"""
return self.ptr.rid
@rid.setter
def rid(self, value):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int r = value
if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
raise ValueError('invalid reference id')
self.ptr.rid = r
@property
def chrom(self):
"""chromosome/contig name"""
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int rid = self.ptr.rid
if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
raise ValueError('Invalid header')
return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
@chrom.setter
def chrom(self, value):
cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
bchrom = force_bytes(value)
cdef khint_t k = kh_get_vdict(d, bchrom)
if k == kh_end(d):
raise ValueError('Invalid chromosome/contig')
self.ptr.rid = kh_val_vdict(d, k).id
@property
def contig(self):
"""chromosome/contig name"""
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int rid = self.ptr.rid
if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
raise ValueError('Invalid header')
return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
@contig.setter
def contig(self, value):
cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
bchrom = force_bytes(value)
cdef khint_t k = kh_get_vdict(d, bchrom)
if k == kh_end(d):
raise ValueError('Invalid chromosome/contig')
self.ptr.rid = kh_val_vdict(d, k).id
@property
def pos(self):
"""record start position on chrom/contig (1-based inclusive)"""
return self.ptr.pos + 1
@pos.setter
def pos(self, value):
cdef int p = value
if p < 1:
raise ValueError('Position must be positive')
self.ptr.pos = p - 1
bcf_sync_end(self)
@property
def start(self):
"""record start position on chrom/contig (0-based inclusive)"""
return self.ptr.pos
@start.setter
def start(self, value):
cdef int s = value
if s < 0:
raise ValueError('Start coordinate must be non-negative')
self.ptr.pos = s
bcf_sync_end(self)
@property
def stop(self):
"""record stop position on chrom/contig (0-based exclusive)"""
return self.ptr.pos + self.ptr.rlen
@stop.setter
def stop(self, value):
cdef int s = value
if s < 0:
raise ValueError('Stop coordinate must be non-negative')
self.ptr.rlen = s - self.ptr.pos
bcf_sync_end(self)
@property
def rlen(self):
"""record length on chrom/contig (aka rec.stop - rec.start)"""
return self.ptr.rlen
@rlen.setter
def rlen(self, value):
cdef int r = value
self.ptr.rlen = r
bcf_sync_end(self)
@property
def qual(self):
"""phred scaled quality score or None if not available"""
return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
@qual.setter
def qual(self, value):
if value is not None:
self.ptr.qual = value
else:
bcf_float_set(&self.ptr.qual, bcf_float_missing)
# @property
# def n_allele(self):
# return self.ptr.n_allele
# @property
# def n_sample(self):
# return self.ptr.n_sample
@property
def id(self):
"""record identifier or None if not available"""
cdef bcf1_t *r = self.ptr
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
# causes a memory leak https://github.com/pysam-developers/pysam/issues/773
# return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
if (r.d.m_id == 0):
raise ValueError('Error extracting ID')
return charptr_to_str(r.d.id) if r.d.id != b'.' else None
@id.setter
def id(self, value):
cdef bcf1_t *r = self.ptr
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef char *idstr = NULL
if value is not None:
bid = force_bytes(value)
idstr = bid
if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
raise ValueError('Error updating id')
@property
def ref(self):
"""reference allele"""
cdef bcf1_t *r = self.ptr
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
return charptr_to_str(r.d.allele[0]) if r.d.allele else None
@ref.setter
def ref(self, value):
cdef bcf1_t *r = self.ptr
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
#FIXME: Set alleles directly -- this is stupid
if not value:
raise ValueError('ref allele must not be null')
value = force_bytes(value)
if r.d.allele and r.n_allele:
alleles = [r.d.allele[i] for i in range(r.n_allele)]
alleles[0] = value
else:
alleles = [value, '<NON_REF>']
self.alleles = alleles
bcf_sync_end(self)
@property
def alleles(self):
"""tuple of reference allele followed by alt alleles"""
cdef bcf1_t *r = self.ptr
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
if not r.d.allele:
return None
cdef tuple res = PyTuple_New(r.n_allele)
for i in range(r.n_allele):
a = charptr_to_str(r.d.allele[i])
PyTuple_SET_ITEM(res, i, a)
Py_INCREF(a)
return res
@alleles.setter
def alleles(self, values):
cdef bcf1_t *r = self.ptr
# Cache rlen of symbolic alleles before call to bcf_update_alleles_str
cdef int rlen = r.rlen
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
values = [force_bytes(v) for v in values]
if len(values) < 2:
raise ValueError('must set at least 2 alleles')
if b'' in values:
raise ValueError('cannot set null allele')
value = b','.join(values)
if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
raise ValueError('Error updating alleles')
# Reset rlen if alternate allele isn't symbolic, otherwise used cached
if has_symbolic_allele(self):
self.ptr.rlen = rlen
else:
self.ptr.rlen = len(values[0])
r.d.var_type = -1
bcf_sync_end(self)
@property
def alts(self):
"""tuple of alt alleles"""
cdef bcf1_t *r = self.ptr
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
if r.n_allele < 2 or not r.d.allele:
return None
cdef tuple res = PyTuple_New(r.n_allele - 1)
for i in range(1, r.n_allele):
a = charptr_to_str(r.d.allele[i])
PyTuple_SET_ITEM(res, i - 1, a)
Py_INCREF(a)
return res
@alts.setter
def alts(self, value):
#FIXME: Set alleles directly -- this is stupid
cdef bcf1_t *r = self.ptr
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
value = [force_bytes(v) for v in value]
if b'' in value:
raise ValueError('cannot set null alt allele')
ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
self.alleles = ref + value
r.d.var_type = -1
@property
def filter(self):
"""filter information (see :class:`VariantRecordFilter`)"""
if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
raise ValueError('Error unpacking VariantRecord')
return makeVariantRecordFilter(self)
@property
def info(self):
"""info data (see :class:`VariantRecordInfo`)"""
if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
return makeVariantRecordInfo(self)
@property
def format(self):
"""sample format metadata (see :class:`VariantRecordFormat`)"""
if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
raise ValueError('Error unpacking VariantRecord')
return makeVariantRecordFormat(self)
@property
def samples(self):
"""sample data (see :class:`VariantRecordSamples`)"""
if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
return makeVariantRecordSamples(self)
property alleles_variant_types:
def __get__(self):
cdef bcf1_t *r = self.ptr
cdef tuple result = PyTuple_New(r.n_allele)
for i in range(r.n_allele):
tp = bcf_get_variant_type(r, i)
if tp == VCF_REF:
v_type = "REF"
elif tp == VCF_SNP:
v_type = "SNP"
elif tp == VCF_MNP:
v_type = "MNP"
elif tp == VCF_INDEL:
v_type = "INDEL"
elif tp == VCF_BND:
v_type = "BND"
elif tp == VCF_OVERLAP:
v_type = "OVERLAP"
else:
v_type = "OTHER"
PyTuple_SET_ITEM(result, i, v_type)
Py_INCREF(v_type)
return result
def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
if op != 2 and op != 3:
return NotImplemented
cdef bcf1_t *s = self.ptr
cdef bcf1_t *o = other.ptr
cdef bint cmp = self is other or (
s.pos == o.pos
and s.rlen == o.rlen
and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
or s.qual == o.qual)
and s.n_sample == o.n_sample
and s.n_allele == o.n_allele
and self.contig == other.contig
and self.alleles == other.alleles
and self.id == other.id
and self.info == other.info
and self.filter == other.filter
and self.samples == other.samples)
if op == 3:
cmp = not cmp
return cmp
def __str__(self):
cdef kstring_t line
cdef char c
line.l = line.m = 0
line.s = NULL
if vcf_format(self.header.ptr, self.ptr, &line) < 0:
if line.m:
free(line.s)
raise ValueError('vcf_format failed')
# Strip CR/LF?
#while line.l:
# c = line.s[line.l - 1]
# if c != b'\n' and c != b'\r':
# break
# line.l -= 1
ret = charptr_to_str_w_len(line.s, line.l)
if line.m:
free(line.s)
return ret
cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
if not header:
raise ValueError('invalid VariantHeader')
if not r:
raise ValueError('cannot create VariantRecord')
if r.errcode:
msg = []
#if r.errcode & BCF_ERR_CTG_UNDEF:
# msg.append('undefined contig')
#if r.errcode & BCF_ERR_TAG_UNDEF:
# msg.append('undefined tag')
if r.errcode & BCF_ERR_NCOLS:
msg.append('invalid number of columns')
if r.errcode & BCF_ERR_LIMITS:
msg.append('limits violated')
if r.errcode & BCF_ERR_CHAR:
msg.append('invalid character found')
if r.errcode & BCF_ERR_CTG_INVALID:
msg.append('invalid contig')
if r.errcode & BCF_ERR_TAG_INVALID:
msg.append('invalid tag')
if msg:
msg = ', '.join(msg)
raise ValueError('Error(s) reading record: {}'.format(msg))
cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
record.header = header
record.ptr = r
return record
########################################################################
########################################################################
## Variant Sampletype object
########################################################################
cdef class VariantRecordSample(object):
"""Data for a single sample from a :class:`VariantRecord` object.
Provides data accessors for genotypes and a mapping interface
from format name to values.
"""
def __init__(self, *args, **kwargs):
raise TypeError('this class cannot be instantiated from Python')
@property
def name(self):
"""sample name"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int32_t n = r.n_sample
if self.index < 0 or self.index >= n:
raise ValueError('invalid sample index')
return charptr_to_str(hdr.samples[self.index])
@property
def allele_indices(self):
"""allele indices for called genotype, if present. Otherwise None"""
return bcf_format_get_allele_indices(self)
@allele_indices.setter
def allele_indices(self, value):
self['GT'] = value
@allele_indices.deleter
def allele_indices(self):
self['GT'] = ()
@property
def alleles(self):
"""alleles for called genotype, if present. Otherwise None"""
return bcf_format_get_alleles(self)
@alleles.setter
def alleles(self, value):
# Sets the genotype, supply a tuple of alleles to set.
# The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord
# The genotype is reset when an empty tuple, None or (None,) is supplied
if value==(None,) or value==tuple() or value is None:
self['GT'] = ()
return
if any((type(x) == int for x in value)):
raise ValueError('Use .allele_indices to set integer allele indices')
# determine and set allele indices:
try:
self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) )
except ValueError:
raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord."
"First set the .alleles of this record to define the alleles")
@alleles.deleter
def alleles(self):
self['GT'] = ()
@property
def phased(self):
"""False if genotype is missing or any allele is unphased. Otherwise True."""
return bcf_sample_get_phased(self)
@phased.setter
def phased(self, value):
bcf_sample_set_phased(self, value)
def __len__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int i, n = 0
if bcf_unpack(r, BCF_UN_FMT) < 0:
raise ValueError('Error unpacking VariantRecord')
for i in range(r.n_fmt):
if r.d.fmt[i].p:
n += 1
return n
def __bool__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef int i
if bcf_unpack(r, BCF_UN_FMT) < 0:
raise ValueError('Error unpacking VariantRecord')
for i in range(r.n_fmt):
if r.d.fmt[i].p:
return True
return False
def __getitem__(self, key):
return bcf_format_get_value(self, key)
def __setitem__(self, key, value):
bcf_format_set_value(self, key, value)
def __delitem__(self, key):
bcf_format_del_value(self, key)
def clear(self):
"""Clear all format data (including genotype) for this sample"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_fmt_t *fmt
cdef int i
for i in range(r.n_fmt):
fmt = &r.d.fmt[i]
if fmt.p:
bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_fmt_t *fmt
cdef int i
for i in range(r.n_fmt):
fmt = &r.d.fmt[i]
if r.d.fmt[i].p:
yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
try:
return self[key]
except KeyError:
return default
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
return fmt != NULL and fmt.p != NULL
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
return iter(self)
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
for key in self:
yield self[key]
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
for key in self:
yield (key, self[key])
def keys(self):
"""D.keys() -> list of D's keys"""
return list(self)
def items(self):
"""D.items() -> list of D's (key, value) pairs, as 2-tuples"""
return list(self.iteritems())
def values(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
def update(self, items=None, **kwargs):
"""D.update([E, ]**F) -> None.
Update D from dict/iterable E and F.
"""
for k, v in items.items():
self[k] = v
if kwargs:
for k, v in kwargs.items():
self[k] = v
def pop(self, key, default=_nothing):
try:
value = self[key]
del self[key]
return value
except KeyError:
if default is not _nothing:
return default
raise
def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op):
if op != 2 and op != 3:
return NotImplemented
cdef bint cmp = dict(self) == dict(other)
if op == 3:
cmp = not cmp
return cmp
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
if not record or sample_index < 0:
raise ValueError('cannot create VariantRecordSample')
cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
sample.record = record
sample.index = sample_index
return sample
########################################################################
########################################################################
## Index objects
########################################################################
cdef class BaseIndex(object):
def __init__(self):
self.refs = ()
self.remap = {}
def __len__(self):
return len(self.refs)
def __bool__(self):
return len(self.refs) != 0
def __getitem__(self, key):
if isinstance(key, int):
return self.refs[key]
else:
return self.refmap[key]
def __iter__(self):
return iter(self.refs)
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
try:
return self[key]
except KeyError:
return default
def __contains__(self, key):
try:
self[key]
except KeyError:
return False
else:
return True
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
return iter(self)
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
for key in self:
yield self[key]
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
for key in self:
yield (key, self[key])
def keys(self):
"""D.keys() -> list of D's keys"""
return list(self)
def items(self):
"""D.items() -> list of D's (key, value) pairs, as 2-tuples"""
return list(self.iteritems())
def values(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
def update(self, items=None, **kwargs):
"""D.update([E, ]**F) -> None.
Update D from dict/iterable E and F.
"""
for k, v in items.items():
self[k] = v
if kwargs:
for k, v in kwargs.items():
self[k] = v
def pop(self, key, default=_nothing):
try:
value = self[key]
del self[key]
return value
except KeyError:
if default is not _nothing:
return default
raise
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
#TODO: implement __richcmp__
cdef class BCFIndex(object):
"""CSI index data structure for BCF files"""
def __init__(self):
self.refs = ()
self.refmap = {}
if not self.ptr:
raise ValueError('Invalid index object')
cdef int n
cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
self.refmap = { r:i for i,r in enumerate(self.refs) }
def __dealloc__(self):
if self.ptr:
hts_idx_destroy(self.ptr)
self.ptr = NULL
def fetch(self, bcf, contig, start, stop, reopen):
return BCFIterator(bcf, contig, start, stop, reopen)
cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
if not idx:
return None
if not header:
raise ValueError('invalid VariantHeader')
cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
index.header = header
index.ptr = idx
index.__init__()
return index
cdef class TabixIndex(BaseIndex):
"""Tabix index data structure for VCF files"""
def __init__(self):
self.refs = ()
self.refmap = {}
if not self.ptr:
raise ValueError('Invalid index object')
cdef int n
cdef const char **refs = tbx_seqnames(self.ptr, &n)
self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
self.refmap = { r:i for i,r in enumerate(self.refs) }
def __dealloc__(self):
if self.ptr:
tbx_destroy(self.ptr)
self.ptr = NULL
def fetch(self, bcf, contig, start, stop, reopen):
return TabixIterator(bcf, contig, start, stop, reopen)
cdef TabixIndex makeTabixIndex(tbx_t *idx):
if not idx:
return None
cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
index.ptr = idx
index.__init__()
return index
########################################################################
########################################################################
## Iterators
########################################################################
cdef class BaseIterator(object):
pass
# Internal function to clean up after iteration stop or failure.
# This would be a nested function if it weren't a cdef function.
cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
bcf_destroy1(record)
# destroy iter so future calls to __next__ raise StopIteration
bcf_itr_destroy(self.iter)
self.iter = NULL
cdef class BCFIterator(BaseIterator):
def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
if bcf is None:
raise ValueError('bcf must not be None')
if contig is None:
raise ValueError('contig must be specified')
if not isinstance(bcf.index, BCFIndex):
raise ValueError('bcf index required')
cdef BCFIndex index = bcf.index
self.bcf = bcf
self.index = index
cdef int rid, cstart, cstop
try:
rid = index.refmap[contig]
except KeyError:
# A query for a non-existent contig yields an empty iterator, does not raise an error
self.iter = NULL
return
if reopen:
self.bcf = self.bcf.copy()
cstart = start if start is not None else 0
cstop = stop if stop is not None else MAX_POS
with nogil:
self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
if not self.iter:
if errno:
raise IOError(errno, strerror(errno))
else:
raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
def __dealloc__(self):
if self.iter:
bcf_itr_destroy(self.iter)
self.iter = NULL
def __iter__(self):
return self
def __next__(self):
if not self.iter:
raise StopIteration
cdef bcf1_t *record = bcf_init1()
if not record:
raise MemoryError('unable to allocate BCF record')
record.pos = -1
if self.bcf.drop_samples:
record.max_unpack = BCF_UN_SHR
cdef int ret
with nogil:
ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
if ret < 0:
_stop_BCFIterator(self, record)
if ret == -1:
raise StopIteration
elif ret == -2:
raise IOError('truncated file')
elif errno:
raise IOError(errno, strerror(errno))
else:
raise IOError('unable to fetch next record')
ret = bcf_subset_format(self.bcf.header.ptr, record)
if ret < 0:
_stop_BCFIterator(self, record)
raise ValueError('error in bcf_subset_format')
return makeVariantRecord(self.bcf.header, record)
cdef class TabixIterator(BaseIterator):
def __cinit__(self, *args, **kwargs):
self.line_buffer.l = 0
self.line_buffer.m = 0
self.line_buffer.s = NULL
def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
if bcf is None:
raise ValueError('bcf must not be None')
if not isinstance(bcf.index, TabixIndex):
raise ValueError('tabix index required')
cdef TabixIndex index = bcf.index
self.bcf = bcf
self.index = index
cdef int rid, cstart, cstop
try:
rid = index.refmap[contig]
except KeyError:
# A query for a non-existent contig yields an empty iterator, does not raise an error
self.iter = NULL
return
if reopen:
self.bcf = self.bcf.copy()
cstart = start if start is not None else 0
cstop = stop if stop is not None else MAX_POS
self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
if not self.iter:
if errno:
raise IOError(errno, strerror(errno))
else:
raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
def __dealloc__(self):
if self.iter:
tbx_itr_destroy(self.iter)
self.iter = NULL
if self.line_buffer.m:
free(self.line_buffer.s)
self.line_buffer.l = 0
self.line_buffer.m = 0
self.line_buffer.s = NULL
def __iter__(self):
return self
def __next__(self):
if not self.iter:
raise StopIteration
cdef int ret
with nogil:
ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
if ret < 0:
tbx_itr_destroy(self.iter)
self.iter = NULL
if ret == -1:
raise StopIteration
elif ret == -2:
raise IOError('truncated file')
elif errno:
raise IOError(errno, strerror(errno))
else:
raise IOError('unable to fetch next record')
cdef bcf1_t *record = bcf_init1()
if not record:
raise MemoryError('unable to allocate BCF record')
record.pos = -1
if self.bcf.drop_samples:
record.max_unpack = BCF_UN_SHR
ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
# FIXME: stop iteration on parse failure?
if ret < 0:
bcf_destroy1(record)
raise ValueError('error in vcf_parse')
return makeVariantRecord(self.bcf.header, record)
########################################################################
########################################################################
## Variant File
########################################################################
cdef class VariantFile(HTSFile):
"""*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
duplicate_filehandle=True, ignore_truncation=False, threads=1)*
A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
opened.
If an index for a variant file exists (.csi or .tbi), it will be
opened automatically. Without an index random access to records
via :meth:`fetch` is disabled.
For writing, a :class:`VariantHeader` object must be provided,
typically obtained from another :term:`VCF` file/:term:`BCF`
file.
Parameters
----------
mode : string
*mode* should be ``r`` for reading or ``w`` for writing. The default is
text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
For instance, to open a :term:`BCF` formatted file for reading, type::
f = pysam.VariantFile('ex1.bcf','r')
If mode is not specified, we will try to auto-detect the file type. All
of the following should work::
f1 = pysam.VariantFile('ex1.bcf')
f2 = pysam.VariantFile('ex1.vcf')
f3 = pysam.VariantFile('ex1.vcf.gz')
index_filename : string
Explicit path to an index file.
header : VariantHeader
:class:`VariantHeader` object required for writing.
drop_samples: bool
Ignore sample information when reading.
duplicate_filehandle: bool
By default, file handles passed either directly or through
File-like objects will be duplicated before passing them to
htslib. The duplication prevents issues where the same stream
will be closed by htslib and through destruction of the
high-level python object. Set to False to turn off
duplication.
ignore_truncation: bool
Issue a warning, instead of raising an error if the current file
appears to be truncated due to a missing EOF marker. Only applies
to bgzipped formats. (Default=False)
threads: integer
Number of threads to use for compressing/decompressing VCF/BCF files.
Setting threads to > 1 cannot be combined with `ignore_truncation`.
(Default=1)
"""
def __cinit__(self, *args, **kwargs):
self.htsfile = NULL
def __init__(self, *args, **kwargs):
self.header = None
self.index = None
self.filename = None
self.mode = None
self.threads = 1
self.index_filename = None
self.is_stream = False
self.is_remote = False
self.is_reading = False
self.drop_samples = False
self.header_written = False
self.start_offset = -1
self.open(*args, **kwargs)
def __dealloc__(self):
if not self.htsfile or not self.header:
return
# Write header if no records were written
if self.htsfile.is_write and not self.header_written:
with nogil:
bcf_hdr_write(self.htsfile, self.header.ptr)
cdef int ret = hts_close(self.htsfile)
self.htsfile = NULL
self.header = self.index = None
if ret < 0:
global errno
if errno == EPIPE:
errno = 0
else:
raise IOError(errno, force_str(strerror(errno)))
def close(self):
"""closes the :class:`pysam.VariantFile`."""
if not self.htsfile:
return
# Write header if no records were written
if self.htsfile.is_write and not self.header_written:
with nogil:
bcf_hdr_write(self.htsfile, self.header.ptr)
cdef int ret = hts_close(self.htsfile)
self.htsfile = NULL
self.header = self.index = None
if ret < 0:
global errno
if errno == EPIPE:
errno = 0
else:
raise IOError(errno, force_str(strerror(errno)))
def __iter__(self):
if not self.is_open:
raise ValueError('I/O operation on closed file')
if self.htsfile.is_write:
raise ValueError('cannot iterate over Variantfile opened for writing')
self.is_reading = 1
return self
def __next__(self):
cdef int ret
cdef int errcode
cdef bcf1_t *record = bcf_init1()
if not record:
raise MemoryError('unable to allocate BCF record')
record.pos = -1
if self.drop_samples:
record.max_unpack = BCF_UN_SHR
with nogil:
ret = bcf_read1(self.htsfile, self.header.ptr, record)
if ret < 0:
errcode = record.errcode
bcf_destroy1(record)
if errcode:
raise IOError('unable to parse next record')
if ret == -1:
raise StopIteration
elif ret == -2:
raise IOError('truncated file')
elif errno:
raise IOError(errno, strerror(errno))
else:
raise IOError('unable to fetch next record')
return makeVariantRecord(self.header, record)
def copy(self):
if not self.is_open:
raise ValueError
cdef VariantFile vars = VariantFile.__new__(VariantFile)
cdef bcf_hdr_t *hdr
# FIXME: re-open using fd or else header and index could be invalid
vars.htsfile = self._open_htsfile()
if not vars.htsfile:
raise ValueError('Cannot re-open htsfile')
# minimize overhead by re-using header and index. This approach is
# currently risky, but see above for how this can be mitigated.
vars.header = self.header
vars.index = self.index
vars.filename = self.filename
vars.mode = self.mode
vars.threads = self.threads
vars.index_filename = self.index_filename
vars.drop_samples = self.drop_samples
vars.is_stream = self.is_stream
vars.is_remote = self.is_remote
vars.is_reading = self.is_reading
vars.start_offset = self.start_offset
vars.header_written = self.header_written
if self.htsfile.is_bin:
vars.seek(self.tell())
else:
with nogil:
hdr = bcf_hdr_read(vars.htsfile)
makeVariantHeader(hdr)
return vars
def open(self, filename, mode='r',
index_filename=None,
VariantHeader header=None,
drop_samples=False,
duplicate_filehandle=True,
ignore_truncation=False,
threads=1):
"""open a vcf/bcf file.
If open is called on an existing VariantFile, the current file will be
closed and a new file will be opened.
"""
cdef bcf_hdr_t *hdr
cdef BGZF *bgzfp
cdef hts_idx_t *idx
cdef tbx_t *tidx
cdef char *cfilename
cdef char *cindex_filename = NULL
cdef char *cmode
if threads > 1 and ignore_truncation:
# This won't raise errors if reaching a truncated alignment,
# because bgzf_mt_reader in htslib does not deal with
# bgzf_mt_read_block returning non-zero values, contrary
# to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
# Better to avoid this (for now) than to produce seemingly correct results.
raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
self.threads = threads
# close a previously opened file
if self.is_open:
self.close()
if not mode or mode[0] not in 'rwa':
raise ValueError('mode must begin with r, w or a')
self.duplicate_filehandle = duplicate_filehandle
format_modes = [m for m in mode[1:] if m in 'bcguz']
if len(format_modes) > 1:
raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
if invalid_modes:
raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
# Autodetect mode from filename
if mode == 'w' and isinstance(filename, str):
if filename.endswith('.gz'):
mode = 'wz'
elif filename.endswith('.bcf'):
mode = 'wb'
# for htslib, wbu seems to not work
if mode == 'wbu':
mode = 'wb0'
self.mode = mode = force_bytes(mode)
try:
filename = encode_filename(filename)
self.is_remote = hisremote(filename)
self.is_stream = filename == b'-'
except TypeError:
filename = filename
self.is_remote = False
self.is_stream = True
self.filename = filename
if index_filename is not None:
self.index_filename = index_filename = encode_filename(index_filename)
else:
self.index_filename = None
self.drop_samples = bool(drop_samples)
self.header = None
self.header_written = False
if mode.startswith(b'w'):
# open file for writing
if index_filename is not None:
raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
# header structure (used for writing)
if header:
self.header = header.copy()
else:
self.header = VariantHeader()
#raise ValueError('a VariantHeader must be specified')
# Header is not written until the first write or on close
self.htsfile = self._open_htsfile()
if not self.htsfile:
raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
elif mode.startswith(b'r'):
# open file for reading
self.htsfile = self._open_htsfile()
if not self.htsfile:
if errno:
raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno))))
else:
raise ValueError('could not open variant file `{}`'.format(filename))
if self.htsfile.format.format not in (bcf, vcf):
raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
self.check_truncation(ignore_truncation)
with nogil:
hdr = bcf_hdr_read(self.htsfile)
try:
self.header = makeVariantHeader(hdr)
except ValueError:
raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
if isinstance(self.filename, bytes):
cfilename = self.filename
else:
cfilename = NULL
# check for index and open if present
if self.htsfile.format.format == bcf and cfilename:
if index_filename is not None:
cindex_filename = index_filename
with nogil:
idx = bcf_index_load2(cfilename, cindex_filename)
self.index = makeBCFIndex(self.header, idx)
elif self.htsfile.format.compression == bgzf and cfilename:
if index_filename is not None:
cindex_filename = index_filename
with nogil:
tidx = tbx_index_load2(cfilename, cindex_filename)
self.index = makeTabixIndex(tidx)
if not self.is_stream:
self.start_offset = self.tell()
else:
raise ValueError('unknown mode {}'.format(mode))
def reset(self):
"""reset file position to beginning of file just after the header."""
return self.seek(self.start_offset)
def is_valid_tid(self, tid):
"""
return True if the numerical :term:`tid` is valid; False otherwise.
returns -1 if reference is not known.
"""
if not self.is_open:
raise ValueError('I/O operation on closed file')
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int rid = tid
return 0 <= rid < hdr.n[BCF_DT_CTG]
def get_tid(self, reference):
"""
return the numerical :term:`tid` corresponding to
:term:`reference`
returns -1 if reference is not known.
"""
if not self.is_open:
raise ValueError('I/O operation on closed file')
cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
reference = force_bytes(reference)
cdef khint_t k = kh_get_vdict(d, reference)
return kh_val_vdict(d, k).id if k != kh_end(d) else -1
def get_reference_name(self, tid):
"""
return :term:`reference` name corresponding to numerical :term:`tid`
"""
if not self.is_open:
raise ValueError('I/O operation on closed file')
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int rid = tid
if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
raise ValueError('Invalid tid')
return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
"""fetch records in a :term:`region`, specified either by
:term:`contig`, *start*, and *end* (which are 0-based, half-open);
or alternatively by a samtools :term:`region` string (which is
1-based inclusive).
Without *contig* or *region* all mapped records will be fetched. The
records will be returned ordered by contig, which will not necessarily
be the order within the file.
Set *reopen* to true if you will be using multiple iterators on the
same file at the same time. The iterator returned will receive its
own copy of a filehandle to the file effectively re-opening the
file. Re-opening a file incurrs some overhead, so use with care.
If only *contig* is set, all records on *contig* will be fetched.
If both *region* and *contig* are given, an exception is raised.
Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
(.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
read sequentially.
"""
if not self.is_open:
raise ValueError('I/O operation on closed file')
if self.htsfile.is_write:
raise ValueError('cannot fetch from Variantfile opened for writing')
if contig is None and region is None:
self.is_reading = 1
bcf = self.copy() if reopen else self
bcf.seek(self.start_offset)
return iter(bcf)
if self.index is None:
raise ValueError('fetch requires an index')
_, tid, start, stop = self.parse_region(contig, start, stop, region,
None, end=end, reference=reference)
if contig is None:
contig = self.get_reference_name(tid)
self.is_reading = 1
return self.index.fetch(self, contig, start, stop, reopen)
def new_record(self, *args, **kwargs):
"""Create a new empty :class:`VariantRecord`.
See :meth:`VariantHeader.new_record`
"""
return self.header.new_record(*args, **kwargs)
cpdef int write(self, VariantRecord record) except -1:
"""
write a single :class:`pysam.VariantRecord` to disk.
returns the number of bytes written.
"""
if record is None:
raise ValueError('record must not be None')
if not self.is_open:
return ValueError('I/O operation on closed file')
if not self.htsfile.is_write:
raise ValueError('cannot write to a Variantfile opened for reading')
if not self.header_written:
self.header_written = True
with nogil:
bcf_hdr_write(self.htsfile, self.header.ptr)
#if record.header is not self.header:
# record.translate(self.header)
# raise ValueError('Writing records from a different VariantFile is not yet supported')
if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
# Sync END annotation before writing
bcf_sync_end(record)
cdef int ret
with nogil:
ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
if ret < 0:
raise IOError(errno, strerror(errno))
return ret
def subset_samples(self, include_samples):
"""
Read only a subset of samples to reduce processing time and memory.
Must be called prior to retrieving records.
"""
if not self.is_open:
raise ValueError('I/O operation on closed file')
if self.htsfile.is_write:
raise ValueError('cannot subset samples from Variantfile opened for writing')
if self.is_reading:
raise ValueError('cannot subset samples after fetching records')
self.header._subset_samples(include_samples)
# potentially unnecessary optimization that also sets max_unpack
if not include_samples:
self.drop_samples = True