NCTC / models /research /seq2species /seq2label_utils.py
NCTCMumbai's picture
Upload 2571 files
0b8359d
raw
history blame
3.49 kB
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for working with Seq2Label datasets and models.
This library provides utilities for parsing and generating Seq2Label protos.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from protos import seq2label_pb2
def get_all_label_values(dataset_info):
  """Collects the permissible values of every label in a dataset description.

  Args:
    dataset_info: a `Seq2LabelDatasetInfo` message; its `labels` field is
      iterated, and each entry's `name` and `values` are read.

  Returns:
    A dict keyed by label name, where each value is a tuple holding that
    label's values in the order they appear in the message.
  """
  values_by_label = {}
  for label in dataset_info.labels:
    # Freeze the repeated field into a tuple so the result is detached from
    # the message container.
    values_by_label[label.name] = tuple(label.values)
  return values_by_label
def construct_seq2label_model_info(hparams, model_type, targets, metadata_path,
                                   batch_size, num_filters,
                                   training_noise_rate):
  """Builds a `Seq2LabelModelInfo` proto describing a model configuration.

  Args:
    hparams: hyperparameter object exposing `to_json()` (documented by the
      original authors as an initialized tf.contrib.training.HParams).
    model_type: string; descriptive tag for the kind of model, e.g. "conv".
    targets: iterable of names of the targets the model is trained to predict.
    metadata_path: string; full path to the Seq2LabelDatasetInfo text proto
      used to initialize the model.
    batch_size: int; number of reads per mini-batch.
    num_filters: int; number of filters for the convolutional model.
    training_noise_rate: float; rate in [0.0, 1.0] of base-flipping noise
      injected into input read sequences at training time.

  Returns:
    A `Seq2LabelModelInfo` proto whose `hparams_string` holds the JSON
    serialization of `hparams`, whose `targets` holds the target names in
    sorted order, and whose remaining fields are set to the given arguments.
  """
  field_values = {
      # Hyperparameters are stored in serialized (JSON) form.
      'hparams_string': hparams.to_json(),
      'model_type': model_type,
      # Sorting makes the stored target list independent of caller ordering.
      'targets': sorted(targets),
      'num_filters': num_filters,
      'batch_size': batch_size,
      'metadata_path': metadata_path,
      'training_noise_rate': training_noise_rate,
  }
  return seq2label_pb2.Seq2LabelModelInfo(**field_values)
def add_read_noise(read, base_flip_probability=0.01):
  """Adds base-flipping noise to the given read sequence.

  Each position is independently selected for flipping with probability
  `base_flip_probability`. A selected position is replaced by a base drawn
  uniformly from the characters of 'ACTG' that differ from the current one
  (a character outside 'ACTG' may therefore become any of the four bases).

  Args:
    read: string; the read sequence to which to add noise.
    base_flip_probability: float; probability of a base flip at each position.

  Returns:
    The given read with base-flipping noise added at the provided
    base_flip_probability rate. The input is returned unchanged when no
    position is selected (including for an empty read).
  """
  flip_mask = np.random.binomial(1, base_flip_probability, len(read)) == 1
  if not flip_mask.any():
    return read

  bases = np.array(list(read))
  # Sample each replacement eagerly. The previous implementation assigned the
  # result of `map(...)` into the array, which only worked on Python 2 where
  # `map` returns a list; on Python 3 it is a lazy iterator that NumPy cannot
  # use as a sequence of values.
  for position in np.flatnonzero(flip_mask):
    current = bases[position]
    alternatives = [base for base in 'ACTG' if base != current]
    bases[position] = np.random.choice(alternatives)
  return ''.join(bases)