jayke committed · Commit cf64894 · 1 Parent(s): ea52087
model_repo_stateful/decoder/1/.gitignore ADDED
File without changes
model_repo_stateful/decoder/1/decoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2272437072ed614f41591e42633e4cc3a1c63729ff490a1c3c89a789a05eb70a
+ size 56292294
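Note: this entry (and `encoder.onnx` below) is a Git LFS pointer file, not the ONNX weights themselves; the `oid` and `size` fields identify the actual blob, which a plain checkout must fetch separately (e.g. via `git lfs pull`). A minimal sketch of reading such a pointer, assuming a local checkout path:

# Minimal sketch: inspect a Git LFS pointer file (path assumes a local checkout).
def read_lfs_pointer(pointer_path: str) -> dict:
    fields = {}
    with open(pointer_path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value  # keys: "version", "oid", "size"
    return fields

info = read_lfs_pointer("model_repo_stateful/decoder/1/decoder.onnx")
print(info["oid"], info["size"])  # sha256:2272... 56292294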
model_repo_stateful/decoder/config.pbtxt ADDED
@@ -0,0 +1,55 @@
+
+ name: "decoder"
+ backend: "onnxruntime"
+ default_model_filename: "decoder.onnx"
+
+ max_batch_size: 640
+ input [
+   {
+     name: "encoder_out"
+     data_type: TYPE_FP16
+     dims: [-1, 512] # [-1, output_size]
+   },
+   {
+     name: "encoder_out_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+     reshape: { shape: [ ] }
+   },
+   {
+     name: "hyps_pad_sos_eos"
+     data_type: TYPE_INT64
+     dims: [10, -1]
+   },
+   {
+     name: "hyps_lens_sos"
+     data_type: TYPE_INT32
+     dims: [10]
+   },
+   {
+     name: "ctc_score"
+     data_type: TYPE_FP16
+     dims: [10]
+   }
+ ]
+
+ output [
+   {
+     name: "best_index"
+     data_type: TYPE_INT64
+     dims: [1]
+     reshape: { shape: [ ] }
+   }
+ ]
+
+ dynamic_batching {
+   preferred_batch_size: [ 16, 32 ]
+ }
+
+ instance_group [
+   {
+     count: 2
+     kind: KIND_GPU
+   }
+ ]
+
model_repo_stateful/decoder/config_template.pbtxt ADDED
@@ -0,0 +1,73 @@
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ name: "decoder"
+ backend: "onnxruntime"
+ default_model_filename: "decoder.onnx"
+
+ max_batch_size: 640
+ input [
+   {
+     name: "encoder_out"
+     data_type: TYPE_#DTYPE
+     dims: [-1, #output_size]
+   },
+   {
+     name: "encoder_out_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+     reshape: { shape: [ ] }
+   },
+   {
+     name: "hyps_pad_sos_eos"
+     data_type: TYPE_INT64
+     dims: [#beam_size, -1]
+   },
+   {
+     name: "hyps_lens_sos"
+     data_type: TYPE_INT32
+     dims: [#beam_size]
+   },
+   {
+     name: "r_hyps_pad_sos_eos"
+     data_type: TYPE_INT64
+     dims: [#beam_size, -1]
+   },
+   {
+     name: "ctc_score"
+     data_type: TYPE_#DTYPE
+     dims: [#beam_size]
+   }
+ ]
+
+ output [
+   {
+     name: "best_index"
+     data_type: TYPE_INT64
+     dims: [1]
+     reshape: { shape: [ ] }
+   }
+ ]
+
+ dynamic_batching {
+   preferred_batch_size: [ 16, 32 ]
+ }
+
+ instance_group [
+   {
+     count: 2
+     kind: KIND_GPU
+   }
+ ]
+
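The `#`-prefixed tokens in this template (`#DTYPE`, `#output_size`, `#beam_size`) are placeholders that a deployment script substitutes with model-specific values to produce a concrete `config.pbtxt` like the one above (FP16, 512, and 10 respectively; `config_template2.pbtxt` below is the variant without the bidirectional-decoder input `r_hyps_pad_sos_eos`). A minimal sketch of that substitution, assuming plain string replacement and illustrative values:

# Minimal sketch: fill the '#'-placeholders of a config template (values are
# illustrative; the real export pipeline may derive them from the checkpoint).
values = {
    "#DTYPE": "FP16",       # yields data_type: TYPE_FP16
    "#output_size": "512",  # encoder output feature size
    "#beam_size": "10",     # CTC prefix beam width
}
with open("config_template.pbtxt") as f:
    config = f.read()
for placeholder, value in values.items():
    config = config.replace(placeholder, value)
with open("config.pbtxt", "w") as f:
    f.write(config)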
model_repo_stateful/decoder/config_template2.pbtxt ADDED
@@ -0,0 +1,68 @@
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ name: "decoder"
+ backend: "onnxruntime"
+ default_model_filename: "decoder.onnx"
+
+ max_batch_size: 640
+ input [
+   {
+     name: "encoder_out"
+     data_type: TYPE_#DTYPE
+     dims: [-1, #output_size]
+   },
+   {
+     name: "encoder_out_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+     reshape: { shape: [ ] }
+   },
+   {
+     name: "hyps_pad_sos_eos"
+     data_type: TYPE_INT64
+     dims: [#beam_size, -1]
+   },
+   {
+     name: "hyps_lens_sos"
+     data_type: TYPE_INT32
+     dims: [#beam_size]
+   },
+   {
+     name: "ctc_score"
+     data_type: TYPE_#DTYPE
+     dims: [#beam_size]
+   }
+ ]
+
+ output [
+   {
+     name: "best_index"
+     data_type: TYPE_INT64
+     dims: [1]
+     reshape: { shape: [ ] }
+   }
+ ]
+
+ dynamic_batching {
+   preferred_batch_size: [ 16, 32 ]
+ }
+
+ instance_group [
+   {
+     count: 2
+     kind: KIND_GPU
+   }
+ ]
+
model_repo_stateful/encoder/1/.gitignore ADDED
File without changes
model_repo_stateful/encoder/1/encoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9923799f94d885a0e0c798b0d63322565f48636bd67f7b95af7f8b7e4e4b0de
+ size 171905418
model_repo_stateful/encoder/config.pbtxt ADDED
@@ -0,0 +1,109 @@
+
+ name: "encoder"
+ backend: "onnxruntime"
+ default_model_filename: "encoder.onnx"
+
+ max_batch_size: 512
+
+ sequence_batching {
+   max_sequence_idle_microseconds: 5000000
+   oldest {
+     max_candidate_sequences: 1024
+     preferred_batch_size: [32, 64, 128, 256]
+     max_queue_delay_microseconds: 5000
+   }
+   control_input [
+   ]
+   state [
+     {
+       input_name: "offset"
+       output_name: "r_offset"
+       data_type: TYPE_INT64
+       dims: [ 1 ]
+       initial_state: {
+         data_type: TYPE_INT64
+         dims: [ 1 ]
+         zero_data: true
+         name: "initial state"
+       }
+     },
+     {
+       input_name: "att_cache"
+       output_name: "r_att_cache"
+       data_type: TYPE_FP16
+       dims: [ 12, 8, 80, 128 ]
+       initial_state: {
+         data_type: TYPE_FP16
+         dims: [ 12, 8, 80, 128 ]
+         zero_data: true
+         name: "initial state"
+       }
+     },
+     {
+       input_name: "cnn_cache"
+       output_name: "r_cnn_cache"
+       data_type: TYPE_FP16
+       dims: [12, 512, 14]
+       initial_state: {
+         data_type: TYPE_FP16
+         dims: [12, 512, 14]
+         zero_data: true
+         name: "initial state"
+       }
+     },
+     {
+       input_name: "cache_mask"
+       output_name: "r_cache_mask"
+       data_type: TYPE_FP16
+       dims: [1, 80]
+       initial_state: {
+         data_type: TYPE_FP16
+         dims: [1, 80]
+         zero_data: true
+         name: "initial state"
+       }
+     }
+   ]
+ }
+ input [
+   {
+     name: "chunk_xs"
+     data_type: TYPE_FP16
+     dims: [67, 80]
+   },
+   {
+     name: "chunk_lens"
+     data_type: TYPE_INT32
+     dims: [ 1 ]
+     reshape: { shape: [] }
+   }
+ ]
+ output [
+   {
+     name: "log_probs"
+     data_type: TYPE_FP16
+     dims: [-1, 10] # [-1, beam_size]
+   },
+   {
+     name: "log_probs_idx"
+     data_type: TYPE_INT64
+     dims: [-1, 10] # [-1, beam_size]
+   },
+   {
+     name: "chunk_out"
+     data_type: TYPE_FP16
+     dims: [-1, -1]
+   },
+   {
+     name: "chunk_out_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+     reshape: { shape: [] }
+   }
+ ]
+ instance_group [
+   {
+     count: 2
+     kind: KIND_GPU
+   }
+ ]
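Read against the template below, the concrete state shapes decode as att_cache = [num_layers=12, num_head=8, cache_size=80, att_cache_output_size=128], cnn_cache = [num_layers=12, output_size=512, cnn_module_cache=14], and cache_mask = [1, cache_size=80]; Triton zero-fills each state when a sequence starts (`zero_data: true`) and feeds every `r_*` output back as the next request's input. As a rough sketch of what that implies per active sequence (FP16 = 2 bytes; the small int64 offset is ignored):

# Rough sketch: per-sequence state memory implied by this config (FP16 states).
from math import prod

state_dims = {
    "att_cache": (12, 8, 80, 128),  # [num_layers, num_head, cache_size, att_cache_output_size]
    "cnn_cache": (12, 512, 14),     # [num_layers, output_size, cnn_module_cache]
    "cache_mask": (1, 80),          # [1, cache_size]
}
total_bytes = sum(prod(dims) * 2 for dims in state_dims.values())
print(f"{total_bytes / 2**20:.2f} MiB per active sequence")  # ~2.04 MiB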
model_repo_stateful/encoder/config_template.pbtxt ADDED
@@ -0,0 +1,122 @@
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ name: "encoder"
+ backend: "onnxruntime"
+ default_model_filename: "encoder.onnx"
+
+ max_batch_size: 512
+
+ sequence_batching {
+   max_sequence_idle_microseconds: 5000000
+   oldest {
+     max_candidate_sequences: 1024
+     preferred_batch_size: [32, 64, 128, 256]
+     max_queue_delay_microseconds: 5000
+   }
+   control_input [
+   ]
+   state [
+     {
+       input_name: "offset"
+       output_name: "r_offset"
+       data_type: TYPE_INT64
+       dims: [ 1 ]
+       initial_state: {
+         data_type: TYPE_INT64
+         dims: [ 1 ]
+         zero_data: true
+         name: "initial state"
+       }
+     },
+     {
+       input_name: "att_cache"
+       output_name: "r_att_cache"
+       data_type: TYPE_#DTYPE
+       dims: [ #num_layers, #num_head, #cache_size, #att_cache_output_size ]
+       initial_state: {
+         data_type: TYPE_#DTYPE
+         dims: [ #num_layers, #num_head, #cache_size, #att_cache_output_size ]
+         zero_data: true
+         name: "initial state"
+       }
+     },
+     {
+       input_name: "cnn_cache"
+       output_name: "r_cnn_cache"
+       data_type: TYPE_#DTYPE
+       dims: [#num_layers, #output_size, #cnn_module_cache]
+       initial_state: {
+         data_type: TYPE_#DTYPE
+         dims: [#num_layers, #output_size, #cnn_module_cache]
+         zero_data: true
+         name: "initial state"
+       }
+     },
+     {
+       input_name: "cache_mask"
+       output_name: "r_cache_mask"
+       data_type: TYPE_#DTYPE
+       dims: [1, #cache_size]
+       initial_state: {
+         data_type: TYPE_#DTYPE
+         dims: [1, #cache_size]
+         zero_data: true
+         name: "initial state"
+       }
+     }
+   ]
+ }
+ input [
+   {
+     name: "chunk_xs"
+     data_type: TYPE_#DTYPE
+     dims: [#decoding_window, #num_mel_bins]
+   },
+   {
+     name: "chunk_lens"
+     data_type: TYPE_INT32
+     dims: [ 1 ]
+     reshape: { shape: [] }
+   }
+ ]
+ output [
+   {
+     name: "log_probs"
+     data_type: TYPE_#DTYPE
+     dims: [-1, #beam_size]
+   },
+   {
+     name: "log_probs_idx"
+     data_type: TYPE_INT64
+     dims: [-1, #beam_size]
+   },
+   {
+     name: "chunk_out"
+     data_type: TYPE_#DTYPE
+     dims: [-1, #encoder_output_size]
+   },
+   {
+     name: "chunk_out_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+     reshape: { shape: [] }
+   }
+ ]
+ instance_group [
+   {
+     count: 2
+     kind: KIND_GPU
+   }
+ ]
model_repo_stateful/encoder/config_template2.pbtxt ADDED
@@ -0,0 +1,110 @@
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ name: "encoder"
+ backend: "onnxruntime"
+ default_model_filename: "encoder.onnx"
+
+ max_batch_size: 512
+
+ sequence_batching {
+   max_sequence_idle_microseconds: 5000000
+   oldest {
+     max_candidate_sequences: 1024
+     preferred_batch_size: [32, 64, 128, 256]
+     max_queue_delay_microseconds: 5000
+   }
+   control_input [
+   ]
+   state [
+     {
+       input_name: "offset"
+       output_name: "r_offset"
+       data_type: TYPE_INT64
+       dims: [ 1 ]
+       initial_state: {
+         data_type: TYPE_INT64
+         dims: [ 1 ]
+         zero_data: true
+         name: "initial state"
+       }
+     },
+     {
+       input_name: "att_cache"
+       output_name: "r_att_cache"
+       data_type: TYPE_#DTYPE
+       dims: [ #num_layers, #num_head, #cache_size, #att_cache_output_size ]
+       initial_state: {
+         data_type: TYPE_#DTYPE
+         dims: [ #num_layers, #num_head, #cache_size, #att_cache_output_size ]
+         zero_data: true
+         name: "initial state"
+       }
+     },
+     {
+       input_name: "cache_mask"
+       output_name: "r_cache_mask"
+       data_type: TYPE_#DTYPE
+       dims: [1, #cache_size]
+       initial_state: {
+         data_type: TYPE_#DTYPE
+         dims: [1, #cache_size]
+         zero_data: true
+         name: "initial state"
+       }
+     }
+   ]
+ }
+ input [
+   {
+     name: "chunk_xs"
+     data_type: TYPE_#DTYPE
+     dims: [#decoding_window, #num_mel_bins]
+   },
+   {
+     name: "chunk_lens"
+     data_type: TYPE_INT32
+     dims: [ 1 ]
+     reshape: { shape: [] }
+   }
+ ]
+ output [
+   {
+     name: "log_probs"
+     data_type: TYPE_#DTYPE
+     dims: [-1, #beam_size]
+   },
+   {
+     name: "log_probs_idx"
+     data_type: TYPE_INT64
+     dims: [-1, #beam_size]
+   },
+   {
+     name: "chunk_out"
+     data_type: TYPE_#DTYPE
+     dims: [-1, #encoder_output_size]
+   },
+   {
+     name: "chunk_out_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+     reshape: { shape: [] }
+   }
+ ]
+ instance_group [
+   {
+     count: 2
+     kind: KIND_GPU
+   }
+ ]
model_repo_stateful/feature_extractor/1/__pycache__/model.cpython-38.pyc ADDED
Binary file (7.7 kB)
model_repo_stateful/feature_extractor/1/model.py ADDED
@@ -0,0 +1,277 @@
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import triton_python_backend_utils as pb_utils
+ from torch.utils.dlpack import from_dlpack
+ import torch
+ import kaldifeat
+ import _kaldifeat
+ from typing import List
+ import json
+ import numpy as np
+
+
+ class Fbank(torch.nn.Module):
+     def __init__(self, opts):
+         super(Fbank, self).__init__()
+         self.fbank = kaldifeat.Fbank(opts)
+
+     def forward(self, waves: List[torch.Tensor]):
+         return self.fbank(waves)
+
+
+ class Feat(object):
+     def __init__(
+         self, seqid, offset_ms, sample_rate, first_chunk_sz, frame_stride, device="cpu"
+     ):
+         self.seqid = seqid
+         self.sample_rate = sample_rate
+         self.wav = torch.tensor([], device=device)
+         self.offset = int(offset_ms / 1000 * sample_rate)
+         self.frames = None
+         self.frame_stride = int(frame_stride)
+         self.first_chunk_sz = first_chunk_sz
+         self.device = device
+
+     def add_wavs(self, wav: torch.Tensor):
+         if len(self.wav) == 0 and len(wav) < self.first_chunk_sz:
+             raise Exception("Invalid first chunk size", len(wav))
+         wav = wav.to(self.device)
+         self.wav = torch.cat([self.wav, wav], axis=0)
+
+     def get_seg_wav(self):
+         seg = self.wav[:]
+         self.wav = self.wav[-self.offset:]
+         return seg
+
+     def add_frames(self, frames: torch.Tensor):
+         """
+         frames: seq_len x feat_sz
+         """
+         if self.frames is None:
+             self.frames = frames
+         else:
+             self.frames = torch.cat([self.frames, frames], axis=0)
+
+     def get_frames(self, num_frames: int):
+         seg = self.frames[0:num_frames]
+         self.frames = self.frames[self.frame_stride:]
+         return seg
+
+
+ class TritonPythonModel:
+     """Your Python model must use the same class name. Every Python model
+     that is created must have "TritonPythonModel" as the class name.
+     """
+
+     def initialize(self, args):
+         """`initialize` is called only once when the model is being loaded.
+         Implementing `initialize` function is optional. This function allows
+         the model to initialize any state associated with this model.
+         Parameters
+         ----------
+         args : dict
+           Both keys and values are strings. The dictionary keys and values are:
+           * model_config: A JSON string containing the model configuration
+           * model_instance_kind: A string containing model instance kind
+           * model_instance_device_id: A string containing model instance device ID
+           * model_repository: Model repository path
+           * model_version: Model version
+           * model_name: Model name
+         """
+         self.model_config = model_config = json.loads(args["model_config"])
+         self.max_batch_size = max(model_config["max_batch_size"], 1)
+
+         if "GPU" in model_config["instance_group"][0]["kind"]:
+             self.device = "cuda"
+         else:
+             self.device = "cpu"
+
+         # Get OUTPUT0 configuration
+         output0_config = pb_utils.get_output_config_by_name(model_config, "speech")
+         # Convert Triton types to numpy types
+         self.output0_dtype = pb_utils.triton_string_to_numpy(
+             output0_config["data_type"]
+         )
+
+         if self.output0_dtype == np.float32:
+             self.dtype = torch.float32
+         else:
+             self.dtype = torch.float16
+
+         self.feature_size = output0_config["dims"][-1]
+         self.decoding_window = output0_config["dims"][-2]
+         # Get OUTPUT1 configuration
+         output1_config = pb_utils.get_output_config_by_name(
+             model_config, "speech_lengths"
+         )
+         # Convert Triton types to numpy types
+         self.output1_dtype = pb_utils.triton_string_to_numpy(
+             output1_config["data_type"]
+         )
+
+         feat_opt = self.parse_model_params(model_config["parameters"])
+
+         opts = kaldifeat.FbankOptions()
+         opts.frame_opts.dither = 0
+         opts.mel_opts.num_bins = self.feature_size
+         frame_length_ms = feat_opt["frame_length_ms"]
+         frame_shift_ms = feat_opt["frame_shift_ms"]
+         opts.frame_opts.frame_length_ms = frame_length_ms
+         opts.frame_opts.frame_shift_ms = frame_shift_ms
+         opts.frame_opts.samp_freq = feat_opt["sample_rate"]
+         opts.device = torch.device(self.device)
+         self.opts = opts
+         self.feature_extractor = Fbank(self.opts)
+         self.seq_feat = {}
+         chunk_size_s = feat_opt["chunk_size_s"]
+         sample_rate = feat_opt["sample_rate"]
+         self.chunk_size = int(chunk_size_s * sample_rate)
+         self.frame_stride = (chunk_size_s * 1000) // frame_shift_ms
+
+         # grow the first chunk until it covers one full decoding window
+         first_chunk_size = int(self.chunk_size)
+         cur_frames = _kaldifeat.num_frames(first_chunk_size, opts.frame_opts)
+         while cur_frames < self.decoding_window:
+             first_chunk_size += frame_shift_ms * sample_rate // 1000
+             cur_frames = _kaldifeat.num_frames(first_chunk_size, opts.frame_opts)
+         # self.pad_silence = first_chunk_size - self.chunk_size
+         self.first_chunk_size = first_chunk_size
+         self.offset_ms = self.get_offset(frame_length_ms, frame_shift_ms)
+         self.sample_rate = sample_rate
+         self.min_seg = frame_length_ms * sample_rate // 1000
+         print("MIN SEG IS", self.min_seg)
+
+     def get_offset(self, frame_length_ms, frame_shift_ms):
+         offset_ms = 0
+         while offset_ms + frame_shift_ms < frame_length_ms:
+             offset_ms += frame_shift_ms
+         return offset_ms
+
+     def parse_model_params(self, model_params):
+         model_p = {
+             "frame_length_ms": 25,
+             "frame_shift_ms": 10,
+             "sample_rate": 16000,
+             "chunk_size_s": 0.64,
+         }
+         # get parameter configurations
+         for li in model_params.items():
+             key, value = li
+             true_value = value["string_value"]
+             if key not in model_p:
+                 continue
+             key_type = type(model_p[key])
+             if key_type == type(None):
+                 model_p[key] = true_value
+             else:
+                 model_p[key] = key_type(true_value)
+         return model_p
+
+     def execute(self, requests):
+         """`execute` must be implemented in every Python model. `execute`
+         function receives a list of pb_utils.InferenceRequest as the only
+         argument. This function is called when an inference is requested
+         for this model.
+         Parameters
+         ----------
+         requests : list
+             A list of pb_utils.InferenceRequest
+         Returns
+         -------
+         list
+             A list of pb_utils.InferenceResponse. The length of this list must
+             be the same as `requests`
+         """
+         total_waves = []
+         responses = []
+         batch_seqid = []
+         end_seqid = {}
+         for request in requests:
+             input0 = pb_utils.get_input_tensor_by_name(request, "wav")
+             # wavs = input0.as_numpy()[0]
+             wavs = from_dlpack(input0.to_dlpack())[0]
+
+             input1 = pb_utils.get_input_tensor_by_name(request, "wav_lens")
+             # wav_lens = input1.as_numpy()[0][0]
+             wav_lens = from_dlpack(input1.to_dlpack())[0]
+             in_start = pb_utils.get_input_tensor_by_name(request, "START")
+             start = in_start.as_numpy()[0][0]
+             in_ready = pb_utils.get_input_tensor_by_name(request, "READY")
+             ready = in_ready.as_numpy()[0][0]
+             in_corrid = pb_utils.get_input_tensor_by_name(request, "CORRID")
+             corrid = in_corrid.as_numpy()[0][0]
+             in_end = pb_utils.get_input_tensor_by_name(request, "END")
+             end = in_end.as_numpy()[0][0]
+             print(wavs.size(), wav_lens, ready, start, corrid, end)
+             if start:
+                 self.seq_feat[corrid] = Feat(
+                     corrid,
+                     self.offset_ms,
+                     self.sample_rate,
+                     self.first_chunk_size,
+                     self.frame_stride,
+                     self.device,
+                 )
+             if ready:
+                 self.seq_feat[corrid].add_wavs(wavs[0:wav_lens])
+
+             batch_seqid.append(corrid)
+             if end:
+                 end_seqid[corrid] = 1
+
+             # if not start
+             # check chunk ms size
+
+             wav = self.seq_feat[corrid].get_seg_wav() * 32768
+             if len(wav) < self.min_seg:
+                 temp = torch.zeros(
+                     self.min_seg, dtype=torch.float32, device=self.device
+                 )
+                 temp[0:len(wav)] = wav[:]
+                 wav = temp
+             total_waves.append(wav)
+
+         features = self.feature_extractor(total_waves)
+
+         batch_size = len(batch_seqid)
+         batch_speech = torch.zeros(
+             (batch_size, self.decoding_window, self.feature_size), dtype=self.dtype
+         )
+         batch_speech_lens = torch.zeros((batch_size, 1), dtype=torch.int32)
+         i = 0
+         for corrid, frames in zip(batch_seqid, features):
+             self.seq_feat[corrid].add_frames(frames)
+             r_frames = self.seq_feat[corrid].get_frames(self.decoding_window)
+             speech = batch_speech[i:i + 1]
+             speech_lengths = batch_speech_lens[i:i + 1]
+             i += 1
+             speech_lengths[0] = r_frames.size(0)
+             speech[0][0:r_frames.size(0)] = r_frames.to(speech.device)
+             # out_tensor0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech))
+             # out_tensor1 = pb_utils.Tensor.from_dlpack("speech_lengths",
+             #                                           to_dlpack(speech_lengths))
+             out_tensor0 = pb_utils.Tensor("speech", speech.numpy())
+             out_tensor1 = pb_utils.Tensor("speech_lengths", speech_lengths.numpy())
+             output_tensors = [out_tensor0, out_tensor1]
+             response = pb_utils.InferenceResponse(output_tensors=output_tensors)
+             responses.append(response)
+             if corrid in end_seqid:
+                 del self.seq_feat[corrid]
+             print(
+                 f"feature extractor results: corrid is {corrid}, speech is {speech.numpy()}, speech_lengths is {speech_lengths.numpy()}"
+             )
+         return responses
+
+     def finalize(self):
+         print("Remove feature extractor!")
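With the defaults above (25 ms frames, 10 ms shift, 16 kHz audio, 0.64 s chunks), a chunk is 10240 samples, which yields only 62 fbank frames, while the encoder expects a 67-frame decoding window; `initialize` therefore grows `first_chunk_size` one frame shift (160 samples) at a time until the first chunk alone covers the window. A minimal sketch of that arithmetic, assuming kaldifeat's snip-edges frame count `1 + (samples - window) // shift`:

# Minimal sketch of the first-chunk sizing in initialize(), assuming
# snip-edges framing: num_frames = 1 + (samples - window) // shift.
sample_rate = 16000
window = 25 * sample_rate // 1000   # 400 samples (frame_length_ms = 25)
shift = 10 * sample_rate // 1000    # 160 samples (frame_shift_ms = 10)
decoding_window = 67                # frames, from the "speech" output dims

def num_frames(samples: int) -> int:
    return 1 + (samples - window) // shift

first_chunk_size = int(0.64 * sample_rate)      # 10240 samples -> 62 frames
while num_frames(first_chunk_size) < decoding_window:
    first_chunk_size += shift                   # each step adds one frame
print(first_chunk_size, num_frames(first_chunk_size))  # 11040 67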
model_repo_stateful/feature_extractor/config.pbtxt ADDED
@@ -0,0 +1,98 @@
+
+ name: "feature_extractor"
+ backend: "python"
+ max_batch_size: 512
+
+ parameters [
+   {
+     key: "frame_length_ms",
+     value: { string_value: "25" }
+   },
+   {
+     key: "frame_shift_ms"
+     value: { string_value: "10" }
+   },
+   {
+     key: "sample_rate"
+     value: { string_value: "16000" }
+   },
+   {
+     key: "chunk_size_s",
+     value: { string_value: "0.64" }
+   }
+ ]
+ sequence_batching {
+   max_sequence_idle_microseconds: 5000000
+   oldest {
+     max_candidate_sequences: 512
+     preferred_batch_size: [ 32, 64, 128, 256 ]
+   }
+   control_input [
+     {
+       name: "START",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_START
+           fp32_false_true: [0, 1]
+         }
+       ]
+     },
+     {
+       name: "READY"
+       control [
+         {
+           kind: CONTROL_SEQUENCE_READY
+           fp32_false_true: [0, 1]
+         }
+       ]
+     },
+     {
+       name: "CORRID",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_CORRID
+           data_type: TYPE_UINT64
+         }
+       ]
+     },
+     {
+       name: "END",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_END
+           fp32_false_true: [0, 1]
+         }
+       ]
+     }
+   ]
+ }
+ input [
+   {
+     name: "wav"
+     data_type: TYPE_FP32
+     dims: [-1]
+   },
+   {
+     name: "wav_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+   }
+ ]
+ output [
+   {
+     name: "speech"
+     data_type: TYPE_FP16 # FP32
+     dims: [67, 80]
+   },
+   {
+     name: "speech_lengths"
+     data_type: TYPE_INT32
+     dims: [1]
+   }
+ ]
+ instance_group [
+   {
+     count: 2
+     kind: KIND_GPU
+   }
+ ]
model_repo_stateful/feature_extractor/config_template.pbtxt ADDED
@@ -0,0 +1,111 @@
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ name: "feature_extractor"
+ backend: "python"
+ max_batch_size: 512
+
+ parameters [
+   {
+     key: "frame_length_ms",
+     value: { string_value: "#frame_length" }
+   },
+   {
+     key: "frame_shift_ms"
+     value: { string_value: "#frame_shift" }
+   },
+   {
+     key: "sample_rate"
+     value: { string_value: "#sample_rate" }
+   },
+   {
+     key: "chunk_size_s",
+     value: { string_value: "#chunk_size_in_seconds" }
+   }
+ ]
+ sequence_batching {
+   max_sequence_idle_microseconds: 5000000
+   oldest {
+     max_candidate_sequences: 512
+     preferred_batch_size: [ 32, 64, 128, 256 ]
+   }
+   control_input [
+     {
+       name: "START",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_START
+           fp32_false_true: [0, 1]
+         }
+       ]
+     },
+     {
+       name: "READY"
+       control [
+         {
+           kind: CONTROL_SEQUENCE_READY
+           fp32_false_true: [0, 1]
+         }
+       ]
+     },
+     {
+       name: "CORRID",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_CORRID
+           data_type: TYPE_UINT64
+         }
+       ]
+     },
+     {
+       name: "END",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_END
+           fp32_false_true: [0, 1]
+         }
+       ]
+     }
+   ]
+ }
+ input [
+   {
+     name: "wav"
+     data_type: TYPE_FP32
+     dims: [-1]
+   },
+   {
+     name: "wav_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+   }
+ ]
+ output [
+   {
+     name: "speech"
+     data_type: TYPE_#DTYPE # FP32
+     dims: [#decoding_window, #num_mel_bins]
+   },
+   {
+     name: "speech_lengths"
+     data_type: TYPE_INT32
+     dims: [1]
+   }
+ ]
+ instance_group [
+   {
+     count: 2
+     kind: KIND_GPU
+   }
+ ]
model_repo_stateful/streaming_wenet/1/.gitignore ADDED
File without changes
model_repo_stateful/streaming_wenet/config.pbtxt ADDED
@@ -0,0 +1,102 @@
+
+ name: "streaming_wenet"
+ platform: "ensemble"
+ max_batch_size: 512 #MAX_BATCH
+
+ input [
+   {
+     name: "WAV"
+     data_type: TYPE_FP32
+     dims: [-1]
+   },
+   {
+     name: "WAV_LENS"
+     data_type: TYPE_INT32
+     dims: [1]
+   }
+ ]
+
+ output [
+   {
+     name: "TRANSCRIPTS"
+     data_type: TYPE_STRING
+     dims: [1]
+   }
+ ]
+
+ ensemble_scheduling {
+   step [
+     {
+       model_name: "feature_extractor"
+       model_version: -1
+       input_map {
+         key: "wav"
+         value: "WAV"
+       }
+       input_map {
+         key: "wav_lens"
+         value: "WAV_LENS"
+       }
+       output_map {
+         key: "speech"
+         value: "SPEECH"
+       }
+       output_map {
+         key: "speech_lengths"
+         value: "SPEECH_LENGTHS"
+       }
+     },
+     {
+       model_name: "encoder"
+       model_version: -1
+       input_map {
+         key: "chunk_xs"
+         value: "SPEECH"
+       }
+       input_map {
+         key: "chunk_lens"
+         value: "SPEECH_LENGTHS"
+       }
+       output_map {
+         key: "log_probs"
+         value: "LOG_PROBS"
+       }
+       output_map {
+         key: "log_probs_idx"
+         value: "LOG_PROBS_IDX"
+       }
+       output_map {
+         key: "chunk_out"
+         value: "CHUNK_OUT"
+       }
+       output_map {
+         key: "chunk_out_lens"
+         value: "CHUNK_OUT_LENS"
+       }
+     },
+     {
+       model_name: "wenet"
+       model_version: -1
+       input_map {
+         key: "log_probs"
+         value: "LOG_PROBS"
+       }
+       input_map {
+         key: "log_probs_idx"
+         value: "LOG_PROBS_IDX"
+       }
+       input_map {
+         key: "chunk_out"
+         value: "CHUNK_OUT"
+       }
+       input_map {
+         key: "chunk_out_lens"
+         value: "CHUNK_OUT_LENS"
+       }
+       output_map {
+         key: "OUTPUT0"
+         value: "TRANSCRIPTS"
+       }
+     }
+   ]
+ }
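Given the ensemble's WAV/WAV_LENS inputs, the TRANSCRIPTS output, and the sequence batchers behind it, a client streams fixed-size chunks under one correlation ID and marks the first and last chunk. A minimal sketch using tritonclient's gRPC API (the server address, chunk size, and synthetic audio are assumptions; a real client would send `first_chunk_size` samples first and `chunk_size` afterwards):

# Minimal client sketch for the "streaming_wenet" ensemble (assumed server
# address and chunk sizes; audio is a stand-in for real 16 kHz speech).
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:8001")
audio = np.zeros(16000 * 2, dtype=np.float32)  # 2 s of silence as a stand-in
chunk = 11040                                  # first_chunk_size of the fbank model
seq_id = 10086                                 # arbitrary correlation ID

offsets = list(range(0, len(audio), chunk))
for i, off in enumerate(offsets):
    seg = audio[off:off + chunk]
    wav = grpcclient.InferInput("WAV", [1, len(seg)], "FP32")
    wav.set_data_from_numpy(seg.reshape(1, -1))
    lens = grpcclient.InferInput("WAV_LENS", [1, 1], "INT32")
    lens.set_data_from_numpy(np.array([[len(seg)]], dtype=np.int32))
    result = client.infer(
        "streaming_wenet",
        inputs=[wav, lens],
        sequence_id=seq_id,
        sequence_start=(i == 0),
        sequence_end=(i == len(offsets) - 1),
    )
    print(result.as_numpy("TRANSCRIPTS"))  # partial transcript per chunk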
model_repo_stateful/streaming_wenet/config_template.pbtxt ADDED
@@ -0,0 +1,115 @@
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ name: "streaming_wenet"
+ platform: "ensemble"
+ max_batch_size: 512 #MAX_BATCH
+
+ input [
+   {
+     name: "WAV"
+     data_type: TYPE_FP32
+     dims: [-1]
+   },
+   {
+     name: "WAV_LENS"
+     data_type: TYPE_INT32
+     dims: [1]
+   }
+ ]
+
+ output [
+   {
+     name: "TRANSCRIPTS"
+     data_type: TYPE_STRING
+     dims: [1]
+   }
+ ]
+
+ ensemble_scheduling {
+   step [
+     {
+       model_name: "feature_extractor"
+       model_version: -1
+       input_map {
+         key: "wav"
+         value: "WAV"
+       }
+       input_map {
+         key: "wav_lens"
+         value: "WAV_LENS"
+       }
+       output_map {
+         key: "speech"
+         value: "SPEECH"
+       }
+       output_map {
+         key: "speech_lengths"
+         value: "SPEECH_LENGTHS"
+       }
+     },
+     {
+       model_name: "encoder"
+       model_version: -1
+       input_map {
+         key: "chunk_xs"
+         value: "SPEECH"
+       }
+       input_map {
+         key: "chunk_lens"
+         value: "SPEECH_LENGTHS"
+       }
+       output_map {
+         key: "log_probs"
+         value: "LOG_PROBS"
+       }
+       output_map {
+         key: "log_probs_idx"
+         value: "LOG_PROBS_IDX"
+       }
+       output_map {
+         key: "chunk_out"
+         value: "CHUNK_OUT"
+       }
+       output_map {
+         key: "chunk_out_lens"
+         value: "CHUNK_OUT_LENS"
+       }
+     },
+     {
+       model_name: "wenet"
+       model_version: -1
+       input_map {
+         key: "log_probs"
+         value: "LOG_PROBS"
+       }
+       input_map {
+         key: "log_probs_idx"
+         value: "LOG_PROBS_IDX"
+       }
+       input_map {
+         key: "chunk_out"
+         value: "CHUNK_OUT"
+       }
+       input_map {
+         key: "chunk_out_lens"
+         value: "CHUNK_OUT_LENS"
+       }
+       output_map {
+         key: "OUTPUT0"
+         value: "TRANSCRIPTS"
+       }
+     }
+   ]
+ }
model_repo_stateful/wenet/1/__pycache__/model.cpython-38.pyc ADDED
Binary file (4.26 kB)
model_repo_stateful/wenet/1/__pycache__/wenet_onnx_model.cpython-38.pyc ADDED
Binary file (6.77 kB)
model_repo_stateful/wenet/1/model.py ADDED
@@ -0,0 +1,180 @@
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import numpy as np
+ import json
+ import torch
+ from swig_decoders import PathTrie, TrieVector
+
+ # triton_python_backend_utils is available in every Triton Python model. You
+ # need to use this module to create inference requests and responses. It also
+ # contains some utility functions for extracting information from model_config
+ # and converting Triton input/output types to numpy types.
+ import triton_python_backend_utils as pb_utils
+ from wenet_onnx_model import WenetModel
+
+ from torch.utils.dlpack import from_dlpack
+
+ class TritonPythonModel:
+     """Your Python model must use the same class name. Every Python model
+     that is created must have "TritonPythonModel" as the class name.
+     """
+
+     def initialize(self, args):
+         """`initialize` is called only once when the model is being loaded.
+         Implementing `initialize` function is optional. This function allows
+         the model to initialize any state associated with this model.
+         Parameters
+         ----------
+         args : dict
+           Both keys and values are strings. The dictionary keys and values are:
+           * model_config: A JSON string containing the model configuration
+           * model_instance_kind: A string containing model instance kind
+           * model_instance_device_id: A string containing model instance device ID
+           * model_repository: Model repository path
+           * model_version: Model version
+           * model_name: Model name
+         """
+
+         # You must parse model_config. JSON string is not parsed here
+         self.model_config = model_config = json.loads(args['model_config'])
+
+         # get device
+         if args["model_instance_kind"] == "GPU":
+             self.device = 'cuda'
+         else:
+             self.device = 'cpu'
+
+         # get parameter configurations
+         self.model = WenetModel(self.model_config, self.device)
+
+         # Get OUTPUT0 configuration
+         output0_config = pb_utils.get_output_config_by_name(
+             model_config, "OUTPUT0")
+
+         # Convert Triton types to numpy types
+         self.output0_dtype = pb_utils.triton_string_to_numpy(
+             output0_config['data_type'])
+
+         # used to record the state of every sequence
+         self.seq_states = {}
+         print("Finish Init")
+
+     def execute(self, requests):
+         """
+         requests : list
+             A list of pb_utils.InferenceRequest
+         Returns
+         -------
+         list
+             A list of pb_utils.InferenceResponse. The length of this list must
+             be the same as `requests`
+         """
+         responses = []
+         batch_log_probs, batch_log_probs_idx, batch_len, batch_states = [], [], [], []
+         cur_encoder_out = []
+
+         batch_encoder_hist = []
+         batch_start = []
+
+         trieVector = TrieVector()
+
+         rescore_index = {}
+         batch_idx2_corrid = {}
+
+         # Every Python backend must iterate over every one of the requests
+         # and create a pb_utils.InferenceResponse for each of them.
+         batch_idx = 0
+         for request in requests:
+             # Get INPUT0
+             in_0 = pb_utils.get_input_tensor_by_name(request, "log_probs")
+             batch_log_probs.append(in_0.as_numpy()[0])
+             in_1 = pb_utils.get_input_tensor_by_name(request, "log_probs_idx")
+             batch_log_probs_idx.append(in_1.as_numpy()[0])
+             if self.model.rescoring:
+                 in_2 = pb_utils.get_input_tensor_by_name(request, "chunk_out")
+                 # important to clone this tensor; otherwise
+                 # it will be released after one inference
+                 in_2 = from_dlpack(in_2.to_dlpack()).clone()
+                 cur_encoder_out.append(in_2[0])
+                 in_3 = pb_utils.get_input_tensor_by_name(request, "chunk_out_lens")
+                 batch_len.append(in_3.as_numpy())
+
+             in_start = pb_utils.get_input_tensor_by_name(request, "START")
+             start = in_start.as_numpy()[0][0]
+
+             if start:
+                 batch_start.append(True)
+             else:
+                 batch_start.append(False)
+
+             in_ready = pb_utils.get_input_tensor_by_name(request, "READY")
+             ready = in_ready.as_numpy()[0][0]
+
+             in_corrid = pb_utils.get_input_tensor_by_name(request, "CORRID")
+             corrid = in_corrid.as_numpy()[0][0]
+
+             in_end = pb_utils.get_input_tensor_by_name(request, "END")
+             end = in_end.as_numpy()[0][0]
+
+             if start and ready:
+                 # initialize states
+                 encoder_out = self.model.generate_init_cache()
+                 root = PathTrie()
+                 # register this sequence
+                 self.seq_states[corrid] = [root, encoder_out]
+
+             if end and ready:
+                 rescore_index[batch_idx] = 1
+
+             if ready:
+                 root, encoder_out = self.seq_states[corrid]
+                 trieVector.append(root)
+                 batch_idx2_corrid[batch_idx] = corrid
+                 batch_encoder_hist.append(encoder_out)
+
+             batch_idx += 1
+
+         batch_states = [trieVector, batch_start, batch_encoder_hist, cur_encoder_out]
+         res_sents, new_states = self.model.infer(batch_log_probs, batch_log_probs_idx,
+                                                  batch_len, rescore_index, batch_states)
+         cur_encoder_out = new_states
+         for i in range(len(res_sents)):
+             sent = np.array(res_sents[i])
+             out_tensor_0 = pb_utils.Tensor("OUTPUT0", sent.astype(self.output0_dtype))
+             response = pb_utils.InferenceResponse(output_tensors=[out_tensor_0])
+             responses.append(response)
+             corr = batch_idx2_corrid[i]
+             if i in rescore_index:
+                 # this sequence ends, remove it
+                 del self.seq_states[corr]
+             else:
+                 if self.model.rescoring:
+                     if self.seq_states[corr][1] is None:
+                         self.seq_states[corr][1] = cur_encoder_out[i]
+                     else:
+                         new_hist = torch.cat([self.seq_states[corr][1],
+                                               cur_encoder_out[i]], axis=0)
+                         self.seq_states[corr][1] = new_hist
+
+         assert len(requests) == len(responses)
+         return responses
+
+     def finalize(self):
+         """`finalize` is called only once when the model is being unloaded.
+         Implementing `finalize` function is OPTIONAL. This function allows
+         the model to perform any necessary clean ups before exit.
+         """
+         print('Cleaning up...')
+         del self.model
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import multiprocessing
17
+ import numpy as np
18
+ import os
19
+ import torch
20
+ import triton_python_backend_utils as pb_utils
21
+ from torch.utils.dlpack import to_dlpack, from_dlpack
22
+ from swig_decoders import ctc_beam_search_decoder_batch, Scorer, map_batch
23
+
24
+ class WenetModel(object):
25
+ def __init__(self, model_config, device):
26
+ params = self.parse_model_parameters(model_config['parameters'])
27
+
28
+ self.device = device
29
+ print("Using device", device)
30
+ print("Successfully load model !")
31
+
32
+ # load vocabulary
33
+ ret = self.load_vocab(params["vocab_path"])
34
+ self.id2vocab, self.vocab, space_id, blank_id, sos_eos = ret
35
+ self.space_id = space_id if space_id else -1
36
+ self.blank_id = blank_id if blank_id else 0
37
+ self.eos = self.sos = sos_eos if sos_eos else len(self.vocab) - 1
38
+ print("Successfully load vocabulary !")
39
+ self.params = params
40
+
41
+ # beam search setting
42
+ self.beam_size = params.get("beam_size")
43
+ self.cutoff_prob = params.get("cutoff_prob")
44
+
45
+ # language model
46
+ lm_path = params.get("lm_path", None)
47
+ alpha, beta = params.get('alpha'), params.get('beta')
48
+ self.scorer = None
49
+ if os.path.exists(lm_path):
50
+ self.scorer = Scorer(alpha, beta, lm_path, self.vocab)
51
+
52
+ self.bidecoder = params.get('bidecoder')
53
+ # rescore setting
54
+ self.rescoring = params.get("rescoring", 0)
55
+ print("Using rescoring:", bool(self.rescoring))
56
+ print("Successfully load all parameters!")
57
+
58
+ log_probs_config = pb_utils.get_input_config_by_name(
59
+ model_config, "log_probs")
60
+ # Convert Triton types to numpy types
61
+ log_probs_dtype = pb_utils.triton_string_to_numpy(
62
+ log_probs_config['data_type'])
63
+
64
+ if log_probs_dtype == np.float32:
65
+ self.dtype = torch.float32
66
+ else:
67
+ self.dtype = torch.float16
68
+
69
+ def generate_init_cache(self):
70
+ encoder_out = None
71
+ return encoder_out
72
+
73
+ def load_vocab(self, vocab_file):
74
+ """
75
+ load lang_char.txt
76
+ """
77
+ id2vocab = {}
78
+ space_id, blank_id, sos_eos = None, None, None
79
+ with open(vocab_file, "r", encoding="utf-8") as f:
80
+ for line in f:
81
+ line = line.strip()
82
+ char, id = line.split()
83
+ id2vocab[int(id)] = char
84
+ if char == " ":
85
+ space_id = int(id)
86
+ elif char == "<blank>":
87
+ blank_id = int(id)
88
+ elif char == "<sos/eos>":
89
+ sos_eos = int(id)
90
+ vocab = [0] * len(id2vocab)
91
+ for id, char in id2vocab.items():
92
+ vocab[id] = char
93
+ return (id2vocab, vocab, space_id, blank_id, sos_eos)
94
+
95
+ def parse_model_parameters(self, model_parameters):
96
+ model_p = {"beam_size": 10,
97
+ "cutoff_prob": 0.999,
98
+ "vocab_path": None,
99
+ "lm_path": None,
100
+ "alpha": 2.0,
101
+ "beta": 1.0,
102
+ "rescoring": 0,
103
+ "bidecoder": 1}
104
+ # get parameter configurations
105
+ for li in model_parameters.items():
106
+ key, value = li
107
+ true_value = value["string_value"]
108
+ if key not in model_p:
109
+ continue
110
+ key_type = type(model_p[key])
111
+ if key_type == type(None):
112
+ model_p[key] = true_value
113
+ else:
114
+ model_p[key] = key_type(true_value)
115
+ assert model_p["vocab_path"] is not None
116
+ return model_p
117
+
118
+ def infer(self, batch_log_probs, batch_log_probs_idx,
119
+ seq_lens, rescore_index, batch_states):
120
+ """
121
+ batch_states = [trieVector, batch_start,
122
+ batch_encoder_hist, cur_encoder_out]
123
+ """
124
+ trie_vector, batch_start, batch_encoder_hist, cur_encoder_out = batch_states
125
+ num_processes = min(multiprocessing.cpu_count(), len(batch_log_probs))
126
+
127
+ score_hyps = self.batch_ctc_prefix_beam_search_cpu(batch_log_probs,
128
+ batch_log_probs_idx,
129
+ seq_lens,
130
+ trie_vector,
131
+ batch_start,
132
+ self.beam_size,
133
+ self.blank_id,
134
+ self.space_id,
135
+ self.cutoff_prob,
136
+ num_processes,
137
+ self.scorer)
138
+
139
+ if self.rescoring and len(rescore_index) != 0:
140
+ # find the end of sequence
141
+ rescore_encoder_hist = []
142
+ rescore_encoder_lens = []
143
+ rescore_hyps = []
144
+ res_idx = list(rescore_index.keys())
145
+ max_length = -1
146
+ for idx in res_idx:
147
+ hist_enc = batch_encoder_hist[idx]
148
+ if hist_enc is None:
149
+ cur_enc = cur_encoder_out[idx]
150
+ else:
151
+ cur_enc = torch.cat([hist_enc, cur_encoder_out[idx]], axis=0)
152
+ rescore_encoder_hist.append(cur_enc)
153
+ cur_mask_len = int(len(hist_enc) + seq_lens[idx])
154
+ rescore_encoder_lens.append(cur_mask_len)
155
+ rescore_hyps.append(score_hyps[idx])
156
+ if cur_enc.shape[0] > max_length:
157
+ max_length = cur_enc.shape[0]
158
+ best_index = self.batch_rescoring(rescore_hyps, rescore_encoder_hist,
159
+ rescore_encoder_lens, max_length)
160
+
161
+ best_sent = []
162
+ j = 0
163
+ for idx, li in enumerate(score_hyps):
164
+ if idx in rescore_index and self.rescoring:
165
+ best_sent.append(li[best_index[j]][1])
166
+ j += 1
167
+ else:
168
+ best_sent.append(li[0][1])
169
+
170
+ final_result = map_batch(best_sent, self.vocab, num_processes)
171
+
172
+ return final_result, cur_encoder_out
173
+
174
+ def batch_ctc_prefix_beam_search_cpu(self, batch_log_probs_seq,
175
+ batch_log_probs_idx,
176
+ batch_len, batch_root,
177
+ batch_start, beam_size,
178
+ blank_id, space_id,
179
+ cutoff_prob, num_processes,
180
+ scorer):
181
+ """
182
+ Return: Batch x Beam_size elements, each element is a tuple
183
+ (score, list of ids),
184
+ """
185
+
186
+ batch_len_list = batch_len
187
+ batch_log_probs_seq_list = []
188
+ batch_log_probs_idx_list = []
189
+ for i in range(len(batch_len_list)):
190
+ cur_len = int(batch_len_list[i])
191
+ batch_log_probs_seq_list.append(batch_log_probs_seq[i][0:cur_len].tolist())
192
+ batch_log_probs_idx_list.append(batch_log_probs_idx[i][0:cur_len].tolist())
193
+ score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq_list,
194
+ batch_log_probs_idx_list,
195
+ batch_root,
196
+ batch_start,
197
+ beam_size,
198
+ num_processes,
199
+ blank_id,
200
+ space_id,
201
+ cutoff_prob,
202
+ scorer)
203
+ return score_hyps
204
+
205
+ def batch_rescoring(self, score_hyps, hist_enc, hist_mask_len, max_len):
206
+ """
207
+ score_hyps: [((ctc_score, (id1, id2, id3, ....)), (), ...), ....]
208
+ hist_enc: [len1xF, len2xF, .....]
209
+ hist_mask: [1x1xlen1, 1x1xlen2]
210
+ return bzx1 best_index
211
+ """
212
+ bz = len(hist_enc)
213
+ f = hist_enc[0].shape[-1]
214
+ beam_size = self.beam_size
215
+ encoder_lens = np.zeros((bz, 1), dtype=np.int32)
216
+ encoder_out = torch.zeros((bz, max_len, f), dtype=self.dtype)
217
+ hyps = []
218
+ ctc_score = torch.zeros((bz, beam_size), dtype=self.dtype)
219
+ max_seq_len = 0
220
+ for i in range(bz):
221
+ cur_len = hist_enc[i].shape[0]
222
+ encoder_out[i, 0:cur_len] = hist_enc[i]
223
+ encoder_lens[i, 0] = hist_mask_len[i]
224
+
225
+ # process candidate
226
+ if len(score_hyps[i]) < beam_size:
227
+ to_append = (beam_size - len(score_hyps[i])) * [(-10000, ())]
228
+ score_hyps[i] = list(score_hyps[i]) + to_append
229
+ for idx, c in enumerate(score_hyps[i]):
230
+ score, idlist = c
231
+ if score < -10000:
232
+ score = -10000
233
+ ctc_score[i][idx] = score
234
+ hyps.append(list(idlist))
235
+ if len(hyps[-1]) > max_seq_len:
236
+ max_seq_len = len(hyps[-1])
237
+
238
+ max_seq_len += 2
239
+ hyps_pad_sos_eos = np.ones((bz, beam_size, max_seq_len), dtype=np.int64)
240
+ hyps_pad_sos_eos = hyps_pad_sos_eos * self.eos # fill eos
241
+ if self.bidecoder:
242
+ r_hyps_pad_sos_eos = np.ones((bz, beam_size, max_seq_len), dtype=np.int64)
243
+ r_hyps_pad_sos_eos = r_hyps_pad_sos_eos * self.eos
244
+
245
+ hyps_lens_sos = np.ones((bz, beam_size), dtype=np.int32)
246
+ bz_id = 0
247
+ for idx, cand in enumerate(hyps):
248
+ bz_id = idx // beam_size
249
+ length = len(cand) + 2
250
+ bz_offset = idx % beam_size
251
+ pad_cand = [self.sos] + cand + [self.eos]
252
+ hyps_pad_sos_eos[bz_id][bz_offset][0 : length] = pad_cand
253
+ if self.bidecoder:
254
+ r_pad_cand = [self.sos] + cand[::-1] + [self.eos]
255
+ r_hyps_pad_sos_eos[bz_id][bz_offset][0:length] = r_pad_cand
256
+ hyps_lens_sos[bz_id][idx % beam_size] = len(cand) + 1
257
+ in0 = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(encoder_out))
258
+ in1 = pb_utils.Tensor("encoder_out_lens", encoder_lens)
259
+ in2 = pb_utils.Tensor("hyps_pad_sos_eos", hyps_pad_sos_eos)
260
+ in3 = pb_utils.Tensor("hyps_lens_sos", hyps_lens_sos)
261
+ input_tensors = [in0, in1, in2, in3]
262
+ if self.bidecoder:
263
+ in4 = pb_utils.Tensor("r_hyps_pad_sos_eos", r_hyps_pad_sos_eos)
264
+ input_tensors.append(in4)
265
+ in5 = pb_utils.Tensor.from_dlpack("ctc_score", to_dlpack(ctc_score))
266
+ input_tensors.append(in5)
267
+ request = pb_utils.InferenceRequest(model_name='decoder',
268
+ requested_output_names=['best_index'],
269
+ inputs=input_tensors)
270
+ response = request.exec()
271
+ best_index = pb_utils.get_output_tensor_by_name(response, 'best_index')
272
+ best_index = from_dlpack(best_index.to_dlpack()).clone()
273
+ best_index = best_index.numpy()[:, 0]
274
+ return best_index
275
+
276
+ def __del__(self):
277
+ print("remove wenet model")
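The padding layout `batch_rescoring` builds for the decoder call is easiest to see on a tiny example: every beam candidate becomes [sos, ids..., eos], right-padded with eos to the longest candidate plus two, and its recorded length is len(ids) + 1 (sos plus tokens, excluding the final eos). A minimal sketch with hypothetical token ids and sos = eos = 4:

# Minimal sketch of the hyps_pad_sos_eos / hyps_lens_sos layout
# (hypothetical ids; one utterance with a beam of two candidates, sos = eos = 4).
import numpy as np

sos = eos = 4
hyps = [(1, 2, 3), (1, 2)]                 # beam candidates as token-id tuples
max_seq_len = max(len(h) for h in hyps) + 2

pad = np.full((1, len(hyps), max_seq_len), eos, dtype=np.int64)
lens = np.ones((1, len(hyps)), dtype=np.int32)
for j, cand in enumerate(hyps):
    pad[0, j, 0:len(cand) + 2] = [sos, *cand, eos]
    lens[0, j] = len(cand) + 1             # sos + tokens, excluding trailing eos
print(pad[0].tolist())   # [[4, 1, 2, 3, 4], [4, 1, 2, 4, 4]]
print(lens[0].tolist())  # [4, 3]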
model_repo_stateful/wenet/config.pbtxt ADDED
@@ -0,0 +1,126 @@
+
+ name: "wenet"
+ backend: "python"
+ max_batch_size: 512
+
+ sequence_batching {
+   max_sequence_idle_microseconds: 5000000
+   oldest {
+     max_candidate_sequences: 1024
+     preferred_batch_size: [32, 64, 128, 256]
+   }
+   control_input [
+     {
+       name: "START",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_START
+           fp32_false_true: [0, 1]
+         }
+       ]
+     },
+     {
+       name: "READY"
+       control [
+         {
+           kind: CONTROL_SEQUENCE_READY
+           fp32_false_true: [0, 1]
+         }
+       ]
+     },
+     {
+       name: "CORRID",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_CORRID
+           data_type: TYPE_UINT64
+         }
+       ]
+     },
+     {
+       name: "END",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_END
+           fp32_false_true: [0, 1]
+         }
+       ]
+     }
+   ]
+ }
+
+ parameters [
+   {
+     key: "beam_size",
+     value: { string_value: "10" }
+   },
+   {
+     key: "cutoff_prob",
+     value: { string_value: "0.9999" }
+   },
+   {
+     key: "alpha",
+     value: { string_value: "2" }
+   },
+   {
+     key: "beta",
+     value: { string_value: "1" }
+   },
+   {
+     key: "vocab_path",
+     value: { string_value: "/ws/onnx_model/units.txt" }
+   },
+   {
+     key: "lm_path",
+     value: { string_value: "/ws/onnx_model/lm.bin" }
+   },
+   {
+     key: "bidecoder",
+     value: { string_value: "0" }
+   },
+   {
+     key: "rescoring",
+     value: { string_value: "1" }
+   },
+   {
+     key: "FORCE_CPU_ONLY_INPUT_TENSORS",
+     value: { string_value: "yes" }
+   }
+ ]
+
+ input [
+   {
+     name: "log_probs"
+     data_type: TYPE_FP16
+     dims: [-1, 10] # [-1, beam_size]
+   },
+   {
+     name: "log_probs_idx"
+     data_type: TYPE_INT64
+     dims: [-1, 10] # [-1, beam_size]
+   },
+   {
+     name: "chunk_out"
+     data_type: TYPE_FP16
+     dims: [-1, -1]
+   },
+   {
+     name: "chunk_out_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+   }
+ ]
+ output [
+   {
+     name: "OUTPUT0"
+     data_type: TYPE_STRING
+     dims: [1]
+     reshape { shape: [] }
+   }
+ ]
+ instance_group [
+   {
+     count: 2
+     kind: KIND_CPU
+   }
+ ]
model_repo_stateful/wenet/config_template.pbtxt ADDED
@@ -0,0 +1,139 @@
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ name: "wenet"
+ backend: "python"
+ max_batch_size: 512
+
+ sequence_batching {
+   max_sequence_idle_microseconds: 5000000
+   oldest {
+     max_candidate_sequences: 1024
+     preferred_batch_size: [32, 64, 128, 256]
+   }
+   control_input [
+     {
+       name: "START",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_START
+           fp32_false_true: [0, 1]
+         }
+       ]
+     },
+     {
+       name: "READY"
+       control [
+         {
+           kind: CONTROL_SEQUENCE_READY
+           fp32_false_true: [0, 1]
+         }
+       ]
+     },
+     {
+       name: "CORRID",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_CORRID
+           data_type: TYPE_UINT64
+         }
+       ]
+     },
+     {
+       name: "END",
+       control [
+         {
+           kind: CONTROL_SEQUENCE_END
+           fp32_false_true: [0, 1]
+         }
+       ]
+     }
+   ]
+ }
+
+ parameters [
+   {
+     key: "beam_size",
+     value: { string_value: "#beam_size" }
+   },
+   {
+     key: "cutoff_prob",
+     value: { string_value: "0.9999" }
+   },
+   {
+     key: "alpha",
+     value: { string_value: "2" }
+   },
+   {
+     key: "beta",
+     value: { string_value: "1" }
+   },
+   {
+     key: "vocab_path",
+     value: { string_value: "/ws/onnx_model/units.txt" }
+   },
+   {
+     key: "lm_path",
+     value: { string_value: "/ws/onnx_model/lm.bin" }
+   },
+   {
+     key: "bidecoder",
+     value: { string_value: "#bidecoder" }
+   },
+   {
+     key: "rescoring",
+     value: { string_value: "1" }
+   },
+   {
+     key: "FORCE_CPU_ONLY_INPUT_TENSORS",
+     value: { string_value: "yes" }
+   }
+ ]
+
+ input [
+   {
+     name: "log_probs"
+     data_type: TYPE_#DTYPE
+     dims: [-1, #beam_size]
+   },
+   {
+     name: "log_probs_idx"
+     data_type: TYPE_INT64
+     dims: [-1, #beam_size]
+   },
+   {
+     name: "chunk_out"
+     data_type: TYPE_#DTYPE
+     dims: [-1, -1]
+   },
+   {
+     name: "chunk_out_lens"
+     data_type: TYPE_INT32
+     dims: [1]
+   }
+ ]
+ output [
+   {
+     name: "OUTPUT0"
+     data_type: TYPE_STRING
+     dims: [1]
+     reshape { shape: [] }
+   }
+ ]
+ instance_group [
+   {
+     count: 2
+     kind: KIND_CPU
+   }
+ ]