Spaces:

Artrajz
/

vits-simple-api-gsv

Sleeping

File size: 2,849 Bytes

960cd20

import os
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import BertModel, BertConfig, BertTokenizer

try:
    from contants import config
except:
    from contants import config


class CharEmbedding(nn.Module):
    def __init__(self, model_dir):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.bert_config = BertConfig.from_pretrained(model_dir)
        self.hidden_size = self.bert_config.hidden_size
        self.bert = BertModel(self.bert_config)
        self.proj = nn.Linear(self.hidden_size, 256)
        self.linear = nn.Linear(256, 3)

    def text2Token(self, text):
        token = self.tokenizer.tokenize(text)
        txtid = self.tokenizer.convert_tokens_to_ids(token)
        return txtid

    def forward(self, inputs_ids, inputs_masks, tokens_type_ids):
        out_seq = self.bert(input_ids=inputs_ids,
                            attention_mask=inputs_masks,
                            token_type_ids=tokens_type_ids)[0]
        out_seq = self.proj(out_seq)
        return out_seq


class TTSProsody(object):
    def __init__(self, path, device):
        self.device = device
        self.char_model = CharEmbedding(path)
        self.char_model.load_state_dict(
            torch.load(
                os.path.join(config.abs_path, config.system.data_path, config.model_config.vits_chinese_bert,
                             "prosody_model.pt"),
                map_location=config.system.device
            ),
            strict=False
        )
        self.char_model.eval()
        self.char_model.to(self.device)

    def get_char_embeds(self, text):
        input_ids = self.char_model.text2Token(text)
        input_masks = [1] * len(input_ids)
        type_ids = [0] * len(input_ids)
        input_ids = torch.LongTensor([input_ids]).to(self.device)
        input_masks = torch.LongTensor([input_masks]).to(self.device)
        type_ids = torch.LongTensor([type_ids]).to(self.device)

        with torch.no_grad():
            char_embeds = self.char_model(
                input_ids, input_masks, type_ids).squeeze(0).cpu()
        return char_embeds

    def expand_for_phone(self, char_embeds, length):  # length of phones for char
        assert char_embeds.size(0) == len(length)
        expand_vecs = list()
        for vec, leng in zip(char_embeds, length):
            vec = vec.expand(leng, -1)
            expand_vecs.append(vec)
        expand_embeds = torch.cat(expand_vecs, 0)
        assert expand_embeds.size(0) == sum(length)
        return expand_embeds.numpy()


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    prosody = TTSProsody('./bert/', device)
    while True:
        text = input("请输入文本：")
        prosody.get_char_embeds(text)