import io
from typing import Dict, List, Union

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
from pypdfium2 import PdfDocument
def convert_pdf_to_text(filepath):
    # Extract plain text from every page, separating pages with blank lines.
    doc = PdfDocument(filepath)
    text = ""
    for page in doc:
        textpage = page.get_textpage()
        text += textpage.get_text_range() + "\n\n"
    return text
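
# Hedged usage sketch ("sample.pdf" is an illustrative filename, not a file
# shipped with this Space):
#   raw_text = convert_pdf_to_text("sample.pdf")
#   print(raw_text[:200])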
class QuantizedSpeechT5TTSPipe:
    def __init__(self):
        # SpeechT5 TTS: the processor tokenizes text, the acoustic model predicts
        # a spectrogram, and the HiFi-GAN vocoder renders the final waveform.
        self.processor = SpeechT5Processor.from_pretrained("/model/quantized_vocab")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("/model/quantized_model").half().cuda()
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").half().cuda()
        # Placeholder speaker embedding (512-dim x-vector); swap in a real
        # speaker vector for a more natural voice.
        self.speaker_embeddings = torch.zeros((1, 512), dtype=torch.float16).cuda()
    def _pad_and_concatenate(self, tensor_list: List[torch.Tensor], padding_value=0):
        # Pad variable-length [T, D] tensors to the longest T and stack into a batch.
        max_length = max(tensor.size(0) for tensor in tensor_list)
        padded_tensors = []
        for tensor in tensor_list:
            pad_width = max_length - tensor.size(0)
            if pad_width > 0:
                padding = torch.full((pad_width, tensor.size(1)), fill_value=padding_value).type_as(tensor)
                tensor_padded = torch.cat((tensor, padding))
            else:
                tensor_padded = tensor
            padded_tensors.append(tensor_padded)
        return torch.stack(padded_tensors)
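    # Hedged usage sketch (the shapes are illustrative): stacking two
    # spectrogram fragments of different lengths into one batch:
    #   a = torch.randn(4, 80); b = torch.randn(6, 80)
    #   batch = pipe._pad_and_concatenate([a, b])  # -> torch.Size([2, 6, 80])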
    def preprocess(self, inputs: Union[str, List[str]], **kwargs) -> dict:
        if isinstance(inputs, str):
            inputs = [inputs]
        # Tokenize each text separately; generate_speech runs once per entry.
        input_ids_list = [
            self.processor(text=text, return_tensors="pt")["input_ids"].cuda()
            for text in inputs
        ]
        return {"input_ids_list": input_ids_list}
    def postprocess(self, outputs: Dict[str, List[torch.Tensor]], **kwargs) -> List[bytes]:
        # Convert each generated waveform to 16-bit PCM WAV bytes.
        audios = []
        for waveform in outputs["waveforms"]:
            wav = waveform.float().cpu().numpy()
            peak = float(np.max(np.abs(wav)))
            if peak == 0.0:  # guard against a silent (all-zero) waveform
                peak = 1.0
            wav_data = np.int16(wav / peak * 32767)
            audio = io.BytesIO()
            sf.write(audio, wav_data, 16000, format="WAV")  # SpeechT5 outputs 16 kHz
            audios.append(audio.getvalue())
        return audios
    def generate(self, text: Union[str, List[str]]) -> List[bytes]:
        processed_inputs = self.preprocess(text)
        waveforms = []
        with torch.no_grad():
            for input_ids in processed_inputs["input_ids_list"]:
                # generate_speech runs the acoustic model and vocoder end to end.
                waveform = self.model.generate_speech(
                    input_ids, self.speaker_embeddings, vocoder=self.vocoder
                )
                waveforms.append(waveform)
        return self.postprocess({"waveforms": waveforms})
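
# Hedged sketch of a minimal Gradio front end for this Space: plain text in,
# synthesized speech out. The widget layout, the build_demo name, and the
# temp-file handoff are assumptions, not the original app's wiring; gr.Audio
# plays back a filepath, so the WAV bytes are written to a temporary file.
def build_demo(tts: QuantizedSpeechT5TTSPipe) -> gr.Interface:
    import tempfile

    def synthesize(text: str) -> str:
        wav_bytes = tts.generate(text)[0]
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp.write(wav_bytes)
        tmp.close()
        return tmp.name

    return gr.Interface(fn=synthesize, inputs=gr.Textbox(label="Text"),
                        outputs=gr.Audio(label="Speech"))

# To serve the UI instead of the console check below:
#   build_demo(QuantizedSpeechT5TTSPipe()).launch()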
if __name__ == "__main__":
    tts = QuantizedSpeechT5TTSPipe()
    sample_text = 'Hello world! This is a test.'
    result = tts.generate(sample_text)
    print(f'Generated {len(result)} audio clip(s) from "{sample_text}"')
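    # Hedged extra step: persist the first clip for playback; the output
    # filename "sample_output.wav" is arbitrary.
    with open("sample_output.wav", "wb") as f:
        f.write(result[0])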