import io
from typing import Dict, List, Union

import numpy as np
import pypdfium2 as pdfium
import soundfile as sf
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor


def convert_pdf_to_text(filepath: str) -> str:
    # Extract plain text from every page of the PDF via pypdfium2.
    doc = pdfium.PdfDocument(filepath)
    text = ""
    for page in doc:
        text += page.get_textpage().get_text_range() + "\n\n"
    return text


class QuantizedSpeechT5TTSPipe:
    SAMPLE_RATE = 16_000  # SpeechT5 synthesizes 16 kHz audio

    def __init__(self):
        # Processor (tokenizer + feature extractor) and locally quantized weights.
        self.processor = SpeechT5Processor.from_pretrained("/model/quantized_vocab")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("/model/quantized_model").half().cuda()
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").half().cuda()
        # SpeechT5 conditions on a 512-dim x-vector; a zero vector serves here
        # as a neutral placeholder speaker embedding.
        self.speaker_embeddings = torch.zeros((1, 512), dtype=torch.half).cuda()

    def _pad_and_concatenate(self, tensor_list: List[torch.Tensor], padding_value: float = 0.0) -> torch.Tensor:
        # Pad variable-length tensors to a common length, then stack into a batch.
        max_length = max(tensor.size(0) for tensor in tensor_list)
        padded_tensors = []
        for tensor in tensor_list:
            pad_width = max_length - tensor.size(0)
            if pad_width > 0:
                padding = torch.full((pad_width, *tensor.shape[1:]), padding_value).type_as(tensor)
                tensor = torch.cat((tensor, padding))
            padded_tensors.append(tensor)
        return torch.stack(padded_tensors)

    def preprocess(self, inputs: Union[str, List[str]], **kwargs) -> Dict[str, torch.Tensor]:
        # Tokenize one string or a batch of strings into padded input IDs.
        if isinstance(inputs, str):
            inputs = [inputs]
        encodings = self.processor(text=inputs, padding="longest", return_tensors="pt")
        return {
            "input_ids": encodings["input_ids"].cuda(),
            "attention_mask": encodings["attention_mask"].cuda(),
        }

    def postprocess(self, waveforms: List[torch.Tensor], **kwargs) -> List[bytes]:
        # Convert each waveform to a 16-bit PCM WAV byte string.
        audios = []
        for waveform in waveforms:
            raw = waveform.float().cpu().numpy()
            peak = max(float(np.abs(raw).max()), 1e-8)
            wav_data = np.int16(raw * 32767 / peak)  # normalize to full int16 range
            buffer = io.BytesIO()
            sf.write(buffer, wav_data, self.SAMPLE_RATE, format="WAV")
            audios.append(buffer.getvalue())
        return audios

    def generate(self, text: Union[str, List[str]]) -> List[bytes]:
        processed = self.preprocess(text)
        waveforms = []
        with torch.no_grad():
            # Strip padding per item so pad tokens are not synthesized as audio.
            for input_ids, mask in zip(processed["input_ids"], processed["attention_mask"]):
                ids = input_ids[mask.bool()].unsqueeze(0)
                waveforms.append(
                    self.model.generate_speech(ids, self.speaker_embeddings, vocoder=self.vocoder)
                )
        return self.postprocess(waveforms)


if __name__ == "__main__":
    tts = QuantizedSpeechT5TTSPipe()
    sample_text = "Hello world! This is a test."
    result = tts.generate(sample_text)
    print(f'Generated {len(result)} audio clips from "{sample_text}"')
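
# ---------------------------------------------------------------------------
# Optional Gradio front end. The original file imported gradio without using
# it, so the wiring below is a sketch of the presumably intended PDF-to-speech
# UI; `build_demo` and `pdf_to_speech` are illustrative names introduced here,
# not part of the original pipeline. Very long PDFs may exceed the model's
# input limit and would need chunking before synthesis.
# ---------------------------------------------------------------------------
import tempfile

import gradio as gr


def build_demo(tts: QuantizedSpeechT5TTSPipe) -> gr.Interface:
    def pdf_to_speech(pdf_path: str) -> str:
        # Extract the PDF text, synthesize it, and hand Gradio a WAV file path.
        text = convert_pdf_to_text(pdf_path)
        wav_bytes = tts.generate(text)[0]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(wav_bytes)
            return f.name

    return gr.Interface(
        fn=pdf_to_speech,
        inputs=gr.File(type="filepath", label="PDF"),
        outputs=gr.Audio(type="filepath", label="Synthesized speech"),
        title="PDF to speech",
    )


# Usage (assumes a CUDA device and the local quantized checkpoints exist):
#     build_demo(QuantizedSpeechT5TTSPipe()).launch()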