# Talk-To-PDF / pdf_to_audio.py

import io
import os
from typing import Dict, List, Union

import numpy as np
import soundfile as sf
import torch
from pypdfium2 import PdfDocument
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
def convert_pdf_to_text(filepath):
    """Extract the text of every page in the PDF, separated by blank lines."""
    doc = PdfDocument(filepath)
    text = ""
    for page in doc:  # a PdfDocument is directly iterable over its pages
        text += page.get_textpage().get_text_range() + "\n\n"
    return text
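
# TTS models only accept short inputs, so extracted PDF text has to be fed to the
# pipeline in pieces. A minimal sketch of such a splitter, assuming paragraph breaks
# ("\n\n") are acceptable chunk boundaries; the function name and the max_chars limit
# are illustrative, not part of the original script.
def split_text_into_chunks(text: str, max_chars: int = 500) -> List[str]:
    chunks = []
    for paragraph in text.split("\n\n"):
        paragraph = paragraph.strip()
        while len(paragraph) > max_chars:
            # Break overly long paragraphs at the last space before the limit.
            cut = paragraph.rfind(" ", 0, max_chars)
            cut = cut if cut > 0 else max_chars
            chunks.append(paragraph[:cut])
            paragraph = paragraph[cut:].lstrip()
        if paragraph:
            chunks.append(paragraph)
    return chunks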


class QuantizedSpeechT5TTSPipe:
    """Text-to-speech pipeline around locally quantized SpeechT5 checkpoints."""

    def __init__(self):
        self.processor = SpeechT5Processor.from_pretrained("/model/quantized_vocab")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("/model/quantized_model").half().cuda()
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").half().cuda()
        # SpeechT5 conditions generation on a 512-dim x-vector speaker embedding;
        # a zero vector gives a neutral voice without pulling in a speaker dataset.
        self.speaker_embeddings = torch.zeros(1, 512).half().cuda()

    def _pad_and_concatenate(self, tensor_list: List[torch.Tensor], padding_value=0):
        """Zero-pad 2-D tensors along dim 0 to a common length and stack them."""
        max_length = max(tensor.size(0) for tensor in tensor_list)
        padded_tensors = []
        for tensor in tensor_list:
            pad_width = max_length - tensor.size(0)
            if pad_width > 0:
                padding = torch.full((pad_width, tensor.size(1)), fill_value=padding_value).type_as(tensor)
                tensor = torch.cat((tensor, padding))
            padded_tensors.append(tensor)
        return torch.stack(padded_tensors)
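    # For instance, padding a (3, 80) and a (5, 80) mel tensor yields a stacked
    # (2, 5, 80) batch, with the shorter tensor zero-filled along the time axis.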

    def preprocess(self, inputs: Union[str, List[str]], **kwargs) -> Dict[str, List[torch.Tensor]]:
        if isinstance(inputs, str):
            inputs = [inputs]
        # Tokenize each text separately so no padding tokens leak into generation.
        batch_encodings = [
            self.processor(text=text, return_tensors="pt")["input_ids"].cuda() for text in inputs
        ]
        return {"batch_encodings": batch_encodings}

    def postprocess(self, waveforms: List[torch.Tensor], **kwargs) -> List[bytes]:
        audios = []
        for waveform in waveforms:
            wav = waveform.float().cpu().numpy()
            # Rescale to the full int16 range before writing.
            wav_data = np.int16(wav * 32767 / max(np.abs(wav).max(), 1e-8))
            buffer = io.BytesIO()
            # soundfile expects (file, data, samplerate); SpeechT5 produces 16 kHz audio.
            sf.write(buffer, wav_data, 16000, format="WAV")
            audios.append(buffer.getvalue())
        return audios

    @torch.no_grad()
    def generate(self, text: str) -> List[bytes]:
        processed_inputs = self.preprocess(text)
        waveforms = [
            self.model.generate_speech(input_ids, self.speaker_embeddings, vocoder=self.vocoder)
            for input_ids in processed_inputs["batch_encodings"]
        ]
        return self.postprocess(waveforms)


if __name__ == "__main__":
    tts = QuantizedSpeechT5TTSPipe()
    sample_text = "Hello world! This is a test."
    result = tts.generate(sample_text)
    print(f'Generated {len(result)} audio clip(s) from "{sample_text}"')
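
    # End-to-end sketch tying the helpers together: read a PDF, chunk its text, and
    # write one WAV per chunk. "sample.pdf" is a placeholder path, not a file that
    # ships with the script.
    if os.path.exists("sample.pdf"):
        pdf_text = convert_pdf_to_text("sample.pdf")
        for i, chunk in enumerate(split_text_into_chunks(pdf_text)):
            with open(f"chunk_{i}.wav", "wb") as f:
                f.write(tts.generate(chunk)[0])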