Create pdf_to_audio.py
pdf_to_audio.py +78 -0
pdf_to_audio.py
ADDED
@@ -0,0 +1,78 @@
import io
from typing import Dict, List, Union

import numpy as np
import soundfile as sf
import torch
from pypdfium2 import PdfDocument
from transformers import AutoTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor


def convert_pdf_to_text(filepath):
    """Extract the text of every page of a PDF, pages separated by blank lines."""
    doc = PdfDocument(filepath)
    text = ""
    for page in doc:  # a PdfDocument is iterable over its pages
        text += page.get_textpage().get_text_range() + "\n\n"
    return text


class QuantizedSpeechT5TTSPipe:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
        # Local quantized checkpoint. The code below relies on this checkpoint
        # exposing `mel_output` on the model output and `post_processing` on
        # the processor; neither is part of the stock Wav2Vec2 classes.
        self.model = Wav2Vec2ForCTC.from_pretrained("/model/quantized_model").half().cuda()
        self.processor = Wav2Vec2Processor.from_pretrained("/model/quantized_vocab")

    def _pad_and_concatenate(self, tensor_list: List[torch.Tensor], padding_value=0):
        """Right-pad 2-D tensors to a common length and stack them into one batch."""
        max_length = max(tensor.size(0) for tensor in tensor_list)
        padded_tensors = []
        for tensor in tensor_list:
            pad_width = max_length - tensor.size(0)
            if pad_width > 0:
                padding = torch.full((pad_width, tensor.size(1)), fill_value=padding_value).type_as(tensor)
                tensor = torch.cat((tensor, padding))
            padded_tensors.append(tensor)
        return torch.stack(padded_tensors)

    def preprocess(self, inputs: Union[str, List[str]], **kwargs) -> dict:
        if isinstance(inputs, str):
            inputs = [inputs]
        # A text tokenizer returns `input_ids`, not `input_values`. They are
        # fed through the model's `input_values` argument, which in the stock
        # Wav2Vec2 classes expects raw audio; this relies on the custom checkpoint.
        input_ids = self.tokenizer(inputs, truncation=True, padding="longest", return_tensors="pt").input_ids
        return {"input_values": input_ids.half().cuda()}

    def postprocess(self, outputs: Dict[str, torch.Tensor], **kwargs) -> List[bytes]:
        logits = outputs["logits"].cpu().detach().numpy()
        ids = np.argmax(logits, axis=-1)
        # Truncate each sequence at the first end-of-sequence id (2), guarding
        # against sequences that never emit one.
        cleaned_ids = []
        for id_seq in ids:
            eos = np.where(id_seq == 2)[0]
            cleaned_ids.append(id_seq[: eos[0]] if len(eos) else id_seq)
        decoded_strings = self.tokenizer.batch_decode(cleaned_ids)

        audios = []
        for text in decoded_strings:
            input_values = self.processor(text, sampling_rate=16000, return_tensors="pt").input_values
            input_values = input_values.half().cuda()  # already batched (1, T)

            mel_outputs = self.model(input_values).mel_output
            _, predicted_ids = torch.topk(mel_outputs.float(), k=1, dim=-1)
            predicted_ids = predicted_ids.squeeze(-1).tolist()[0]

            raw_waveform = self.processor.post_processing(predicted_ids)
            # Normalize to full-scale 16-bit PCM.
            waveform = raw_waveform * 32767 / np.max(np.abs(raw_waveform))
            wav_data = np.int16(waveform)
            audio = io.BytesIO()
            # soundfile takes (file, data, samplerate); 16 kHz matches the
            # processor's sampling rate above.
            sf.write(audio, wav_data, 16000, format="WAV")
            audios.append(audio.getvalue())
        return audios

    def generate(self, text: str):
        processed_inputs = self.preprocess(text)
        outputs = self.model(**processed_inputs)
        results = self.postprocess(outputs)
        return results


if __name__ == "__main__":
    tts = QuantizedSpeechT5TTSPipe()
    sample_text = "Hello world! This is a test."
    result = tts.generate(sample_text)
    print(f'Generated {len(result)} audio clips from "{sample_text}"')
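
The original file imports gradio without using it, so the Space presumably wires these pieces into a UI elsewhere. A minimal sketch of that wiring, untested, with hypothetical component choices (gr.File input, gr.Audio output, playing only the first clip):

import gradio as gr

from pdf_to_audio import QuantizedSpeechT5TTSPipe, convert_pdf_to_text

tts = QuantizedSpeechT5TTSPipe()

def pdf_to_audio(pdf_file):
    # gr.File hands back a tempfile object in older Gradio versions and a
    # plain path string in newer ones; accept either.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    clips = tts.generate(convert_pdf_to_text(path))
    out_path = "output.wav"
    with open(out_path, "wb") as f:
        f.write(clips[0])  # first WAV clip; a full app would concatenate them
    return out_path  # gr.Audio accepts a filepath

demo = gr.Interface(
    fn=pdf_to_audio,
    inputs=gr.File(file_types=[".pdf"]),
    outputs=gr.Audio(label="Narration"),
    title="PDF to Audio",
)

if __name__ == "__main__":
    demo.launch()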
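
For reference, the class name points at SpeechT5 even though the checkpoints above are loaded through the Wav2Vec2 classes. With the stock transformers SpeechT5 checkpoints (an assumption; this Space uses its own quantized model, not these), text-to-speech looks like this:

import soundfile as sf
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="Hello world! This is a test.", return_tensors="pt")
# SpeechT5 conditions on a 512-dim speaker x-vector; a zero vector is enough
# for a smoke test (real apps load one, e.g. from the cmu-arctic-xvectors set).
speaker_embeddings = torch.zeros((1, 512))
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("speech.wav", speech.numpy(), 16000)  # SpeechT5 outputs 16 kHz audio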