nirajandhakal committed on
Commit
347336e
1 Parent(s): caba8fc

Create pdf_to_audio.py

Files changed (1)
  1. pdf_to_audio.py +78 -0
pdf_to_audio.py ADDED
@@ -0,0 +1,78 @@
+ import io
+ import os
+ from typing import List, Union
+
+ import numpy as np
+ import soundfile as sf
+ import torch
+ from pypdfium2 import PdfDocument
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
+
+ def convert_pdf_to_text(filepath):
+     """Extract plain text from every page of a PDF via pypdfium2."""
+     doc = PdfDocument(filepath)
+     pages = [doc[i].get_textpage().get_text_range() for i in range(len(doc))]
+     return "\n\n".join(pages)
+
+ class QuantizedSpeechT5TTSPipe:
+     """Text-to-speech pipeline built on SpeechT5 and its HiFi-GAN vocoder."""
+
+     def __init__(self):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         # Public SpeechT5 checkpoints; point these at local quantized
+         # weights (e.g. /model/quantized_model) if you have them.
+         self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+         self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
+         self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)
+         # SpeechT5 conditions generation on a 512-dim x-vector speaker
+         # embedding; a zero vector serves as a neutral placeholder voice.
+         self.speaker_embeddings = torch.zeros((1, 512), device=self.device)
+
+     def _pad_and_concatenate(self, tensor_list: List[torch.Tensor], padding_value=0):
+         # Pad variable-length 2-D tensors along dim 0, then stack into a batch.
+         max_length = max(tensor.size(0) for tensor in tensor_list)
+         padded_tensors = []
+         for tensor in tensor_list:
+             pad_width = max_length - tensor.size(0)
+             if pad_width > 0:
+                 padding = torch.full((pad_width, tensor.size(1)), padding_value).type_as(tensor)
+                 tensor = torch.cat((tensor, padding))
+             padded_tensors.append(tensor)
+         return torch.stack(padded_tensors)
+
+     def preprocess(self, inputs: Union[str, List[str]]) -> List[dict]:
+         # Accept a single string or a list of strings; tokenize one sequence
+         # at a time because generate_speech() expects an unbatched input.
+         if isinstance(inputs, str):
+             inputs = [inputs]
+         return [self.processor(text=text, return_tensors="pt") for text in inputs]
+
+     def postprocess(self, waveform: torch.Tensor) -> bytes:
+         # Peak-normalize, convert to 16-bit PCM, and serialize as an
+         # in-memory WAV file.
+         wav = waveform.cpu().numpy()
+         wav = np.int16(wav / max(np.abs(wav).max(), 1e-8) * 32767)
+         audio = io.BytesIO()
+         sf.write(audio, wav, 16000, format="WAV")  # SpeechT5 outputs 16 kHz audio
+         return audio.getvalue()
+
+     def generate(self, text: Union[str, List[str]]) -> List[bytes]:
+         audios = []
+         for encoding in self.preprocess(text):
+             input_ids = encoding["input_ids"].to(self.device)
+             with torch.no_grad():
+                 waveform = self.model.generate_speech(
+                     input_ids, self.speaker_embeddings, vocoder=self.vocoder
+                 )
+             audios.append(self.postprocess(waveform))
+         return audios
+
+ if __name__ == "__main__":
+     tts = QuantizedSpeechT5TTSPipe()
+     sample_text = "Hello world! This is a test."
+     result = tts.generate(sample_text)
+     print(f'Generated {len(result)} audio clip(s) from "{sample_text}"')
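+
+     # Minimal end-to-end sketch (assumes a local "sample.pdf", which is not
+     # part of this commit): extract the PDF text, synthesize one clip per
+     # page, and write the results to disk.
+     if os.path.exists("sample.pdf"):
+         pdf_text = convert_pdf_to_text("sample.pdf")
+         pages = [p for p in pdf_text.split("\n\n") if p.strip()]
+         for i, clip in enumerate(tts.generate(pages)):
+             with open(f"page_{i}.wav", "wb") as f:
+                 f.write(clip)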