"""Gradio demo that turns typed text or an uploaded document into speech.

Text comes from a textbox or from a TXT/DOCX/PDF upload, is split into
chunks, synthesized chunk by chunk with Parler TTS, and the resulting audio
is concatenated into a single WAV file capped at a user-chosen duration.
"""

import os
import re
import tempfile

# NOTE(review): `spaces` is kept ahead of `torch`, as in the original import
# order -- confirm before reordering the third-party imports.
import spaces
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoFeatureExtractor
from parler_tts import ParlerTTSForConditionalGeneration
import docx2txt
from PyPDF2 import PdfReader
from pydub import AudioSegment

# Global variables and model initialization
device = "cuda:0" if torch.cuda.is_available() else "cpu"
repo_id = "parler-tts/parler-tts-mini-v1"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
SAMPLE_RATE = feature_extractor.sampling_rate


def preprocess_text(text):
    """Normalize *text* before it is tokenized as the TTS prompt.

    Runs of whitespace are collapsed to a single space, the result is
    stripped, and every run of digits is spelled digit by digit
    (``"2024"`` becomes ``"2 0 2 4"``).
    """
    # Remove extra whitespace, normalize text, and handle numbers
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\d+', lambda m: ' '.join(m.group(0)), text)
    return text


def extract_text_from_file(file):
    """Return the text content of an uploaded TXT, DOCX or PDF file.

    Args:
        file: Either an object exposing the file path as ``.name`` or the
            path itself.

    Raises:
        ValueError: If the file extension is not ``.txt``, ``.docx`` or
            ``.pdf`` (compared case-insensitively).
    """
    # Accept both "object with .name" and plain path values.
    path = getattr(file, "name", file)
    suffix = os.path.splitext(path)[1].lower()

    if suffix == '.txt':
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    if suffix == '.docx':
        return docx2txt.process(path)
    if suffix == '.pdf':
        with open(path, 'rb') as f:
            reader = PdfReader(f)
            # A page without extractable text must not break the join.
            return ' '.join(page.extract_text() or '' for page in reader.pages)
    raise ValueError("Unsupported file type")


def split_text_into_chunks(text, max_length=1000):
    """Split *text* on whitespace into chunks of roughly *max_length* chars.

    Words are never broken: a single word longer than *max_length* becomes a
    chunk of its own. Empty or whitespace-only text yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        if current_length + len(word) + 1 > max_length:
            # Only flush when something was collected; otherwise an oversized
            # first word would produce a spurious empty chunk.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += len(word) + 1

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks


@spaces.GPU(duration=300)
def generate_audio(text, description):
    """Synthesize speech for *text* in the voice given by *description*.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is the squeezed
        NumPy array produced by the model.
    """
    preprocessed_text = preprocess_text(text)

    # The description conditions the voice; the preprocessed text is the
    # prompt that is actually spoken.
    inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt = tokenizer(preprocessed_text, return_tensors="pt").to(device)

    generation = model.generate(
        input_ids=inputs.input_ids,
        prompt_input_ids=prompt.input_ids,
        attention_mask=inputs.attention_mask,
        prompt_attention_mask=prompt.attention_mask,
        do_sample=True,
        temperature=1.0
    )
    audio_arr = generation.cpu().numpy().squeeze()
    return SAMPLE_RATE, audio_arr


def _to_pcm16(samples):
    """Convert a NumPy sample array to 16-bit signed PCM.

    Floating-point input is assumed to be nominally in [-1.0, 1.0]; it is
    clipped to that range and scaled. Any other dtype is cast directly.
    """
    if samples.dtype.kind == "f":
        return (samples.clip(-1.0, 1.0) * 32767).astype("int16")
    return samples.astype("int16")


def process_input(file, text_input, description, max_duration):
    """Generate a WAV file from the uploaded file or the typed text.

    An uploaded file takes precedence over *text_input*. Audio is generated
    chunk by chunk and generation stops before the chunk that would push the
    total past *max_duration* seconds.

    Returns:
        ``(wav_path, None)`` on success, or ``(None, message)`` when there is
        nothing to synthesize or something fails.
    """
    # Reading the upload can fail (unsupported type, corrupt file); report it
    # to the user instead of letting the event handler crash.
    try:
        text = extract_text_from_file(file) if file else text_input
    except Exception as e:
        return None, f"Error reading file: {str(e)}"

    if not text or not text.strip():
        return None, "Please provide text input or upload a file."

    try:
        audio_segments = []
        total_duration = 0

        for chunk in split_text_into_chunks(text):
            sample_rate, samples = generate_audio(chunk, description)
            # The segment is declared as 16-bit mono, so the raw bytes must
            # really be int16 rather than the model's native sample format.
            segment = AudioSegment(
                _to_pcm16(samples).tobytes(),
                frame_rate=sample_rate,
                sample_width=2,
                channels=1
            )

            chunk_duration = len(segment) / 1000  # Duration in seconds
            if total_duration + chunk_duration > max_duration:
                break

            audio_segments.append(segment)
            total_duration += chunk_duration

        if not audio_segments:
            return None, "Generated audio exceeds maximum duration. Please use shorter text."

        combined_audio = sum(audio_segments)

        # Write through the already-open handle so the file is not opened a
        # second time by name; delete=False keeps it on disk for the UI.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            combined_audio.export(temp_file, format="wav")
        return temp_file.name, None
    except Exception as e:
        # UI boundary: surface any generation failure as a message.
        return None, f"Error generating audio: {str(e)}"


def update_max_duration(file, text_input):
    """Suggest a value for the duration slider from the amount of input text.

    The estimate is clamped to the 60-300 second range; with no readable
    text the slider is reset to 60.
    """
    try:
        text = extract_text_from_file(file) if file else text_input
    except Exception:
        # This is only a hint; an unreadable upload is reported when the
        # user actually submits, so fall back to the default here.
        text = None

    # `gr.update` is used because the per-component `gr.Slider.update`
    # helper is not available on Gradio 4+.
    if not text:
        return gr.update(value=60)

    estimated_duration = len(text.split()) / 3  # Rough estimate: 3 words per second
    return gr.update(value=min(300, max(60, estimated_duration)))


# Gradio interface
css = """
.container {
    max-width: 850px;
    margin: auto;
    padding: 20px;
    background-color: #f0f4f8;
    border-radius: 12px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.input-area, .output-area {
    background-color: white;
    padding: 25px;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
    margin-bottom: 20px;
}
.generate-btn {
    background-color: #4CAF50 !important;
    color: white !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    font-weight: bold !important;
    border-radius: 5px !important;
    border: none !important;
    cursor: pointer !important;
    transition: background-color 0.3s !important;
}
.generate-btn:hover {
    background-color: #45a049 !important;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# 🎙️ Parler TTS: Advanced Text-to-Speech Generator")

    with gr.Row(elem_classes="container"):
        with gr.Column(elem_classes="input-area"):
            file_input = gr.File(label="📄 Upload File (TXT, DOCX, PDF)")
            text_input = gr.Textbox(label="✍️ Or enter text here", lines=5, placeholder="Type or paste your text here...")
            description = gr.Textbox(
                label="🗣️ Voice Description",
                lines=2,
                value="A clear, neutral voice with minimal background noise.",
                placeholder="Describe the voice characteristics you want..."
            )
            max_duration = gr.Slider(
                minimum=10,
                maximum=300,
                value=60,
                step=10,
                label="⏱️ Maximum Audio Duration (seconds)"
            )
            submit_btn = gr.Button("🚀 Generate Audio", elem_classes="generate-btn")

        with gr.Column(elem_classes="output-area"):
            output_audio = gr.Audio(label="🔊 Generated Audio")
            error_output = gr.Markdown()

    # Re-estimate the suggested duration whenever either input changes.
    file_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration]
    )
    text_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration]
    )
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, text_input, description, max_duration],
        outputs=[output_audio, error_output]
    )

    gr.Markdown(
        """
        ## 📌 Tips for Best Results
        - For longer texts, the generator will create audio up to the specified maximum duration.
        - Experiment with different voice descriptions to achieve the desired output.
        - Use punctuation to control pacing and intonation in the generated speech.
        - For optimal quality, try to keep individual sentences or paragraphs concise.

        ## 🛠️ Technical Details
        - This demo uses the Parler TTS Mini v1 model.
        - Audio generation is GPU-accelerated for faster processing.
        - Maximum file size for uploads: 5MB
        """
    )

demo.queue()
demo.launch()