# Podcast Generator — Streamlit app (article → two-speaker podcast audio)
import streamlit as st
import os
import json
import shutil
import re
import requests
import pyttsx3
from pydub import AudioSegment
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from dotenv import load_dotenv

# Pull secrets (e.g. ELEVENLABS_API_KEY) from a local .env file into os.environ.
load_dotenv()

# Streamlit page chrome.
st.set_page_config(page_title="Podcast Generator", layout="wide")
st.title("🎙️ Podcast Generator")
# Instructions prepended to every generation request: defines the two
# speakers, the show name, and the short-sentence style suited to TTS.
system_prompt = """you are an experienced podcast host...
- based on text like an article you can create an engaging conversation between two people.
- make the conversation engaging with a lot of emotion.
- in the response, identify speakers as Sascha and Marina.
- Sascha is the writer, and Marina is the one asking questions.
- The podcast is called The Machine Learning Engineer.
- Short sentences that can be easily used with speech synthesis.
- Use natural conversation fillers like "äh" to make it sound real.
"""
# Text-generation backend: Hugging Face distilgpt2 (1024-token context).
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Offline TTS engine, used for Marina's lines.
engine = pyttsx3.init()
engine.setProperty("rate", 150)         # speaking rate (words per minute)
engine.setProperty("voice", "english")  # force an English voice

# ElevenLabs cloud TTS, used for Sascha's lines. The trailing path segment
# of the URL is the ElevenLabs voice ID.
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
elevenlabs_url = "https://api.elevenlabs.io/v1/text-to-speech/ERL3svWBAQ18ByCZTr4k"
elevenlabs_headers = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": elevenlabs_api_key,
}
# ElevenLabs TTS function for Sascha
def synthesize_speech_elevenlabs(text, speaker, index):
    """Synthesize *text* with the ElevenLabs API into audio-files/{index}_{speaker}.mp3.

    Fix over the original: the response status was never checked, so an API
    failure (bad key, quota, etc.) silently wrote the JSON error body into the
    .mp3 file, which later corrupted the merge step. Now raises
    requests.HTTPError instead. Also streams the download and adds a timeout.
    """
    data = {
        "text": text,
        "model_id": "eleven_turbo_v2_5",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75
        }
    }
    # stream=True so iter_content actually streams instead of buffering the
    # whole body first; timeout so a stalled API call can't hang the app.
    response = requests.post(
        elevenlabs_url, json=data, headers=elevenlabs_headers,
        stream=True, timeout=60,
    )
    response.raise_for_status()
    filename = f"audio-files/{index}_{speaker}.mp3"
    with open(filename, "wb") as out:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)
# Pyttsx3 TTS function for Marina
def synthesize_speech_pyttsx3(text, speaker, index):
    """Synthesize *text* offline with pyttsx3 into audio-files/{index}_{speaker}.wav.

    Fix over the original: pyttsx3 cannot encode MP3 — save_to_file() writes
    raw WAV/AIFF data regardless of the extension, so naming the file .mp3
    made pydub mis-sniff the format during the merge. Save as .wav instead;
    merge_audios() already picks up both .mp3 and .wav files.
    """
    filename = f"audio-files/{index}_{speaker}.wav"
    engine.save_to_file(text, filename)
    engine.runAndWait()
# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    """Route one conversation line to the matching TTS backend.

    Sascha's lines go to the ElevenLabs API; every other speaker (Marina)
    uses the local pyttsx3 engine.
    """
    backend = (
        synthesize_speech_elevenlabs
        if speaker == "Sascha"
        else synthesize_speech_pyttsx3
    )
    backend(text, speaker, index)
# Function to sort filenames naturally
def natural_sort_key(filename):
    """Sort key that orders embedded numbers numerically (so 2 < 10).

    Splits the name into alternating text/digit runs and converts the digit
    runs to int, e.g. "10_Sascha.mp3" -> ['', 10, '_Sascha.mp', 3, ''].
    """
    pieces = re.split(r'(\d+)', filename)
    return [int(piece) if piece.isdigit() else piece for piece in pieces]
# Function to merge audio files
def merge_audios(audio_folder, output_file):
    """Concatenate every .mp3/.wav in *audio_folder* (natural order) into one MP3.

    Natural ordering via natural_sort_key keeps segment 2 before segment 10,
    preserving the conversation sequence encoded in the filenames.
    """
    names = [n for n in os.listdir(audio_folder) if n.endswith((".mp3", ".wav"))]
    names.sort(key=natural_sort_key)
    combined = AudioSegment.empty()
    for name in names:
        combined += AudioSegment.from_file(os.path.join(audio_folder, name))
    combined.export(output_file, format="mp3")
# Function to generate the conversation using distilgpt2
def generate_conversation(article):
    """Generate an alternating Sascha/Marina dialogue about *article*.

    Returns a list of {"speaker": str, "text": str} dicts, assigning
    non-empty generated lines to the two speakers in turn, starting with
    Sascha.

    Fixes over the original:
    - distilgpt2's context window is 1024 tokens, so max_length=8192 would
      fail; the prompt is truncated and generation uses max_new_tokens
      capped to the remaining context.
    - decode() previously returned prompt + continuation, so the system
      prompt and the article itself leaked into the "conversation" and were
      narrated; now only the newly generated tokens are decoded.
    """
    prompt = system_prompt + "\n\nArticle:\n" + article + "\n\nSascha: "
    # Truncate so prompt + generated tokens always fit in the 1024-token window.
    input_ids = tokenizer.encode(
        prompt, return_tensors="pt", truncation=True, max_length=768
    )
    output = model.generate(
        input_ids,
        max_new_tokens=min(256, 1024 - input_ids.shape[1]),
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the continuation — everything after the prompt tokens.
    conversation_text = tokenizer.decode(
        output[0][input_ids.shape[1]:], skip_special_tokens=True
    )
    conversation = []
    speaker = "Sascha"
    for line in conversation_text.splitlines():
        if line.strip():
            conversation.append({"speaker": speaker, "text": line.strip()})
            speaker = "Marina" if speaker == "Sascha" else "Sascha"
    return conversation
# Function to generate the podcast audio from conversation data
def generate_audio(conversation):
    """Synthesize every conversation turn, merge the pieces, return "podcast.mp3".

    Recreates the audio-files/ working directory from scratch so segments
    from a previous run can never bleed into the new podcast.
    """
    folder = 'audio-files'
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)
    for turn_index, turn in enumerate(conversation):
        synthesize_speech(turn['text'], turn['speaker'], turn_index)
    output_file = "podcast.mp3"
    merge_audios(folder, output_file)
    return output_file
# Streamlit inputs and outputs.
# Fix over the original: the placeholder text was passed as the text_area's
# *value*, so `if not article` could never fire and clicking "Generate
# Podcast" on a fresh page would narrate the literal placeholder string.
# Use the placeholder= kwarg so the widget starts genuinely empty.
article = st.text_area(
    "Article Content", placeholder="Paste the article text here", height=300
)
if st.button("Generate Podcast"):
    if not article.strip():
        st.error("Please enter article content to generate a podcast.")
    else:
        with st.spinner("Generating conversation..."):
            conversation = generate_conversation(article)
            st.success("Conversation generated successfully!")
            st.json(conversation)
        # Generate audio files
        with st.spinner("Synthesizing audio..."):
            podcast_file = generate_audio(conversation)
            st.success("Audio synthesis complete!")
            st.audio(podcast_file, format="audio/mp3")
            with open(podcast_file, "rb") as file:
                st.download_button(
                    "Download Podcast",
                    data=file,
                    file_name="podcast.mp3",
                    mime="audio/mp3",
                )