Spaces:
Sleeping
Sleeping
stefan-french
committed on
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import soundfile as sf
|
6 |
+
import streamlit as st
|
7 |
+
import document_to_podcast
|
8 |
+
|
9 |
+
from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
|
10 |
+
from document_to_podcast.inference.model_loaders import (
|
11 |
+
load_llama_cpp_model,
|
12 |
+
load_outetts_model,
|
13 |
+
)
|
14 |
+
from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
|
15 |
+
from document_to_podcast.inference.text_to_speech import text_to_speech
|
16 |
+
from document_to_podcast.inference.text_to_text import text_to_text_stream
|
17 |
+
|
18 |
+
|
19 |
+
@st.cache_resource
def load_text_to_text_model():
    """Load the llama.cpp text-to-text model used to write the podcast script.

    Decorated with ``st.cache_resource`` so the loaded model is reused
    instead of being loaded again on every Streamlit rerun.
    """
    model_id = (
        "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
    )
    return load_llama_cpp_model(model_id=model_id)
|
24 |
+
|
25 |
+
|
26 |
+
@st.cache_resource
def load_text_to_speech_model():
    """Load the OuteTTS text-to-speech model used to voice the podcast.

    Decorated with ``st.cache_resource`` so the loaded model is reused
    instead of being loaded again on every Streamlit rerun.
    """
    model_id = "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf"
    return load_outetts_model(model_id)
|
29 |
+
|
30 |
+
|
31 |
+
# Keys under which the app keeps its state across Streamlit reruns.
script = "script"
audio = "audio"
gen_button = "generate podcast button"

# Seed each key with its initial value the first time the session runs;
# values already present (from an earlier rerun) are left untouched.
for _key, _initial in ((script, ""), (audio, []), (gen_button, False)):
    if _key not in st.session_state:
        st.session_state[_key] = _initial
|
40 |
+
|
41 |
+
|
42 |
+
def gen_button_clicked() -> None:
    """Record in the session state that "Generate Podcast" has been clicked.

    Used as the button's ``on_click`` callback; the flag is what later
    makes the "save" buttons visible.
    """
    st.session_state[gen_button] = True
|
44 |
+
|
45 |
+
|
46 |
+
# Page heading and the upload widget whose result drives the rest of the app.
st.title("Document To Podcast")
st.header("Uploading Data")

accepted_types = ["pdf", "html", "txt", "docx", "md"]
uploaded_file = st.file_uploader("Choose a file", type=accepted_types)
|
53 |
+
|
54 |
+
|
55 |
+
if uploaded_file is not None:
    # ---- Step 1: load the uploaded document and clean it -------------------
    st.divider()
    st.header("Loading and Cleaning Data")
    st.markdown(
        "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-1-document-pre-processing)"
    )
    st.divider()

    # The file suffix selects both the loader and the cleaner.
    extension = Path(uploaded_file.name).suffix

    col1, col2 = st.columns(2)

    raw_text = DATA_LOADERS[extension](uploaded_file)
    with col1:
        st.subheader("Raw Text")
        st.text_area(
            f"Number of characters before cleaning: {len(raw_text)}",
            f"{raw_text[:500]} . . .",
        )

    clean_text = DATA_CLEANERS[extension](raw_text)
    with col2:
        st.subheader("Cleaned Text")
        st.text_area(
            f"Number of characters after cleaning: {len(clean_text)}",
            f"{clean_text[:500]} . . .",
        )

    # ---- Step 2: load the models --------------------------------------------
    st.divider()
    st.header("Downloading and Loading models")
    st.markdown(
        "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
    )
    st.divider()

    st.markdown(
        "For this demo, we are using the following models: \n"
        "- [OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
        "- [OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
    )
    st.markdown(
        "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
        " for more information on how to use different models."
    )

    text_model = load_text_to_text_model()
    speech_model = load_text_to_speech_model()

    # ~4 characters per token is considered a reasonable default.
    max_characters = text_model.n_ctx() * 4
    if len(clean_text) > max_characters:
        st.warning(
            f"Input text is too big ({len(clean_text)})."
            f" Using only a subset of it ({max_characters})."
        )
        clean_text = clean_text[:max_characters]

    # ---- Step 3: generate the script and the audio --------------------------
    st.divider()
    st.header("Podcast generation")
    st.markdown(
        "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-3-audio-podcast-generation)"
    )
    st.divider()

    st.subheader("Speaker configuration")
    # Show the default speakers without their "id" field. Copies are used so
    # the imported DEFAULT_SPEAKERS configuration is not mutated in place.
    editable_speakers = [
        {field: value for field, value in s.items() if field != "id"}
        for s in DEFAULT_SPEAKERS
    ]
    speakers = st.data_editor(editable_speakers, num_rows="dynamic")

    if st.button("Generate Podcast", on_click=gen_button_clicked):
        # Start from a clean slate so a second generation does not get
        # appended to the script/audio of the previous one.
        st.session_state[script] = ""
        st.session_state[audio] = []

        # Ids are assigned from the row order of the (possibly edited) table.
        for n, speaker in enumerate(speakers):
            speaker["id"] = n + 1
        # Only fully specified speakers are described in the prompt.
        speakers_str = "\n".join(
            str(Speaker.model_validate(speaker))
            for speaker in speakers
            if all(
                speaker.get(x, None) for x in ["name", "description", "voice_profile"]
            )
        )
        system_prompt = DEFAULT_PROMPT.replace("{SPEAKERS}", speakers_str)

        # Loop invariants: id -> voice profile lookup and the speaker pattern.
        voice_profiles = {
            speaker["id"]: speaker.get("voice_profile") for speaker in speakers
        }
        speaker_pattern = re.compile(r"Speaker (\d+)")

        with st.spinner("Generating Podcast..."):
            text = ""
            for chunk in text_to_text_stream(
                clean_text, text_model, system_prompt=system_prompt.strip()
            ):
                text += chunk
                # Keep accumulating until a complete line is available.
                if not text.endswith("\n"):
                    continue
                # A line is only usable once it names a numbered speaker;
                # a bare "Speaker" without a number must not crash the app.
                match = speaker_pattern.search(text)
                if match is None:
                    continue

                st.session_state[script] += text
                st.write(text)

                speaker_id = match.group(1)
                voice_profile = voice_profiles.get(int(speaker_id))
                if not voice_profile:
                    # The model referred to a speaker that is not (fully)
                    # configured: keep the script line but skip its audio.
                    st.warning(
                        f"No voice profile configured for Speaker {speaker_id};"
                        " skipping audio for this line."
                    )
                    text = ""
                    continue

                with st.spinner("Generating Audio..."):
                    speech = text_to_speech(
                        text.split(f'"Speaker {speaker_id}":')[-1],
                        speech_model,
                        voice_profile,
                    )
                st.audio(speech, sample_rate=speech_model.audio_codec.sr)

                st.session_state[audio].append(speech)
                text = ""

    # ---- Saving the results --------------------------------------------------
    # The save buttons appear once "Generate Podcast" has been clicked at
    # least once in this session.
    if st.session_state[gen_button]:
        if st.button("Save Podcast to audio file"):
            if not st.session_state[audio]:
                st.warning("No audio has been generated yet.")
            else:
                # Concatenate into a local array and leave the per-line list
                # in the session state intact, so saving again (or generating
                # again) keeps working.
                podcast_audio = np.concatenate(st.session_state[audio])
                sf.write(
                    "podcast.wav",
                    podcast_audio,
                    samplerate=speech_model.audio_codec.sr,
                )
                st.markdown("Podcast saved to disk!")

        if st.button("Save Podcast script to text file"):
            # The closing brace (presumably ending the script's JSON-like
            # object) is added to the written text only, so repeated saves
            # do not pile up extra braces in the session state.
            with open("script.txt", "w", encoding="utf-8") as f:
                f.write(st.session_state[script] + "}")

            st.markdown("Script saved to disk!")
|