Spaces:
Running
on
A100
Running
on
A100
https://huggingface.co/spaces/fffiloni/YuE
#3
by
wowsuffer
- opened
- app.py +17 -32
- inference/infer.py +2 -6
- requirements.txt +1 -1
app.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import subprocess
|
3 |
import os
|
4 |
-
import re
|
5 |
import shutil
|
6 |
import tempfile
|
7 |
|
@@ -67,19 +66,11 @@ def empty_output_folder(output_dir):
|
|
67 |
print(f"Error deleting file {file_path}: {e}")
|
68 |
|
69 |
# Function to create a temporary file with string content
|
70 |
-
def create_temp_file(content,
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
temp_file.close()
|
76 |
-
|
77 |
-
# Debug: Print file contents
|
78 |
-
print(f"\nContent written to {prefix}{suffix}:")
|
79 |
-
print(content)
|
80 |
-
print("---")
|
81 |
-
|
82 |
-
return temp_file.name
|
83 |
|
84 |
def get_last_mp3_file(output_dir):
|
85 |
# List all files in the output directory
|
@@ -103,8 +94,8 @@ def get_last_mp3_file(output_dir):
|
|
103 |
|
104 |
def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
105 |
# Create temporary files
|
106 |
-
genre_txt_path = create_temp_file(genre_txt_content,
|
107 |
-
lyrics_txt_path = create_temp_file(lyrics_txt_content,
|
108 |
|
109 |
print(f"Genre TXT path: {genre_txt_path}")
|
110 |
print(f"Lyrics TXT path: {lyrics_txt_path}")
|
@@ -124,10 +115,11 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
|
124 |
"--genre_txt", f"{genre_txt_path}",
|
125 |
"--lyrics_txt", f"{lyrics_txt_path}",
|
126 |
"--run_n_segments", str(num_segments),
|
127 |
-
"--stage2_batch_size", "
|
128 |
"--output_dir", f"{output_dir}",
|
129 |
"--cuda_idx", "0",
|
130 |
-
"--max_new_tokens", str(max_new_tokens)
|
|
|
131 |
]
|
132 |
|
133 |
# Set up environment variables for CUDA with optimized settings
|
@@ -155,17 +147,15 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
|
155 |
|
156 |
if last_mp3:
|
157 |
print("Last .mp3 file:", last_mp3)
|
158 |
-
|
159 |
-
vocal_mp3_path = "./output/vocoder/stems/vocal.mp3"
|
160 |
-
return last_mp3, instrumental_mp3_path, vocal_mp3_path
|
161 |
else:
|
162 |
-
return None
|
163 |
else:
|
164 |
print("Output folder is empty.")
|
165 |
-
|
166 |
except subprocess.CalledProcessError as e:
|
167 |
print(f"Error occurred: {e}")
|
168 |
-
|
169 |
finally:
|
170 |
# Clean up temporary files
|
171 |
os.remove(genre_txt_path)
|
@@ -215,22 +205,17 @@ with gr.Blocks() as demo:
|
|
215 |
)
|
216 |
lyrics_txt = gr.Textbox(
|
217 |
label="Lyrics", lines=12,
|
218 |
-
placeholder=""
|
219 |
-
Type the lyrics here...
|
220 |
-
At least 2 segments, Annotate your segments with brackets, [verse] [chorus] [bridge]""",
|
221 |
info="Text containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process."
|
222 |
)
|
223 |
|
224 |
with gr.Column():
|
225 |
|
226 |
-
num_segments = gr.Number(label="Number of Segments", value=2, interactive=
|
227 |
max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum="3000", step=500, value=1500, interactive=True)
|
228 |
|
229 |
submit_btn = gr.Button("Submit")
|
230 |
music_out = gr.Audio(label="Audio Result")
|
231 |
-
with gr.Accordion("Vocal & Instrumental", open=False):
|
232 |
-
instrumental = gr.Audio(label="Instrumental")
|
233 |
-
vocal = gr.Audio(label="Vocal")
|
234 |
|
235 |
gr.Examples(
|
236 |
examples = [
|
@@ -273,6 +258,6 @@ Living out my dreams with this mic and a deal"""
|
|
273 |
submit_btn.click(
|
274 |
fn = infer,
|
275 |
inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
|
276 |
-
outputs = [music_out
|
277 |
)
|
278 |
demo.queue().launch(show_api=False, show_error=True)
|
|
|
1 |
import gradio as gr
|
2 |
import subprocess
|
3 |
import os
|
|
|
4 |
import shutil
|
5 |
import tempfile
|
6 |
|
|
|
66 |
print(f"Error deleting file {file_path}: {e}")
|
67 |
|
68 |
# Function to create a temporary file with string content
|
69 |
+
def create_temp_file(content, suffix=".txt"):
|
70 |
+
fd, path = tempfile.mkstemp(suffix=suffix)
|
71 |
+
with os.fdopen(fd, "w", encoding="utf-8") as f:
|
72 |
+
f.write(content)
|
73 |
+
return path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
def get_last_mp3_file(output_dir):
|
76 |
# List all files in the output directory
|
|
|
94 |
|
95 |
def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
96 |
# Create temporary files
|
97 |
+
genre_txt_path = create_temp_file(genre_txt_content, ".txt")
|
98 |
+
lyrics_txt_path = create_temp_file(lyrics_txt_content, ".txt")
|
99 |
|
100 |
print(f"Genre TXT path: {genre_txt_path}")
|
101 |
print(f"Lyrics TXT path: {lyrics_txt_path}")
|
|
|
115 |
"--genre_txt", f"{genre_txt_path}",
|
116 |
"--lyrics_txt", f"{lyrics_txt_path}",
|
117 |
"--run_n_segments", str(num_segments),
|
118 |
+
"--stage2_batch_size", "4",
|
119 |
"--output_dir", f"{output_dir}",
|
120 |
"--cuda_idx", "0",
|
121 |
+
"--max_new_tokens", str(max_new_tokens),
|
122 |
+
"--disable_offload_model"
|
123 |
]
|
124 |
|
125 |
# Set up environment variables for CUDA with optimized settings
|
|
|
147 |
|
148 |
if last_mp3:
|
149 |
print("Last .mp3 file:", last_mp3)
|
150 |
+
return last_mp3
|
|
|
|
|
151 |
else:
|
152 |
+
return None
|
153 |
else:
|
154 |
print("Output folder is empty.")
|
155 |
+
return None
|
156 |
except subprocess.CalledProcessError as e:
|
157 |
print(f"Error occurred: {e}")
|
158 |
+
return None
|
159 |
finally:
|
160 |
# Clean up temporary files
|
161 |
os.remove(genre_txt_path)
|
|
|
205 |
)
|
206 |
lyrics_txt = gr.Textbox(
|
207 |
label="Lyrics", lines=12,
|
208 |
+
placeholder="Type the lyrics here...",
|
|
|
|
|
209 |
info="Text containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process."
|
210 |
)
|
211 |
|
212 |
with gr.Column():
|
213 |
|
214 |
+
num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
|
215 |
max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum="3000", step=500, value=1500, interactive=True)
|
216 |
|
217 |
submit_btn = gr.Button("Submit")
|
218 |
music_out = gr.Audio(label="Audio Result")
|
|
|
|
|
|
|
219 |
|
220 |
gr.Examples(
|
221 |
examples = [
|
|
|
258 |
submit_btn.click(
|
259 |
fn = infer,
|
260 |
inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
|
261 |
+
outputs = [music_out]
|
262 |
)
|
263 |
demo.queue().launch(show_api=False, show_error=True)
|
inference/infer.py
CHANGED
@@ -76,7 +76,7 @@ print(f"Using device: {device}")
|
|
76 |
mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
|
77 |
model = AutoModelForCausalLM.from_pretrained(
|
78 |
stage1_model,
|
79 |
-
torch_dtype=torch.
|
80 |
attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
|
81 |
)
|
82 |
model.to(device)
|
@@ -120,18 +120,15 @@ stage1_output_set = []
|
|
120 |
# Tips:
|
121 |
# genre tags support instrumental,genre,mood,vocal timbre and vocal gender
|
122 |
# all kinds of tags are needed
|
123 |
-
# Ensure files exist
|
124 |
with open(args.genre_txt) as f:
|
125 |
genres = f.read().strip()
|
126 |
-
print(genres)
|
127 |
with open(args.lyrics_txt) as f:
|
128 |
lyrics = split_lyrics(f.read())
|
129 |
-
print(lyrics)
|
130 |
# instruction
|
131 |
full_lyrics = "\n".join(lyrics)
|
132 |
prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
|
133 |
prompt_texts += lyrics
|
134 |
-
|
135 |
|
136 |
random_id = uuid.uuid4()
|
137 |
output_seq = None
|
@@ -144,7 +141,6 @@ start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
|
|
144 |
end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
|
145 |
# Format text prompt
|
146 |
run_n_segments = min(args.run_n_segments+1, len(lyrics))
|
147 |
-
print(f"RUN N SEGMENTS: {run_n_segments}")
|
148 |
for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
|
149 |
section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
|
150 |
guidance_scale = 1.5 if i <=1 else 1.2
|
|
|
76 |
mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
|
77 |
model = AutoModelForCausalLM.from_pretrained(
|
78 |
stage1_model,
|
79 |
+
torch_dtype=torch.bfloat16,
|
80 |
attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
|
81 |
)
|
82 |
model.to(device)
|
|
|
120 |
# Tips:
|
121 |
# genre tags support instrumental,genre,mood,vocal timbre and vocal gender
|
122 |
# all kinds of tags are needed
|
|
|
123 |
with open(args.genre_txt) as f:
|
124 |
genres = f.read().strip()
|
|
|
125 |
with open(args.lyrics_txt) as f:
|
126 |
lyrics = split_lyrics(f.read())
|
|
|
127 |
# instruction
|
128 |
full_lyrics = "\n".join(lyrics)
|
129 |
prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
|
130 |
prompt_texts += lyrics
|
131 |
+
|
132 |
|
133 |
random_id = uuid.uuid4()
|
134 |
output_seq = None
|
|
|
141 |
end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
|
142 |
# Format text prompt
|
143 |
run_n_segments = min(args.run_n_segments+1, len(lyrics))
|
|
|
144 |
for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
|
145 |
section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
|
146 |
guidance_scale = 1.5 if i <=1 else 1.2
|
requirements.txt
CHANGED
@@ -3,7 +3,7 @@ torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118
|
|
3 |
omegaconf
|
4 |
einops
|
5 |
numpy<2
|
6 |
-
|
7 |
sentencepiece
|
8 |
tqdm
|
9 |
tensorboard
|
|
|
3 |
omegaconf
|
4 |
einops
|
5 |
numpy<2
|
6 |
+
transformers
|
7 |
sentencepiece
|
8 |
tqdm
|
9 |
tensorboard
|