import os
import time
from datetime import datetime

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import whisper
from ctransformers import AutoModelForCausalLM
from diffusers import DiffusionPipeline
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    MusicgenForConditionalGeneration,
    VitsModel,
    set_seed,
)

from model.bart import BartCaptionModel
from utils.audio_utils import load_audio, STR_CH_FIRST
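
# Pipeline overview:
#  - Whisper transcribes Polish speech from the microphone.
#  - A Polish LLaMA instruct model (trurl-2-7b GGML, via ctransformers) answers in the chat tab.
#  - VITS (facebook/mms-tts-pol) reads the answer back as speech.
#  - MusicGen generates a short music clip from radio-button selections.
#  - LP-MusicCaps captions the generated clip and SDXL turns the caption into images.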

def image_grid(imgs, rows, cols):
    """Paste a list of PIL images into a single rows x cols grid."""
    assert len(imgs) == rows * cols
    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))
    for i, img in enumerate(imgs):
        grid.paste(img, box=(i % cols * w, i // cols * h))
    return grid

def save_to_txt(text_to_save):
    with open('prompt.txt', 'w', encoding='utf-8') as f:
        f.write(text_to_save)


def read_txt():
    with open('prompt.txt', encoding='utf-8') as f:
        lines = f.readlines()
    return lines
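
# prompt.txt acts as a small scratch file: the Audio Generation tab writes the
# last MusicGen prompt here, and the Image Generation tab reads it back to
# build the Stable Diffusion prompt.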

##### Chat with LLaMA #####
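# LLM decoding parameters: sampling with temperature/top_p, token-by-token
# streaming, and stopping on the chat markers used in the prompt below.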
params = {
    "max_new_tokens": 512,
    "stop": ["<end>", "<|endoftext|>", "[", "<user>"],
    "temperature": 0.7,
    "top_p": 0.8,
    "stream": True,
    "batch_size": 8,
}

# `device` is used for the MusicGen inputs and the captioning model below.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

whisper_model = whisper.load_model("medium").to("cuda")
print("Whisper Loaded!")

llm = AutoModelForCausalLM.from_pretrained("Aspik101/trurl-2-7b-pl-instruct_GGML", model_type="llama")
print("LLM Loaded!")

tts_model = VitsModel.from_pretrained("facebook/mms-tts-pol").to("cuda")
print("TTS Loaded!")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-pol")

pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
                                         torch_dtype=torch.float16,
                                         use_safetensors=True,
                                         variant="fp16").to("cuda")
print("DiffusionPipeline Loaded!")

model_audio_gen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to("cuda")
processor_audio_gen = AutoProcessor.from_pretrained("facebook/musicgen-small")
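
# Note: Whisper, the GGML LLaMA, VITS, SDXL and MusicGen are all kept in memory
# at once (most of them on the GPU), so this script assumes a machine with
# enough VRAM for the whole pipeline.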

with gr.Blocks() as chat_demo:
    chatbot = gr.Chatbot()
    audio_input = gr.Audio(source="microphone", type="filepath", show_label=False)
    submit_audio = gr.Button("Submit Audio")
    clear = gr.Button("Clear")
    audio_output = gr.Audio('temp_file.wav', label="Generated Audio (wav)", type='filepath', autoplay=False)

    def translate(audio):
        print("Sending the recording to Whisper!")
        transcription = whisper_model.transcribe(audio, language="pl")
        return transcription["text"]

    def read_text(text):
        print("Text to read aloud:", text[-1][-1])
        inputs = tokenizer(text[-1][-1], return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = tts_model(**inputs).waveform.squeeze().cpu().numpy()
        sf.write('temp_file.wav', output, tts_model.config.sampling_rate)
        return 'temp_file.wav'

    def user(audio_data, history):
        if not audio_data:
            return history
        user_message = translate(audio_data)
        print("USER:")
        print("", history + [[user_message, None]])
        return history + [[user_message, None]]

    def parse_history(hist):
        history_ = ""
        for q, a in hist:
            history_ += f"<user>: {q} \n"
            if a:
                history_ += f"<assistant>: {a} \n"
        return history_

    def bot(history):
        print(f"When: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')}")
        # System prompt kept in Polish on purpose: "You are an AI assistant. Answer briefly and in Polish."
        prompt = f"Jesteś AI asystentem. Odpowiadaj krótko i po polsku. {parse_history(history)}. <assistant>:"
        stream = llm(prompt, **params)
        history[-1][1] = ""
        answer_save = ""
        for character in stream:
            history[-1][1] += character
            answer_save += character
            time.sleep(0.005)
            yield history
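
    # Event chain: transcribe the recording (user), stream the LLM answer into
    # the chatbot (bot), then synthesize the final answer with VITS (read_text).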
    submit_audio.click(user, [audio_input, chatbot], [chatbot], queue=False).then(bot, chatbot, chatbot).then(read_text, chatbot, audio_output)
    clear.click(lambda: None, None, chatbot, queue=False)

##### Audio Gen ####
sampling_rate = model_audio_gen.audio_encoder.config.sampling_rate
frame_rate = model_audio_gen.audio_encoder.config.frame_rate
text_encoder = model_audio_gen.get_text_encoder()
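
# generate_audio builds a text prompt from the radio selections and runs
# MusicGen with classifier-free guidance. The second entry in the processor
# call ("drums") appears to act as a negative prompt: its encoder states are
# passed alongside the real prompt so guidance steers generation away from it.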
def generate_audio(decade, genre, instrument, guidance_scale=8, audio_length_in_s=20, seed=0):
    prompt = " ".join([decade, genre, 'track with', instrument])
    save_to_txt(prompt)
    inputs = processor_audio_gen(
        text=[prompt, "drums"],  # [prompt, negative prompt]
        padding=True,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        encoder_outputs = text_encoder(**inputs)
    max_new_tokens = int(frame_rate * audio_length_in_s)
    set_seed(seed)
    audio_values = model_audio_gen.generate(inputs.input_ids[0][None, :], attention_mask=inputs.attention_mask, encoder_outputs=encoder_outputs, do_sample=True, guidance_scale=guidance_scale, max_new_tokens=max_new_tokens)
    sf.write('generated_audio.wav', audio_values.cpu().numpy()[0][0], sampling_rate)
    # Return a mono int16 waveform for the numpy audio output.
    audio_values = (audio_values.cpu().numpy()[0][0] * 32767).astype(np.int16)
    return (sampling_rate, audio_values)

audio_gen = gr.Interface(
    fn=generate_audio,
    inputs=[
        # gr.Text(label="Negative prompt", value="drums"),
        gr.Radio(["50s", "60s", "70s", "80s", "90s"], label="decade", info=""),
        gr.Radio(["classic", "rock", "pop", "metal", "jazz", "synth"], label="genre", info=""),
        gr.Radio(["acoustic guitar", "electric guitar", "drums", "saxophone", "keyboard", "accordion", "fiddle"], label="instrument", info=""),
        gr.Slider(1.5, 10, value=8, step=0.5, label="Guidance scale"),
        gr.Slider(5, 30, value=20, step=5, label="Audio length in s"),
        # gr.Slider(0, 10, value=0, step=1, label="Seed"),
    ],
    outputs=[
        gr.Audio(label="Generated Music", type="numpy"),
    ],
    # examples=EXAMPLES,
)

#### Audio captioning and Stable Diffusion ####
if not os.path.isfile("transfer.pth"):
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav', 'folk.wav')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')

example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']

model = BartCaptionModel(max_length=128)
pretrained_object = torch.load('./transfer.pth', map_location='cpu')
state_dict = pretrained_object['state_dict']
model.load_state_dict(state_dict)
if torch.cuda.is_available():
    torch.cuda.set_device(device)
    model = model.cuda(device)
model.eval()

def get_audio(audio_path, duration=10, target_sr=16000):
    n_samples = int(duration * target_sr)
    audio, sr = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )
    if len(audio.shape) == 2:
        audio = audio.mean(axis=0)  # downmix to mono
    input_size = int(n_samples)
    if audio.shape[-1] < input_size:  # pad short clips up to one chunk
        pad = np.zeros(input_size)
        pad[: audio.shape[-1]] = audio
        audio = pad
    ceil = int(audio.shape[-1] // n_samples)
    audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
    return audio
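
# get_audio pads anything shorter than 10 s and then splits the waveform into
# consecutive 10-second chunks; each chunk is captioned independently below.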

def caption_chunks(audio_path):
    """Run LP-MusicCaps on each 10-second chunk and return a timestamped caption string."""
    audio_tensor = get_audio(audio_path=audio_path)
    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)
    with torch.no_grad():
        output = model.generate(
            samples=audio_tensor,
            num_beams=5,
        )
    inference = ""
    for chunk, text in enumerate(output):
        timestamp = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
        inference += f"{timestamp}\n{text} \n \n"
    return inference
title = "" | |
description = "" | |
article = "" | |

def captioning():
    # Caption the track produced by the Audio Generation tab.
    inference = caption_chunks('generated_audio.wav')
    prompt = read_txt()
    print(prompt[0])
    # Generate an image from the text (single-image variant kept for reference):
    # generated_images = pipe(prompt=prompt[0]*5 + inference + prompt[0]*5).images
    # image = generated_images[0]
    # The MusicGen prompt is repeated five times on each side of the caption to
    # give it more weight in the Stable Diffusion prompt.
    num_images = 3
    prompt = [prompt[0] * 5 + inference + prompt[0] * 5] * num_images
    images = pipe(prompt, height=768, width=768).images
    grid = image_grid(images, rows=1, cols=num_images)
    return inference, grid
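
# The interface below takes no inputs: it always captions generated_audio.wav,
# i.e. whatever the Audio Generation tab produced last.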
audio_desc = gr.Interface(fn=captioning,
                          inputs=None,
                          outputs=[
                              gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
                              gr.Image(label="Generated Image"),  # added output for the image
                          ],
                          title=title,
                          description=description,
                          article=article,
                          cache_examples=False
                          )

music = gr.Video("muzyka_AI.mp4")
voice_cloning = gr.Video("voice_cloning_fraud.mp4")

##### Run All #####
demo_all = gr.TabbedInterface(
    [music, audio_gen, audio_desc, voice_cloning, chat_demo],
    ["1. Music", "2. Audio Generation", "3. Image Generation", "4. Voice Cloning", "5. Chat with LLaMA"],
)
demo_all.queue()
demo_all.launch()