Spaces:

Monke64
/

ThetaM2V

Sleeping

App Files Files Community

Monke64 committed on Jul 15, 2024

Commit

728ab38

1 Parent(s): 162019a

Added code

Browse files

Files changed (15) hide show

LoRA dataset/Training script/.ipynb_checkpoints/training_script-checkpoint.ipynb +33 -0
LoRA dataset/Training script/training_script.ipynb +33 -0
LoRA dataset/Weights/.gitattributes +1 -0
LoRA dataset/Weights/pytorch_lora_weights.safetensors +3 -0
MusicCaps/.gitattributes +1 -0
MusicCaps/__pycache__/audio_utils.cpython-310.pyc +0 -0
MusicCaps/__pycache__/bart.cpython-310.pyc +0 -0
MusicCaps/__pycache__/modules.cpython-310.pyc +0 -0
MusicCaps/audio_utils.py +245 -0
MusicCaps/bart.py +151 -0
MusicCaps/modules.py +95 -0
MusicCaps/train_model.py +32 -0
MusicCaps/transfer.pth +3 -0
app.py +125 -0
requirements.txt +0 -0

LoRA dataset/Training script/.ipynb_checkpoints/training_script-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b9bbec2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

LoRA dataset/Training script/training_script.ipynb ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b9bbec2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

LoRA dataset/Weights/.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ pytorch_lora_weights.safetensors filter=lfs diff=lfs merge=lfs -text

LoRA dataset/Weights/pytorch_lora_weights.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1b19610541f9a2c6f235a1bac2690d04b98535f9f9f7790e9ad4d0fe8ac89b0
+size 3226184

MusicCaps/.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ transfer.pth filter=lfs diff=lfs merge=lfs -text

MusicCaps/__pycache__/audio_utils.cpython-310.pyc ADDED Viewed

Binary file (7.7 kB). View file

MusicCaps/__pycache__/bart.cpython-310.pyc ADDED Viewed

Binary file (4.57 kB). View file

MusicCaps/__pycache__/modules.cpython-310.pyc ADDED Viewed

Binary file (3.27 kB). View file

MusicCaps/audio_utils.py ADDED Viewed

	@@ -0,0 +1,245 @@

+STR_CLIP_ID = 'clip_id'
+STR_AUDIO_SIGNAL = 'audio_signal'
+STR_TARGET_VECTOR = 'target_vector'
+STR_CH_FIRST = 'channels_first'
+STR_CH_LAST = 'channels_last'
+import io
+import os
+import tqdm
+import logging
+import subprocess
+from typing import Tuple
+from pathlib import Path
+import librosa
+import numpy as np
+import soundfile as sf
+import itertools
+from numpy.fft import irfft
+def _resample_load_ffmpeg(path: str, sample_rate: int, downmix_to_mono: bool) -> Tuple[np.ndarray, int]:
+    """
+    Decode, optionally downmix, and optionally resample an audio file with the ffmpeg CLI.
+    ffmpeg writes a WAV stream to stdout; that stream is parsed with soundfile and the
+    resulting array is transposed before it is returned.
+    Args:
+        path: path of the audio file handed to ``ffmpeg -i``.
+        sample_rate: target sampling rate; a falsy value (None or 0) omits ``-ar`` so the source rate is kept.
+        downmix_to_mono: if True, request a single output channel (``-ac 1``).
+    Returns:
+        (audio signal, sample rate), where the signal is the transpose of what ``soundfile.read`` returns.
+    Raises:
+        RuntimeError: if the ffmpeg executable cannot be started or exits with a non-zero status.
+    """
+    # Pass an argument list (no shell) so quotes or shell metacharacters in `path`
+    # are never interpreted as shell syntax.
+    cmd = ['ffmpeg', '-i', str(path)]
+    if downmix_to_mono:
+        cmd += ['-ac', '1']  # downmixing option
+    if sample_rate:
+        cmd += ['-ar', str(sample_rate)]  # downsampling option
+    cmd += ['-f', 'wav', '-']  # WAV container written to stdout
+    try:
+        proc = subprocess.run(cmd, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except FileNotFoundError as err:
+        raise RuntimeError('ffmpeg executable not found') from err
+    if proc.returncode != 0:
+        # Surface ffmpeg's own diagnostics instead of failing later on an empty byte stream.
+        detail = proc.stderr.decode(errors='replace').strip()
+        raise RuntimeError(f'ffmpeg failed to decode "{path}" (exit code {proc.returncode}): {detail}')
+    src, sr = sf.read(io.BytesIO(proc.stdout))
+    return src.T, sr
+def _resample_load_librosa(path, sample_rate: int, downmix_to_mono: bool, **kwargs) -> Tuple[np.ndarray, int]:
+    """
+    Load an audio file through ``librosa.load``.
+    ``sample_rate`` is forwarded as ``sr`` and ``downmix_to_mono`` as ``mono``;
+    any extra keyword arguments are passed to ``librosa.load`` unchanged.
+    Returns:
+        the (audio signal, sample rate) pair produced by ``librosa.load``.
+    """
+    loaded = librosa.load(path, sr=sample_rate, mono=downmix_to_mono, **kwargs)
+    signal, rate = loaded
+    return signal, rate
+def load_audio(
+    path: "str | Path",
+    ch_format: str,
+    sample_rate: int = None,
+    downmix_to_mono: bool = False,
+    resample_by: str = 'librosa',
+    **kwargs,
+) -> Tuple[np.ndarray, int]:
+    """Load an audio file with either the librosa or the ffmpeg backend.
+    Defaults to ``sample_rate=None`` and ``downmix_to_mono=False``.
+    NOTE: `ch_format` is validated but not applied. The array is returned exactly
+    as the selected backend produced it, so it is NOT forced to be 2-dim and is
+    not transposed to channels-last; callers must handle a 1-dim (mono) result.
+    Args:
+        path: audio file path
+        ch_format: one of 'channels_first' or 'channels_last' (validated only)
+        sample_rate: target sampling rate, forwarded to the backend
+        downmix_to_mono: forwarded to the backend
+        resample_by (str): 'librosa' or 'ffmpeg'. it decides backend for audio decoding and resampling.
+        **kwargs: keyword args for librosa.load - offset, duration, dtype, res_type.
+            Ignored by the 'ffmpeg' backend.
+    Returns:
+        (audio, sr) tuple
+    Raises:
+        ValueError: if `ch_format` is not one of the two supported strings.
+        NotImplementedError: if `resample_by` names an unknown backend.
+    """
+    if ch_format not in (STR_CH_FIRST, STR_CH_LAST):
+        raise ValueError(f'ch_format is wrong here -> {ch_format}')
+    if resample_by == 'librosa':
+        src, sr = _resample_load_librosa(path, sample_rate, downmix_to_mono, **kwargs)
+    elif resample_by == 'ffmpeg':
+        src, sr = _resample_load_ffmpeg(path, sample_rate, downmix_to_mono)
+    else:
+        raise NotImplementedError(f'resample_by: "{resample_by}" is not supported yet')
+    return src, sr
+def ms(x):
+    """Return the mean of the squared magnitude of `x`.
+    :param x: Dynamic quantity.
+    :returns: Mean squared of `x`.
+    """
+    magnitude = np.abs(x)
+    squared = magnitude ** 2.0
+    return squared.mean()
+def normalize(y, x=None):
+    """Rescale `y` so that its mean-square power matches a target.
+    The target is the mean-square power of `x` when `x` is given, and 1.0
+    otherwise (the power of a zero-mean, unit-variance white noise signal).
+    """
+    target_power = 1.0 if x is None else ms(x)
+    gain = np.sqrt(target_power / ms(y))
+    return y * gain
+def noise(N, color='white', state=None):
+    """Noise generator.
+    :param N: Amount of samples.
+    :param color: Color of noise; must be a key of `_noise_generators`.
+    :param state: State of PRNG.
+    :type state: :class:`np.random.RandomState`
+    :raises ValueError: if `color` is not a known noise color.
+    """
+    # Only the lookup is guarded, so a KeyError raised inside a generator
+    # is not misreported as an unknown color.
+    try:
+        generator = _noise_generators[color]
+    except KeyError:
+        raise ValueError("Incorrect color.") from None
+    return generator(N, state)
+def white(N, state=None):
+    """
+    White noise.
+    :param N: Amount of samples.
+    :param state: State of PRNG; a fresh `RandomState` is created when omitted.
+    :type state: :class:`np.random.RandomState`
+    White noise has a constant power density. Its narrowband spectrum is therefore flat.
+    The power in white noise will increase by a factor of two for each octave band,
+    and therefore increases with 3 dB per octave.
+    """
+    rng = state if state is not None else np.random.RandomState()
+    samples = rng.randn(N)
+    return samples
+def pink(N, state=None):
+    """
+    Pink noise.
+    :param N: Amount of samples.
+    :param state: State of PRNG; a fresh `RandomState` is created when omitted.
+    :type state: :class:`np.random.RandomState`
+    Pink noise has equal power in bands that are proportionally wide.
+    Power density decreases with 3 dB per octave.
+    """
+    rng = state if state is not None else np.random.RandomState()
+    odd = N % 2
+    n_bins = N // 2 + 1 + odd
+    # Draw the real part first, then the imaginary part, of a random half-spectrum.
+    real_part = rng.randn(n_bins)
+    imag_part = rng.randn(n_bins)
+    spectrum = real_part + 1j * imag_part
+    shaping = np.sqrt(np.arange(n_bins) + 1.)  # +1 to avoid divide by zero
+    samples = irfft(spectrum / shaping).real
+    if odd:
+        # One extra bin was used for odd N, so the inverse FFT is one sample too long.
+        samples = samples[:-1]
+    return normalize(samples)
+def blue(N, state=None):
+    """
+    Blue noise.
+    :param N: Amount of samples.
+    :param state: State of PRNG; a fresh `RandomState` is created when omitted.
+    :type state: :class:`np.random.RandomState`
+    Power increases with 6 dB per octave.
+    Power density increases with 3 dB per octave.
+    """
+    rng = state if state is not None else np.random.RandomState()
+    odd = N % 2
+    n_bins = N // 2 + 1 + odd
+    # Draw the real part first, then the imaginary part, of a random half-spectrum.
+    real_part = rng.randn(n_bins)
+    imag_part = rng.randn(n_bins)
+    spectrum = real_part + 1j * imag_part
+    shaping = np.sqrt(np.arange(n_bins))  # Filter
+    samples = irfft(spectrum * shaping).real
+    if odd:
+        # One extra bin was used for odd N, so the inverse FFT is one sample too long.
+        samples = samples[:-1]
+    return normalize(samples)
+def brown(N, state=None):
+    """
+    Brown noise.
+    :param N: Amount of samples.
+    :param state: State of PRNG; a fresh `RandomState` is created when omitted.
+    :type state: :class:`np.random.RandomState`
+    Power decreases with 3 dB per octave.
+    Power density decreases with 6 dB per octave.
+    """
+    rng = state if state is not None else np.random.RandomState()
+    odd = N % 2
+    n_bins = N // 2 + 1 + odd
+    # Draw the real part first, then the imaginary part, of a random half-spectrum.
+    real_part = rng.randn(n_bins)
+    imag_part = rng.randn(n_bins)
+    spectrum = real_part + 1j * imag_part
+    shaping = np.arange(n_bins) + 1  # Filter; +1 avoids dividing by zero
+    samples = irfft(spectrum / shaping).real
+    if odd:
+        # One extra bin was used for odd N, so the inverse FFT is one sample too long.
+        samples = samples[:-1]
+    return normalize(samples)
+def violet(N, state=None):
+    """
+    Violet noise.
+    :param N: Amount of samples.
+    :param state: State of PRNG; a fresh `RandomState` is created when omitted.
+    :type state: :class:`np.random.RandomState`
+    Power increases with +9 dB per octave.
+    Power density increases with +6 dB per octave.
+    """
+    rng = state if state is not None else np.random.RandomState()
+    odd = N % 2
+    n_bins = N // 2 + 1 + odd
+    # Draw the real part first, then the imaginary part, of a random half-spectrum.
+    real_part = rng.randn(n_bins)
+    imag_part = rng.randn(n_bins)
+    spectrum = real_part + 1j * imag_part
+    shaping = np.arange(n_bins)  # Filter
+    samples = irfft(spectrum * shaping).real
+    if odd:
+        # One extra bin was used for odd N, so the inverse FFT is one sample too long.
+        samples = samples[:-1]
+    return normalize(samples)
+# Dispatch table used by `noise`: maps a color name to the function that
+# generates that kind of noise. Every entry is called as `generator(N, state)`.
+_noise_generators = {
+    'white': white,
+    'pink': pink,
+    'blue': blue,
+    'brown': brown,
+    'violet': violet,
+}
+def noise_generator(N=44100, color='white', state=None):
+    """Endless noise sample generator.
+    :param N: Amount of unique samples to generate.
+    :param color: Color of noise.
+    :param state: State of PRNG, forwarded to `noise`.
+    Generate `N` amount of unique samples once and cycle over these samples forever.
+    """
+    unique_samples = noise(N, color, state)
+    yield from itertools.cycle(unique_samples)
+def heaviside(N):
+    """Heaviside step function.
+    Returns the value 0 for `N < 0`, 1 for `N > 0`, and 1/2 for `N = 0`.
+    """
+    shifted_sign = np.sign(N) + 1  # 0, 1 or 2
+    return 0.5 * shifted_sign

MusicCaps/bart.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from MusicCaps.modules import AudioEncoder
+from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
+class BartCaptionModel(nn.Module):
+    """Audio captioning model: an `AudioEncoder` front end feeding a BART encoder-decoder.
+
+    The BART network is instantiated from the configuration of `bart_type` only
+    (`BartForConditionalGeneration(bart_config)`), so no pretrained BART weights are
+    loaded here; the tokenizer is loaded with `from_pretrained`.
+    """
+    def __init__(self, n_mels=128, num_of_conv=6, sr=16000, duration=10, max_length=128, label_smoothing=0.1, bart_type="facebook/bart-base", audio_dim=768):
+        """
+        Args:
+            n_mels: number of mel bins passed to the audio encoder.
+            num_of_conv: total number of conv layers; `num_of_conv - 1` of them are strided.
+            sr: sampling rate used to derive the hop length and the sample count.
+            duration: clip length in seconds (`n_sample = sr * duration`).
+            max_length: maximum tokenized caption length used in `forward_decoder`.
+            label_smoothing: label smoothing of the cross-entropy loss.
+            bart_type: Hugging Face model id used for the BART config and tokenizer.
+            audio_dim: channel width of the audio encoder convolutions.
+        """
+        super().__init__()
+        # non-finetuning case: config only, weights are not downloaded
+        bart_config = BartConfig.from_pretrained(bart_type)
+        self.tokenizer = BartTokenizer.from_pretrained(bart_type)
+        self.bart = BartForConditionalGeneration(bart_config)
+        self.n_sample = sr * duration
+        self.hop_length = int(0.01 * sr) # hard coding hop_size
+        self.n_frames = int(self.n_sample // self.hop_length)
+        self.num_of_stride_conv = num_of_conv - 1
+        # Length of the positional embedding: frame count divided by 2 per strided conv, plus one.
+        self.n_ctx = int(self.n_frames // 2**self.num_of_stride_conv) + 1
+        self.audio_encoder = AudioEncoder(
+            n_mels = n_mels, # hard coding n_mel
+            n_ctx = self.n_ctx,
+            audio_dim = audio_dim,
+            text_dim = self.bart.config.hidden_size,
+            num_of_stride_conv = self.num_of_stride_conv
+        )
+        self.max_length = max_length
+        self.loss_fct = nn.CrossEntropyLoss(label_smoothing= label_smoothing, ignore_index=-100)
+    @property
+    def device(self):
+        """Device of the first model parameter."""
+        # next() avoids materialising the whole parameter list just to read one entry.
+        return next(self.parameters()).device
+    def shift_tokens_right(self, input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+        """
+        Shift input ids one token to the right.
+
+        Position 0 becomes `decoder_start_token_id`, the last token of each row is
+        dropped, and any -100 entries are replaced by `pad_token_id`.
+        Raises:
+            ValueError: if `pad_token_id` is None.
+        """
+        if pad_token_id is None:
+            raise ValueError("self.model.config.pad_token_id has to be defined.")
+        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+        shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+        shifted_input_ids[:, 0] = decoder_start_token_id
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+        return shifted_input_ids
+    def forward_encoder(self, audio):
+        """Embed `audio` and run the BART encoder on the embeddings.
+
+        Returns:
+            (encoder last hidden state, audio embeddings)
+        """
+        audio_embs = self.audio_encoder(audio)
+        encoder_outputs = self.bart.model.encoder(
+            input_ids=None,
+            inputs_embeds=audio_embs,
+            return_dict=True
+        )["last_hidden_state"]
+        return encoder_outputs, audio_embs
+    def forward_decoder(self, text, encoder_outputs):
+        """Compute the label-smoothed cross-entropy loss of `text` given `encoder_outputs`.
+
+        `text` is tokenized here (padded to the longest item, truncated to `max_length`);
+        padding positions are excluded from the loss through the -100 ignore index.
+        """
+        text = self.tokenizer(text,
+                              padding='longest',
+                              truncation=True,
+                              max_length=self.max_length,
+                              return_tensors="pt")
+        input_ids = text["input_ids"].to(self.device)
+        attention_mask = text["attention_mask"].to(self.device)
+        decoder_targets = input_ids.masked_fill(
+            input_ids == self.tokenizer.pad_token_id, -100
+        )
+        decoder_input_ids = self.shift_tokens_right(
+            decoder_targets, self.bart.config.pad_token_id, self.bart.config.decoder_start_token_id
+        )
+        decoder_outputs = self.bart(
+            input_ids=None,
+            attention_mask=None,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=attention_mask,
+            inputs_embeds=None,
+            labels=None,
+            encoder_outputs=(encoder_outputs,),
+            return_dict=True
+        )
+        lm_logits = decoder_outputs["logits"]
+        # Flatten with the logits' own last dimension so the reshape stays valid even
+        # if the tokenizer and the model config ever disagree on the vocabulary size.
+        loss = self.loss_fct(lm_logits.view(-1, lm_logits.size(-1)), decoder_targets.view(-1))
+        return loss
+    def forward(self, audio, text):
+        """Return the captioning loss for a batch of `audio` and its target `text`."""
+        encoder_outputs, _ = self.forward_encoder(audio)
+        loss = self.forward_decoder(text, encoder_outputs)
+        return loss
+    def generate(self,
+                 samples,
+                 use_nucleus_sampling=False,
+                 num_beams=5,
+                 max_length=128,
+                 min_length=2,
+                 top_p=0.9,
+                 repetition_penalty=1.0,
+                 ):
+        """Generate one caption per item in `samples`.
+
+        Args:
+            samples: audio batch accepted by the audio encoder.
+            use_nucleus_sampling: if True, sample with `top_p`; otherwise use beam search.
+            num_beams: beam width (beam search branch only).
+            max_length: maximum generated length.
+            min_length: minimum generated length.
+            top_p: nucleus probability mass (sampling branch only).
+            repetition_penalty: applied in the beam search branch only.
+        Returns:
+            list of decoded caption strings with special tokens removed.
+        """
+        # self.bart.force_bos_token_to_be_generated = True
+        audio_embs = self.audio_encoder(samples)
+        encoder_outputs = self.bart.model.encoder(
+            input_ids=None,
+            attention_mask=None,
+            head_mask=None,
+            inputs_embeds=audio_embs,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=True)
+        batch_size = encoder_outputs['last_hidden_state'].size(0)
+        # Every sequence starts from the decoder start token, with a one-token attention mask.
+        input_ids = torch.full(
+            (batch_size, 1), self.bart.config.decoder_start_token_id, dtype=torch.long, device=self.device
+        )
+        decoder_attention_mask = torch.ones((batch_size, 1), dtype=torch.long, device=self.device)
+        if use_nucleus_sampling:
+            # NOTE(review): this branch ignores the `repetition_penalty` argument and always
+            # uses 1.1. Kept as-is so sampled output does not change; confirm it is intended.
+            outputs = self.bart.generate(
+                input_ids=None,
+                attention_mask=None,
+                decoder_input_ids=input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                encoder_outputs=encoder_outputs,
+                max_length=max_length,
+                min_length=min_length,
+                do_sample=True,
+                top_p=top_p,
+                num_return_sequences=1,
+                repetition_penalty=1.1)
+        else:
+            outputs = self.bart.generate(input_ids=None,
+                                            attention_mask=None,
+                                            decoder_input_ids=input_ids,
+                                            decoder_attention_mask=decoder_attention_mask,
+                                            encoder_outputs=encoder_outputs,
+                                            head_mask=None,
+                                            decoder_head_mask=None,
+                                            inputs_embeds=None,
+                                            decoder_inputs_embeds=None,
+                                            use_cache=None,
+                                            output_attentions=None,
+                                            output_hidden_states=None,
+                                            max_length=max_length,
+                                            min_length=min_length,
+                                            num_beams=num_beams,
+                                            repetition_penalty=repetition_penalty)
+        captions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        return captions

MusicCaps/modules.py ADDED Viewed

	@@ -0,0 +1,95 @@

+### code reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
+import os
+import torch
+import torchaudio
+import numpy as np
+import torch.nn.functional as F
+from torch import Tensor, nn
+from typing import Dict, Iterable, Optional
+# hard-coded audio hyperparameters
+SAMPLE_RATE = 16000
+N_FFT = 1024
+N_MELS = 128
+HOP_LENGTH = int(0.01 * SAMPLE_RATE)
+DURATION = 10
+N_SAMPLES = int(DURATION * SAMPLE_RATE)
+N_FRAMES = N_SAMPLES // HOP_LENGTH + 1
+def sinusoids(length, channels, max_timescale=10000):
+    """Build a (length, 2 * (channels // 2)) sinusoidal positional embedding.
+
+    The first half of the columns holds sines and the second half cosines of the
+    position index scaled by geometrically spaced inverse timescales.
+    """
+    half = channels // 2
+    log_timescale_increment = np.log(max_timescale) / (half - 1)
+    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(half))
+    positions = torch.arange(length).unsqueeze(1)
+    scaled_time = positions * inv_timescales.unsqueeze(0)
+    sin_part = torch.sin(scaled_time)
+    cos_part = torch.cos(scaled_time)
+    return torch.cat([sin_part, cos_part], dim=1)
+class MelEncoder(nn.Module):
+    """
+    Time-frequency representation: waveform -> spectrogram -> mel scale -> decibels.
+    """
+    def __init__(self,
+                sample_rate= 16000,
+                f_min=0,
+                f_max=8000,
+                n_fft=1024,
+                win_length=1024,
+                hop_length = int(0.01 * 16000),
+                n_mels = 128,
+                power = None,
+                pad= 0,
+                normalized= False,
+                center= True,
+                pad_mode= "reflect"
+                ):
+        # NOTE(review): `pad`, `normalized`, `center` and `pad_mode` are accepted but
+        # never forwarded to the transforms below, so changing them has no effect.
+        super(MelEncoder, self).__init__()
+        # NOTE(review): this window is stored but not used by `forward` or passed to
+        # the Spectrogram transform in this class.
+        self.window = torch.hann_window(win_length)
+        self.spec_fn = torchaudio.transforms.Spectrogram(
+            n_fft = n_fft,
+            win_length = win_length,
+            hop_length = hop_length,
+            power = power
+        )
+        self.mel_scale = torchaudio.transforms.MelScale(
+            n_mels,
+            sample_rate,
+            f_min,
+            f_max,
+            n_fft // 2 + 1)
+        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
+    def forward(self, wav):
+        """Return the mel spectrogram of `wav` in decibels."""
+        spec = self.spec_fn(wav)
+        # NOTE(review): only the real part of the spectrogram is squared here; with the
+        # default `power=None` the imaginary part is discarded. Presumably the shipped
+        # checkpoint was trained on this exact feature, so do not change it without
+        # retraining — confirm.
+        power_spec = spec.real.abs().pow(2)
+        mel_spec = self.mel_scale(power_spec)
+        mel_spec = self.amplitude_to_db(mel_spec) # Log10(max(reference value and amin))
+        return mel_spec
+class AudioEncoder(nn.Module):
+    """Convolutional audio front end: mel features, a stride-1 conv, then strided convs."""
+    def __init__(
+        self, n_mels: int, n_ctx: int, audio_dim: int, text_dim: int, num_of_stride_conv: int,
+    ):
+        super().__init__()
+        self.mel_encoder = MelEncoder(n_mels=n_mels)
+        self.conv1 = nn.Conv1d(n_mels, audio_dim, kernel_size=3, padding=1)
+        strided_layers = [
+            nn.Conv1d(audio_dim, audio_dim, kernel_size=3, stride=2, padding=1)
+            for _ in range(num_of_stride_conv)
+        ]
+        self.conv_stack = nn.ModuleList(strided_layers)
+        # No projection to `text_dim` is applied; the positional embedding is built with
+        # `text_dim` channels and added directly to the conv output in `forward`.
+        self.register_buffer("positional_embedding", sinusoids(n_ctx, text_dim))
+    def forward(self, x: Tensor):
+        """
+        x : torch.Tensor, shape = (batch_size, waveform)
+            single channel waveform
+        Returns the conv features with the positional embedding added, time axis second.
+        """
+        hidden = self.mel_encoder(x) # (batch_size, n_mels, n_ctx)
+        hidden = F.gelu(self.conv1(hidden))
+        for layer in self.conv_stack:
+            hidden = F.gelu(layer(hidden))
+        hidden = hidden.permute(0, 2, 1)
+        return (hidden + self.positional_embedding).to(hidden.dtype)

MusicCaps/train_model.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from bart import BartCaptionModel
+from audio_utils import load_audio, STR_CH_FIRST
+import torch
+# Convert the `transfer.pth` checkpoint into a fully pickled model file, `model.pth`.
+# Each step is allowed to fail with its real exception: the previous bare `except:`
+# blocks printed a step number and carried on, so the actual error was lost and the
+# next step failed with an unrelated NameError.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+model = BartCaptionModel(max_length=128)
+# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
+pretrained_object = torch.load('transfer.pth', map_location='cpu')
+state_dict = pretrained_object['state_dict']
+model.load_state_dict(state_dict)
+torch.save(model, "model.pth")

MusicCaps/transfer.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d04e457e045a09c7c5037222eaed3ffe35f8689b3753a2ce6094c5d5792f9bc
+size 1783650705

app.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import streamlit as st
+from timeit import default_timer as timer
+import torch
+import numpy as np
+import pandas as pd
+from huggingface_hub import hf_hub_download
+from MusicCaps.bart import BartCaptionModel
+from MusicCaps.audio_utils import load_audio, STR_CH_FIRST
+from diffusers import StableDiffusionPipeline, I2VGenXLPipeline
+from diffusers.utils import export_to_video, load_image
+import tensorflow as tf
+import torch
+# Enable memory growth for the first GPU TensorFlow reports, if any — presumably so
+# TensorFlow does not reserve the whole GPU that the PyTorch models also use (confirm).
+physical_devices = tf.config.experimental.list_physical_devices('GPU')
+if len(physical_devices) > 0:
+    tf.config.experimental.set_memory_growth(physical_devices[0], True)
+# Device string used for the captioning model input: first CUDA GPU when available, else CPU.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+@st.cache_resource
+def load_text_model():
+    """Build the captioning model, load `MusicCaps/transfer.pth`, and return it in eval mode.
+
+    The model is moved to the module-level `device`, because `captioning` moves the
+    audio tensor there and the weights must live on the same device as the input.
+    The result is cached by Streamlit through `st.cache_resource`.
+    """
+    model = BartCaptionModel(max_length=128)
+    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
+    pretrained_object = torch.load('MusicCaps/transfer.pth', map_location='cpu')
+    state_dict = pretrained_object['state_dict']
+    model.load_state_dict(state_dict)
+    if torch.cuda.is_available():
+        torch.cuda.set_device(device)
+    # set_device only selects the current CUDA device; the weights must be moved explicitly.
+    model.to(device)
+    model.eval()
+    return model
+def get_audio(audio_path, duration=10, target_sr=16000):
+    """Load an audio file and cut it into fixed-length float32 chunks.
+
+    The audio is loaded as mono at `target_sr`, zero-padded up to one chunk when it is
+    shorter than `duration` seconds, and any trailing partial chunk is dropped.
+    Returns a tensor with one row per chunk of `duration * target_sr` samples.
+    """
+    chunk_len = int(duration * target_sr)
+    waveform, _ = load_audio(
+        path=audio_path,
+        ch_format=STR_CH_FIRST,
+        sample_rate=target_sr,
+        downmix_to_mono=True,
+    )
+    if waveform.ndim == 2:
+        waveform = waveform.mean(0, False)  # to mono
+    if waveform.shape[-1] < chunk_len:  # pad sequence
+        padded = np.zeros(chunk_len)
+        padded[: waveform.shape[-1]] = waveform
+        waveform = padded
+    n_chunks = int(waveform.shape[-1] // chunk_len)
+    # Keep only whole chunks and lay them out as (n_chunks, chunk_len).
+    chunks = waveform[: n_chunks * chunk_len].reshape(n_chunks, chunk_len)
+    return torch.from_numpy(chunks.astype('float32'))
+def captioning(model,audio_path):
+    """Caption every chunk of an audio file.
+
+    Returns a list with one string per chunk, each made of a bracketed time label
+    (chunk index times ten) followed by the generated caption.
+    """
+    audio_tensor = get_audio(audio_path=audio_path)
+    if device is not None:
+        audio_tensor = audio_tensor.to(device)
+    with torch.no_grad():
+        generated = model.generate(samples=audio_tensor, num_beams=5)
+    segments = []
+    for idx, caption in zip(range(audio_tensor.shape[0]), generated):
+        start, end = idx * 10, (idx + 1) * 10
+        segments.append(f"[{start}:00-{end}:00]\n{caption} \n \n")
+    return segments
+@st.cache_resource
+def load_image_model():
+    """Load Stable Diffusion v1.5 with the project's LoRA weights applied.
+
+    Uses float16 on CUDA when a GPU is available and falls back to float32 on CPU,
+    so the app can still start on a CPU-only host. The result is cached by
+    Streamlit through `st.cache_resource`.
+    """
+    use_cuda = torch.cuda.is_available()
+    pipeline = StableDiffusionPipeline.from_pretrained(
+        "runwayml/stable-diffusion-v1-5",
+        torch_dtype=torch.float16 if use_cuda else torch.float32,
+    ).to("cuda" if use_cuda else "cpu")
+    pipeline.load_lora_weights("LoRA dataset/Weights/pytorch_lora_weights.safetensors", weight_name="pytorch_lora_weights.safetensors")
+    return pipeline
+@st.cache_resource
+def load_video_model():
+    """Load the I2VGen-XL image-to-video pipeline (fp16 variant).
+
+    The pipeline is moved to the GPU when one is available; previously it was left
+    on the CPU even though its weights are float16. The result is cached by
+    Streamlit through `st.cache_resource`.
+    """
+    pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
+    if torch.cuda.is_available():
+        pipeline = pipeline.to("cuda")
+    return pipeline
+A2C_model = load_text_model()
+image_service = load_image_model()
+video_model = load_video_model()
+# Streamlit reruns this script from the top on every widget interaction, so any value
+# that has to survive a button click is kept in st.session_state rather than in a local.
+for _state_key in ("audio_input", "captions", "image", "video"):
+    if _state_key not in st.session_state:
+        st.session_state[_state_key] = None
+st.title("Testing MusicCaps")
+st.session_state.audio_input = st.file_uploader("Insert Your Audio Clips Here",type = ["wav","mp3"], key = "Audio input")
+if st.session_state.audio_input:
+    audio_input = st.session_state.audio_input
+    st.audio(audio_input)
+    if st.button("Generate text prompt"):
+        st.session_state.captions = captioning(A2C_model,audio_input)[0]
+    # Always read the caption back from session state: on the rerun triggered by the
+    # second button the first button is False, so a local set inside its branch
+    # would be undefined there.
+    captions = st.session_state.captions
+    if captions:
+        st.text(captions)
+        if st.button("Generate Image and video from text prompt"):
+            st.session_state.image = image_service(captions).images[0]
+            image = st.session_state.image
+            video = video_model(
+                prompt = captions,
+                image=image,
+                num_inference_steps=50
+            ).frames[0]
+            st.session_state.video = video
+            export_to_video(video, "generated.mp4", fps=7)
+            c1,c2 = st.columns([1,1])
+            with c1:
+                st.image(image)
+            with c2:
+                st.video("generated.mp4")

requirements.txt ADDED Viewed

Binary file (3.71 kB). View file