# Source: Hugging Face Space "bpiyush/SoundOfWater" (status: Running; file size: 10,414 bytes)

import os
import sys
sys.path.append("../")

import gradio as gr
import torch
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "serif"
import decord
import PIL, PIL.Image
import librosa
from IPython.display import Markdown, display
import pandas as pd

from util import *


# Inline stylesheet that is prepended to the `header` and `footer` HTML
# snippets below, so the classes used there (.header, .content, .title,
# .authors, .affiliations, .buttons, .button, ...) are styled.
# NOTE: `.affiliations` previously used `font-size: 1.em;`, which is not a
# valid CSS number (a digit is required after the dot); it is now `1em`.
css = """
<style>
    body {
        font-family: 'Arial', serif;
        margin: 0;
        padding: 0;
        color: black;
    }
    .header {
        display: flex;
        align-items: center;
        justify-content: center;
        margin-top: 5px;
        color: black;
    }
    .footer {
        display: flex;
        align-items: center;
        justify-content: center;
        margin-top: 5px;
    }
    .image {
        margin-right: 20px;
    }
    .content {
        text-align: center;
        color: black;
    }
    .title {
        font-size: 2.5em;
        font-weight: bold;
        margin-bottom: 10px;
    }
    .authors {
        color: #4a90e2;
        font-size: 1.05em;
        margin: 10px 0;
    }
    .affiliations {
        font-size: 1em;
        margin-bottom: 20px;
    }
    .buttons {
        display: flex;
        justify-content: center;
        gap: 10px;
    }
    .button {
        background-color: #545758;
        text-decoration: none;
        padding: 8px 16px;
        border-radius: 5px;
        font-size: 1.05em;
    }
    .button:hover {
        background-color: #333;
    }
</style>
"""


# Page header (title, authors, affiliations and link buttons), rendered with
# `gr.HTML(header)`. The stylesheet in `css` is prepended so the classes used
# here are styled.
# NOTE: each author link needs the `style` attribute closed before `href`;
# without the closing quote the `href` is swallowed into the style value and
# the link is dead.
header = css + """
<div class="header">
    <!-- <div class="image">
        <img src="./media_assets/pouring-water-logo5.png" alt="logo" width="100">
    </div> -->
    <div class="content">
        <img src="https://bpiyush.github.io/pouring-water-website/assets/pouring-water-logo5.png" alt="logo" width="80" style="margin-bottom: -50px; margin-right: 30px;">
        <div class="title" style="font-size: 44px; margin-left: -30px;">The Sound of Water</div>
        <div style="font-size: 30px; margin-left: -30px;"><b>Inferring Physical Properties from Pouring Liquids</b></div>
        <div class="authors">
            <a style="color: #92eaff;" href="https://bpiyush.github.io/">Piyush Bagad</a><sup>1</sup>,
            <a style="color: #92eaff;" href="https://makarandtapaswi.github.io/">Makarand Tapaswi</a><sup>2</sup>,
            <a style="color: #92eaff;" href="https://www.ceessnoek.info/">Cees G. M. Snoek</a><sup>3</sup>,
            <a style="color: #92eaff;" href="https://www.robots.ox.ac.uk/~az/">Andrew Zisserman</a><sup>1</sup>
        </div>
        <div class="affiliations">
            <sup>1</sup>University of Oxford, <sup>2</sup>IIIT Hyderabad, <sup>3</sup>University of Amsterdam
        </div>
        
        <div class="buttons">
            <a href="#" style="color: #92eaff;" class="button">arXiv</a>
            <a href="https://bpiyush.github.io/pouring-water-website/" style="color: #92eaff;" class="button">🌐 Project</a>
            <a href="https://github.com/bpiyush/SoundOfWater" style="color: #92eaff;" class="button"> <img src="https://bpiyush.github.io/pouring-water-website/assets/github-logo.png" alt="logo" style="height:16px; float: left;"> &nbsp;Code</a>
            <a href="https://huggingface.co/datasets/bpiyush/sound-of-water" style="color: #92eaff;" class="button">🤗 Data</a>
            <a href="https://huggingface.co/bpiyush/sound-of-water-models" style="color: #92eaff;" class="button">🤗 Models</a>
            <a href="#" style="color: #92eaff;" class="button">🎯 Demo</a>
        </div>
    </div>
</div>
"""

# Page footer with usage tips, rendered with `gr.HTML(footer)` after the
# interface. The stylesheet in `css` is prepended here as well; the outer
# <div> reuses the `.header` flex class but overrides it to left-justify.
footer = css + """
<div class="header" style="justify-content: left;">
<div class="content" style="font-size: 16px;">
Please give us a 🌟 on <a href='https://github.com/bpiyush/SoundOfWater'>Github</a> if you like our work!
Tips to get better results:
<br><br>
<ol style="text-align: left; font-size: 14px; margin-left: 30px">
    <li>The first example may take up to 30-60s for processing since the model is also loaded.</li>
    <li>
    If you are providing a link, it may take a few seconds to download video from YouTube.
    Note that the entire video shall be used.
    If the sound of pouring is not clear, the results will be random.
    </li>
    <li>Although the model is somewhat robust to noise, make sure there is not too much noise such that the pouring is audible.</li>
    <li>Note that the video is not used during the inference. The displayed frame is only for reference.</li>
</ol>
</div>
</div>
"""


def download_from_youtube(
        video_id,
        save_dir="/tmp/",
        convert_to_mp4=False,
    ):
    """
    Downloads a complete YouTube video with `yt-dlp` and returns its path.

    Args:
        video_id (str): YouTube video ID.
        save_dir (str): Directory to save the video.
        convert_to_mp4 (bool): Whether to convert the video to mp4 format
            (with `ffmpeg`) when the downloaded container is not mp4.

    Returns:
        str: Path of the saved file, `{save_dir}/{video_id}.{ext}`. The
            extension is whatever container `yt-dlp` produced, or `mp4`
            when `convert_to_mp4` is set.

    Raises:
        RuntimeError: If `yt-dlp` / `ffmpeg` cannot be started, if no
            downloaded file is found, or if the mp4 conversion fails.
    """
    import subprocess
    from glob import escape, glob

    def run_tool(argv):
        # Run an external tool from an argument list (no shell), so paths
        # containing spaces or shell metacharacters are passed verbatim.
        try:
            return subprocess.run(argv, check=False).returncode
        except FileNotFoundError as err:
            raise RuntimeError(
                f"Could not find the '{argv[0]}' executable."
            ) from err

    print("Downloading video from YouTube...")
    print("Video ID:", video_id)

    # Build the output template with os.path.join so it agrees with the
    # lookup below even when `save_dir` has no trailing slash.
    exit_code = run_tool([
        "yt-dlp",
        "-o", os.path.join(save_dir, "%(id)s.%(ext)s"),
        "--cookies", "./chrome_cookies.txt",
        "--verbose",
        "--force-overwrites",
        f"https://www.youtube.com/watch?v={video_id}",
    ])
    if exit_code != 0:
        print("yt-dlp exited with code:", exit_code)

    # The extension is chosen by yt-dlp, so look the file up by its stem.
    # escape() keeps glob metacharacters in the inputs literal; sorted()
    # makes the choice deterministic if several files share the stem.
    pattern = os.path.join(escape(save_dir), f"{escape(video_id)}.*")
    matches = sorted(glob(pattern))
    if not matches:
        raise RuntimeError(
            f"Download failed: no file found for video '{video_id}' "
            f"in '{save_dir}' (yt-dlp exit code {exit_code})."
        )
    saved_filepath = matches[0]
    print("Saved file:", saved_filepath)

    if not convert_to_mp4:
        return saved_filepath

    # Swap only the final extension; a plain str.replace would also rewrite
    # the same text anywhere else in the path.
    stem, ext = os.path.splitext(saved_filepath)
    if ext == ".mp4":
        return saved_filepath

    to_save = stem + ".mp4"
    exit_code = run_tool(["ffmpeg", "-y", "-i", saved_filepath, to_save])
    if exit_code != 0 or not os.path.exists(to_save):
        raise RuntimeError(
            f"ffmpeg failed to convert '{saved_filepath}' to mp4 "
            f"(exit code {exit_code})."
        )
    return to_save


def configure_input():
    """
    Creates the input widgets of the demo.

    Emits a short Markdown instruction line, then builds the video upload
    widget and the YouTube link textbox.

    Returns:
        list: `[video_input, youtube_link]`, in the argument order expected
            by `process_input`.
    """
    instructions = (
        "#### Either upload a video file or provide a YouTube link to a video. "
        "Note that the entire video shall be used."
    )
    gr.Markdown(instructions)

    # Components are created in display order: video first, then the link.
    widgets = [
        gr.Video(label="Upload Video", height=520),
        gr.Textbox(label="YouTube Link", value=None),
    ]
    return widgets


# Backend handed to `load_frame` when reading the reference frame.
# video_backend = "decord"
video_backend = "torchvision"


def get_predictions(video_path):
    """
    Runs the full prediction pipeline on a single video file.

    All helpers called here (`load_model`, `load_frame`, `load_spectrogram`,
    `load_audio_tensor`, `get_model_output`, `show_output`) are not defined
    in this file; presumably they come from `from util import *`.

    Args:
        video_path (str): Path to the video file to analyse.

    Returns:
        tuple: `(image, df_show, tsne_image)` as produced by `show_output`.
    """
    net = load_model()

    # Inputs derived from the file: one frame, a spectrogram, and the audio.
    reference_frame = load_frame(video_path, video_backend=video_backend)
    spectrogram = load_spectrogram(video_path)
    waveform = load_audio_tensor(video_path)

    # Only the audio is passed to the model.
    z_audio, y_audio = get_model_output(waveform, net)

    pitch_image, properties_df, tsne_image = show_output(
        reference_frame, spectrogram, y_audio, z_audio,
    )
    return pitch_image, properties_df, tsne_image


def get_video_id_from_url(url):
    """
    Extracts the YouTube video ID from a URL.

    Supported forms are `...watch?v=<id>` (the `v` query parameter),
    `youtu.be/<id>` and `.../shorts/<id>`. The forms are tried in that
    order and the first one that yields an ID wins.

    Args:
        url (str): YouTube URL.

    Returns:
        str: The video ID.

    Raises:
        ValueError: If `url` is not a string or no video ID can be found.
    """
    import re

    if not isinstance(url, str):
        raise ValueError("Invalid YouTube URL")

    id_group = r"([a-zA-Z0-9_-]+)"
    patterns = (
        # The lookbehind requires `v` to be a whole parameter name, so
        # e.g. `rv=...` or `nav=...` is not mistaken for the video ID.
        r"(?<![a-zA-Z0-9_-])v=" + id_group,
        # Escaped dot: match the literal host `youtu.be` only.
        r"youtu\.be/" + id_group,
        r"shorts/" + id_group,
    )

    print("Video URL:", url)
    # Try every form in turn rather than committing to the first keyword
    # that merely appears in the URL.
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            video_id = match.group(1)
            print("Video ID:", video_id)
            return video_id

    raise ValueError("Invalid YouTube URL")


# Markdown note shown next to the predictions. It is a raw string so the
# LaTeX backslashes (`\frac`, `\lambda`, `\ `, ...) are taken literally;
# in a normal string `\l` and `\ ` are invalid escape sequences.
note = r"""
**Note**: Radius (as well as height) estimation depends on accurate wavelength estimation towards the end.
Thus, it may not be accurate if the wavelength is not estimated correctly at the end.

$$
H = l(0) = \frac{\lambda(0) - \lambda(T)}{4} \ \ \text{and} \ \ R = \frac{\lambda(T)}{4\beta}
$$
"""


# Callback wired into the Gradio interface
def process_input(video, youtube_link):
    """
    Runs the demo on either an uploaded video or a YouTube link.

    Exactly one of the two inputs must be provided. A link is downloaded to
    `/tmp/` first; the resulting file is then handled like an upload.

    Args:
        video (str | None): Path of the uploaded video file, or None.
        youtube_link (str | None): YouTube URL; None, empty or
            whitespace-only means "not provided".

    Returns:
        tuple: `(image, df_show, markdown, tsne_image)`, where the first,
            second and last items come from `get_predictions` and
            `markdown` is a `gr.Markdown` holding the note (plus a link to
            the original video when a YouTube link was used).

    Raises:
        ValueError: If `youtube_link` is neither None nor a string, if both
            inputs are provided, or if neither is provided.
    """
    provided_video = video is not None

    if youtube_link is None:
        provided_link = False
    elif isinstance(youtube_link, str):
        # A whitespace-only textbox value counts as "no link".
        youtube_link = youtube_link.strip()
        provided_link = len(youtube_link) > 0
    else:
        raise ValueError(f"Invalid type of link {youtube_link}.")

    if provided_video and provided_link:
        raise ValueError("Please provide either a video file or a YouTube link, not both.")

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not provided_video and not provided_link:
        raise ValueError("YouTube Link cannot be empty if no video is provided.")

    if provided_video:
        print(video)

        # The input is a video file path
        video_path = video
        local_note = note
    else:
        video_id = get_video_id_from_url(youtube_link)
        video_path = download_from_youtube(
            video_id, save_dir="/tmp/", convert_to_mp4=False,
        )

        # Add youtube link to the note
        local_note = f"{note}\n\nYou can watch the original video here: "\
            f"[YouTube Link](https://www.youtube.com/watch?v={video_id})"

    # Get predictions
    image, df_show, tsne_image = get_predictions(video_path)

    return image, df_show, gr.Markdown(local_note), tsne_image


def configure_outputs():
    """
    Creates the output widgets of the demo.

    Returns:
        list: `[image, dataframe, markdown, tsne_image]`, matching the order
            of the values returned by `process_input`.
    """
    # Creation order is kept as: pitch image, table, t-SNE image, note.
    pitch_plot = gr.Image(label="Estimated pitch")
    properties_table = gr.DataFrame(label="Estimated physical properties")
    tsne_plot = gr.Image(label="TSNE of features", width=300)
    note_box = gr.Markdown(label="Note")

    # The returned order differs from creation order: the note comes third.
    return [pitch_plot, properties_table, note_box, tsne_plot]


# Pre-defined examples: one `[video, youtube_link]` row per bundled clip in
# ./media_assets/. The link slot is always None for these.
examples = [
    [f"./media_assets/{clip_name}", None]
    for clip_name in (
        "example_video.mp4",
        "ayNzH0uygFw_9.0_21.0.mp4",
        "biDn0Gi6V8U_7.0_15.0.mp4",
        "goWgiQQMugA_2.5_9.0.mp4",
        "K87g4RvO-9k_254.0_259.0.mp4",
        # Shows that it works with background noise
        "l74zJHCZ9uA.webm",
        # Shows that it works with a slightly differently shaped container
        "LpRPV0hIymU.webm",
        "k-HnMsS36J8.webm",
    )
    # Link-based examples, currently disabled:
    # [None, "https://www.youtube.com/shorts/6eUQTdkTooo"],
    # [None, "https://www.youtube.com/shorts/VxZT15cG6tw"],
    # [None, "https://www.youtube.com/shorts/GSXQnNhliDY"],
]


# Define Gradio interface
# Layout, top to bottom: header HTML, the input/output interface, footer HTML.
# NOTE(review): `custom_css` is not defined in this file; presumably it is
# provided by `from util import *` — confirm. The `css` string defined in
# this file is only used inside the header/footer HTML.
with gr.Blocks(
    css=custom_css,
    theme=gr.themes.Default(),
) as demo:

    # Add the header
    gr.HTML(header)
    
    # `process_input` receives the two inputs from `configure_input()` and
    # its four return values fill the components from `configure_outputs()`.
    gr.Interface(
        fn=process_input,
        inputs=configure_input(),
        outputs=configure_outputs(),
        examples=examples,
    )

    # Add the footer
    gr.HTML(footer)


# Launch the interface
# This runs at import time (there is no `__main__` guard).
demo.launch(allowed_paths=["."], share=True)