adityas2410 committed
Commit ab0f9b3 · verified · 1 Parent(s): e2f0b1f

Upload 4 files

Files changed (4)
  1. app.py +28 -0
  2. requirements.txt +7 -0
  3. zs_audio.py +70 -0
  4. zs_image.py +39 -0
app.py ADDED
@@ -0,0 +1,28 @@
+ import gradio as gr
+ from zs_audio import classify_audio
+ from zs_image import classify_image
+
+ audio_interface = gr.Interface(
+     fn=classify_audio,
+     inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+     outputs=gr.Label(),
+     title="Zero-Shot Audio Classification",
+     description="Classify audio into predefined categories without prior training.",
+     allow_flagging="never",
+ )
+
+ image_interface = gr.Interface(
+     fn=classify_image,
+     inputs=gr.Image(type="filepath"),
+     outputs=gr.Label(),
+     title="Zero-Shot Image Classification",
+     description="Classify an image into predefined categories using CLIP.",
+     allow_flagging="never",
+ )
+
+ app = gr.TabbedInterface(
+     [audio_interface, image_interface],
+     ["Audio Classification", "Image Classification"]
+ )
+
+ app.launch()
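Note on running this file: app.launch() serves the tabbed UI on a local port, and a Hugging Face Space that runs app.py will pick it up automatically. For quick iteration on a single tab, each gr.Interface above can also be launched on its own; a minimal sketch, not part of this commit:

    # Hypothetical dev-only variant: swap this in for the TabbedInterface/launch lines above
    # to serve just the audio tab while working on zs_audio.py.
    audio_interface.launch()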
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ transformers
+ datasets
+ pillow
+ torch
+ torchaudio
+ numpy
zs_audio.py ADDED
@@ -0,0 +1,70 @@
+ from datasets import load_dataset, Audio
+ from transformers import pipeline
+ import torchaudio
+ import numpy as np
+
+ # Initialize the zero-shot audio classification pipeline
+ zero_shot_classifier = pipeline(
+     task="zero-shot-audio-classification",
+     model="laion/clap-htsat-unfused"
+ )
+
+ # Define the candidate labels for classification
+ candidate_labels = [
+     "Sound of a dog barking",
+     "Sound of a car driving",
+     "Sound of a person talking",
+     "Sound of a bird singing",
+     "Sound of a plane flying",
+ ]
+
+ # Function to perform inference on a dataset
+ def audio_dataset_inference():
+     # Load a dataset containing different 5-second sound clips
+     dataset = load_dataset("ashraq/esc50", split="train[0:10]")
+
+     # Ensure all audio samples in the dataset have the same sampling rate (48 kHz)
+     dataset = dataset.cast_column("audio", Audio(sampling_rate=48_000))
+
+     # Select the first audio sample from the dataset
+     audio_sample = dataset[0]
+
+     # Perform zero-shot classification on the selected audio sample
+     result = zero_shot_classifier(
+         audio_sample["audio"]["array"],     # Raw audio array from the dataset sample
+         candidate_labels=candidate_labels   # Candidate labels for classification
+     )
+     print(result)
+
+ def classify_audio(audio_file):
+     """
+     Perform zero-shot classification on a single audio file.
+
+     Args:
+         audio_file (str): Path to the audio file to classify.
+
+     Returns:
+         dict: Classification labels and their corresponding scores.
+     """
+     try:
+         # Load the audio file using torchaudio
+         waveform, sample_rate = torchaudio.load(audio_file)
+
+         # Resample the audio to 48 kHz (the rate CLAP expects) if necessary
+         if sample_rate != 48000:
+             resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=48000)
+             waveform = resampler(waveform)
+
+         # Downmix multi-channel audio to mono, then convert to a 1-D NumPy array
+         if waveform.shape[0] > 1:
+             waveform = waveform.mean(dim=0, keepdim=True)
+         audio_array = waveform.squeeze().numpy()
+
+         # Perform zero-shot classification
+         result = zero_shot_classifier(
+             audio_array,                      # Pass the raw audio array
+             candidate_labels=candidate_labels
+         )
+         return {label['label']: label['score'] for label in result}
+     except Exception as e:
+         print(f"Error in classify_audio: {e}")
+         return {"Error": str(e)}
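For reference, the zero-shot-audio-classification pipeline returns a list of {'score', 'label'} dicts, which the comprehension in classify_audio flattens into the label-to-score mapping that gr.Label expects. A minimal sketch of calling it directly, assuming a local file named dog_bark.wav (a placeholder, not shipped with this commit):

    # Hypothetical quick check; "dog_bark.wav" is a placeholder path.
    scores = classify_audio("dog_bark.wav")
    print(scores)                       # e.g. {"Sound of a dog barking": 0.97, ...} (values illustrative)
    print(max(scores, key=scores.get))  # label with the highest score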
zs_image.py ADDED
@@ -0,0 +1,39 @@
+ from transformers import CLIPModel, AutoProcessor
+ from PIL import Image
+ import torch
+
+ model = CLIPModel.from_pretrained(
+     "openai/clip-vit-large-patch14"
+ )
+ processor = AutoProcessor.from_pretrained(
+     "openai/clip-vit-large-patch14"
+ )
+
+ labels = ["a photo of a cat", "a photo of a dog"]
+
+ def classify_image(image_path):
+     """
+     Perform zero-shot classification on a single image.
+
+     Args:
+         image_path (str): Path to the image.
+
+     Returns:
+         dict: Classification probabilities for the image.
+     """
+     # Open the image and force RGB so grayscale/RGBA inputs are handled consistently
+     image = Image.open(image_path).convert("RGB")
+
+     # Preprocess the image and labels
+     inputs = processor(
+         text=labels,
+         images=image,
+         return_tensors="pt",
+         padding=True
+     )
+
+     # Perform inference with the CLIP model (no gradients needed at inference time)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     probs = outputs.logits_per_image.softmax(dim=1)[0]  # Convert logits to probabilities
+
+     # Return results as a dictionary of label -> probability pairs
+     return {labels[i]: probs[i].item() for i in range(len(labels))}
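Here logits_per_image has shape (num_images, num_labels); the softmax over dim=1 normalizes across the candidate labels, and [0] selects the row for the single input image. A minimal sketch of calling the function directly, assuming a local file named cat.jpg (a placeholder, not shipped with this commit):

    # Hypothetical quick check; "cat.jpg" is a placeholder path.
    probs = classify_image("cat.jpg")
    print(probs)  # e.g. {"a photo of a cat": 0.99, "a photo of a dog": 0.01} (values illustrative)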