adityas2410
committed on
Upload 4 files
- app.py +28 -0
- requirements.txt +7 -0
- zs_audio.py +70 -0
- zs_image.py +39 -0
app.py
ADDED
@@ -0,0 +1,28 @@
import gradio as gr
from zs_audio import classify_audio
from zs_image import classify_image

audio_interface = gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(),
    title="Zero-Shot Audio Classification",
    description="Classify audio into predefined categories without prior training.",
    allow_flagging="never",
)

image_interface = gr.Interface(
    fn=classify_image,
    inputs=gr.Image(type="filepath"),
    outputs=gr.Label(),
    title="Zero-Shot Image Classification",
    description="Classify an image into predefined categories using CLIP.",
    allow_flagging="never",
)

app = gr.TabbedInterface(
    [audio_interface, image_interface],
    ["Audio Classification", "Image Classification"]
)

app.launch()
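Usage note (not part of the commit): with the packages from requirements.txt below installed, the demo launches locally via

python app.py

Gradio then prints a local URL serving the two tabs defined above.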
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio
transformers
datasets
pillow
torch
torchaudio
numpy
zs_audio.py
ADDED
@@ -0,0 +1,70 @@
from datasets import load_dataset, Audio
from transformers import pipeline
import torchaudio
import numpy as np

# Initialize the zero-shot audio classification pipeline
zero_shot_classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused"
)

# Define the candidate labels for classification
candidate_labels = [
    "Sound of a dog barking",
    "Sound of car driving",
    "Sound of a person talking",
    "Sound of a bird singing",
    "Sound of a plane flying",
]

# Function to perform inference on a dataset
def audio_dataset_inference():
    # Load a dataset containing different 5-second sound clips
    dataset = load_dataset("ashraq/esc50", split="train[0:10]")

    # Ensure all audio samples in the dataset have the same sampling rate (48kHz)
    dataset = dataset.cast_column("audio", Audio(sampling_rate=48_000))

    # Select the first audio sample from the dataset
    audio_sample = dataset[0]

    # Perform zero-shot classification on the selected audio sample
    result = zero_shot_classifier(
        audio_sample["audio"]["array"],  # Extract the audio array from the dataset sample
        candidate_labels=candidate_labels  # Pass the candidate labels for classification
    )
    print(result)

def classify_audio(audio_file):
    """
    Perform zero-shot classification on a single audio file.

    Args:
        audio_file (str): Path to the audio file to classify.

    Returns:
        dict: Classification labels and their corresponding scores.
    """
    try:
        # Load audio file using torchaudio
        waveform, sample_rate = torchaudio.load(audio_file)

        # Resample audio to 48kHz (if necessary)
        if sample_rate != 48000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=48000)
            waveform = resampler(waveform)

        # Convert waveform to NumPy array
        audio_array = waveform.squeeze().numpy()

        # Perform zero-shot classification
        result = zero_shot_classifier(
            audio_array,  # Pass the audio array
            candidate_labels=candidate_labels
        )
        return {label['label']: label['score'] for label in result}
    except Exception as e:
        print(f"Error in classify_audio: {e}")
        return {"Error": str(e)}
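Quick check (not part of the upload): a minimal sketch for calling classify_audio directly, assuming a short clip named sample.wav sits in the working directory; the filename is a placeholder.

from zs_audio import classify_audio

# classify_audio returns a {label: score} dict (or {"Error": ...} if loading fails).
scores = classify_audio("sample.wav")  # placeholder path; any format torchaudio can read
for label, score in sorted(scores.items(), key=lambda item: item[1], reverse=True):
    print(label, score)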
zs_image.py
ADDED
@@ -0,0 +1,39 @@
from transformers import CLIPModel, AutoProcessor
from PIL import Image

model = CLIPModel.from_pretrained(
    "openai/clip-vit-large-patch14"
)
processor = AutoProcessor.from_pretrained(
    "openai/clip-vit-large-patch14"
)

labels = ["a photo of a cat", "a photo of a dog"]

def classify_image(image_path):
    """
    Perform zero-shot classification on a single image.

    Args:
        image_path (str): Path to the image.

    Returns:
        dict: Classification probabilities for the image.
    """

    image = Image.open(image_path)  # Open the image

    # Preprocess the image and labels
    inputs = processor(
        text=labels,
        images=image,
        return_tensors="pt",
        padding=True
    )

    # Perform inference using the CLIP model
    outputs = model(**inputs)
    probs = outputs.logits_per_image.softmax(dim=1)[0]  # Calculate probabilities

    # Return results as a dictionary with label and probability pairs
    return {labels[i]: probs[i].item() for i in range(len(labels))}
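Similarly, a quick sanity check for classify_image (not part of the commit); the image path is a placeholder and can be any picture Pillow can open.

from zs_image import classify_image

# classify_image returns a {label: probability} dict over the two candidate prompts.
probs = classify_image("cat.jpg")  # placeholder path
print(probs)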