adityas2410 committed
Commit ab0f9b3 · verified · 1 Parent(s): e2f0b1f

Upload 4 files

Files changed (4)
  1. app.py +28 -0
  2. requirements.txt +7 -0
  3. zs_audio.py +70 -0
  4. zs_image.py +39 -0
app.py ADDED
@@ -0,0 +1,28 @@
+ import gradio as gr
+ from zs_audio import classify_audio
+ from zs_image import classify_image
+
+ audio_interface = gr.Interface(
+     fn=classify_audio,
+     inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+     outputs=gr.Label(),
+     title="Zero-Shot Audio Classification",
+     description="Classify audio into predefined categories without prior training.",
+     allow_flagging="never",
+ )
+
+ image_interface = gr.Interface(
+     fn=classify_image,
+     inputs=gr.Image(type="filepath"),
+     outputs=gr.Label(),
+     title="Zero-Shot Image Classification",
+     description="Classify an image into predefined categories using CLIP.",
+     allow_flagging="never",
+ )
+
+ app = gr.TabbedInterface(
+     [audio_interface, image_interface],
+     ["Audio Classification", "Image Classification"]
+ )
+
+ app.launch()
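Note on running this file: app.launch() serves the tabbed UI on a local port, and a Hugging Face Space that runs app.py will pick it up automatically. For quick iteration on a single tab, each gr.Interface above can also be launched on its own; a minimal sketch, not part of this commit:

    # Hypothetical dev-only variant: swap this in for the TabbedInterface/launch lines above
    # to serve just the audio tab while working on zs_audio.py.
    audio_interface.launch()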
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ transformers
+ datasets
+ pillow
+ torch
+ torchaudio
+ numpy
zs_audio.py ADDED
@@ -0,0 +1,70 @@
+ from datasets import load_dataset, Audio
+ from transformers import pipeline
+ import torchaudio
+ import numpy as np
+
+ # Initialize the zero-shot audio classification pipeline
+ zero_shot_classifier = pipeline(
+     task="zero-shot-audio-classification",
+     model="laion/clap-htsat-unfused"
+ )
+
+ # Define the candidate labels for classification
+ candidate_labels = [
+     "Sound of a dog barking",
+     "Sound of a car driving",
+     "Sound of a person talking",
+     "Sound of a bird singing",
+     "Sound of a plane flying",
+ ]
+
+ # Function to perform inference on a dataset
+ def audio_dataset_inference():
+     # Load a dataset containing different 5-second sound clips
+     dataset = load_dataset("ashraq/esc50", split="train[0:10]")
+
+     # Ensure all audio samples in the dataset have the same sampling rate (48 kHz)
+     dataset = dataset.cast_column("audio", Audio(sampling_rate=48_000))
+
+     # Select the first audio sample from the dataset
+     audio_sample = dataset[0]
+
+     # Perform zero-shot classification on the selected audio sample
+     result = zero_shot_classifier(
+         audio_sample["audio"]["array"],     # Raw audio array from the dataset sample
+         candidate_labels=candidate_labels   # Candidate labels for classification
+     )
+     print(result)
+
+ def classify_audio(audio_file):
+     """
+     Perform zero-shot classification on a single audio file.
+
+     Args:
+         audio_file (str): Path to the audio file to classify.
+
+     Returns:
+         dict: Classification labels and their corresponding scores.
+     """
+     try:
+         # Load the audio file using torchaudio
+         waveform, sample_rate = torchaudio.load(audio_file)
+
+         # Resample the audio to 48 kHz (the rate CLAP expects) if necessary
+         if sample_rate != 48000:
+             resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=48000)
+             waveform = resampler(waveform)
+
+         # Downmix multi-channel audio to mono, then convert to a 1-D NumPy array
+         if waveform.shape[0] > 1:
+             waveform = waveform.mean(dim=0, keepdim=True)
+         audio_array = waveform.squeeze().numpy()
+
+         # Perform zero-shot classification
+         result = zero_shot_classifier(
+             audio_array,                      # Pass the raw audio array
+             candidate_labels=candidate_labels
+         )
+         return {label['label']: label['score'] for label in result}
+     except Exception as e:
+         print(f"Error in classify_audio: {e}")
+         return {"Error": str(e)}
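For reference, the zero-shot-audio-classification pipeline returns a list of {'score', 'label'} dicts, which the comprehension in classify_audio flattens into the label-to-score mapping that gr.Label expects. A minimal sketch of calling it directly, assuming a local file named dog_bark.wav (a placeholder, not shipped with this commit):

    # Hypothetical quick check; "dog_bark.wav" is a placeholder path.
    scores = classify_audio("dog_bark.wav")
    print(scores)                       # e.g. {"Sound of a dog barking": 0.97, ...} (values illustrative)
    print(max(scores, key=scores.get))  # label with the highest score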
zs_image.py ADDED
@@ -0,0 +1,39 @@
+ from transformers import CLIPModel, AutoProcessor
+ from PIL import Image
+ import torch
+
+ model = CLIPModel.from_pretrained(
+     "openai/clip-vit-large-patch14"
+ )
+ processor = AutoProcessor.from_pretrained(
+     "openai/clip-vit-large-patch14"
+ )
+
+ labels = ["a photo of a cat", "a photo of a dog"]
+
+ def classify_image(image_path):
+     """
+     Perform zero-shot classification on a single image.
+
+     Args:
+         image_path (str): Path to the image.
+
+     Returns:
+         dict: Classification probabilities for the image.
+     """
+     # Open the image and force RGB so grayscale/RGBA inputs are handled consistently
+     image = Image.open(image_path).convert("RGB")
+
+     # Preprocess the image and labels
+     inputs = processor(
+         text=labels,
+         images=image,
+         return_tensors="pt",
+         padding=True
+     )
+
+     # Perform inference with the CLIP model (no gradients needed at inference time)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     probs = outputs.logits_per_image.softmax(dim=1)[0]  # Convert logits to probabilities
+
+     # Return results as a dictionary of label -> probability pairs
+     return {labels[i]: probs[i].item() for i in range(len(labels))}
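Here logits_per_image has shape (num_images, num_labels); the softmax over dim=1 normalizes across the candidate labels, and [0] selects the row for the single input image. A minimal sketch of calling the function directly, assuming a local file named cat.jpg (a placeholder, not shipped with this commit):

    # Hypothetical quick check; "cat.jpg" is a placeholder path.
    probs = classify_image("cat.jpg")
    print(probs)  # e.g. {"a photo of a cat": 0.99, "a photo of a dog": 0.01} (values illustrative)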