Commit · 384e020
1 Parent(s): 18989b5

Added files

- .gitignore +3 -0
- Dockerfile +13 -0
- app.py +160 -0
- audio_dataset.py +106 -0
- checkpoint_epoch_16_eer_0.25.pth +3 -0
- checkpoint_epoch_21_eer_0.24.pth +3 -0
- checkpoint_epoch_24_eer_0.23.pth +3 -0
- inference.py +44 -0
- model.py +95 -0
- requirements.txt +6 -0
- static/prediction_plot.png +0 -0
- static/prediction_waveform.png +0 -0
- static/styles.css +121 -0
- templates/index.html +34 -0
- templates/result.html +36 -0
- uploads/RFP_R_24918.wav +0 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+__pycache__
+Fake
+Real
Dockerfile
ADDED
@@ -0,0 +1,13 @@
+FROM python:3.9
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
app.py
ADDED
@@ -0,0 +1,160 @@
+from flask import Flask, request, render_template, redirect, url_for
+import torch
+import torchaudio
+import numpy as np
+import plotly.graph_objs as go
+import os  # Import os for file operations
+from model import BoundaryDetectionModel  # Assuming your model is defined here
+from audio_dataset import pad_audio  # Assuming you have a function to pad audio
+
+app = Flask(__name__)
+
+# Load the pre-trained model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = BoundaryDetectionModel().to(device)
+model.load_state_dict(torch.load("checkpoint_epoch_21_eer_0.24.pth", map_location=device)["model_state_dict"])
+model.eval()
+
+def preprocess_audio(audio_path, sample_rate=16000, target_length=8):
+    waveform, sr = torchaudio.load(audio_path)
+    if sr != sample_rate:
+        waveform = torchaudio.transforms.Resample(sr, sample_rate)(waveform)
+    waveform = pad_audio(waveform, sample_rate, target_length)
+    return waveform.to(device)
+
+def infer_single_audio(audio_tensor):
+    with torch.no_grad():
+        output = model(audio_tensor).squeeze(-1).cpu().numpy()
+        prediction = (output > 0.5).astype(int)  # Binary prediction for fake/real frames
+    return output, prediction
+
+@app.route('/')
+def index():
+    return render_template('index.html')  # HTML page for file upload and results display
+
+@app.route('/predict', methods=['POST'])
+def predict():
+    if 'file' not in request.files:
+        return "No file uploaded", 400
+
+    file = request.files['file']
+    if file.filename == '':
+        return "No selected file", 400
+
+    file_path = "temp_audio.wav"  # Temporary file to store uploaded audio
+    file.save(file_path)
+
+    # Preprocess audio and perform inference
+    audio_tensor = preprocess_audio(file_path)
+    output, prediction = infer_single_audio(audio_tensor)
+
+    # Flatten the prediction array to handle 2D structure
+    prediction_flat = prediction.flatten()
+
+    # Calculate total frames, fake frames, and fake percentage (formatted to 4 decimal places)
+    total_frames = len(prediction_flat)
+    fake_frame_count = int(np.sum(prediction_flat))
+    fake_percentage = round((fake_frame_count / total_frames) * 100, 4)
+    result_type = 'Fake' if fake_frame_count >= 5 else 'Real'
+
+    # Check if audio is classified as real
+    if result_type == 'Real':
+        fake_frame_intervals = "No Frame"  # Set to "No Frame" if audio is real
+    else:
+        # Get precise fake frame timings with start and end times for fake frames
+        fake_frame_intervals = get_fake_frame_intervals(prediction_flat, frame_duration=20)
+
+    # Debug print to check intervals
+    print("Fake Frame Intervals:", fake_frame_intervals)
+
+    # Generate Plotly plot
+    plot_html = plot_fake_frames_waveform(output, prediction_flat, audio_tensor.cpu().numpy(), fake_frame_intervals)
+
+    # Render template with all results and plot
+    return render_template('result.html',
+                           fake_percentage=fake_percentage,
+                           result_type=result_type,
+                           fake_frame_count=fake_frame_count,
+                           total_frames=total_frames,
+                           fake_frame_intervals=fake_frame_intervals,
+                           plot_html=plot_html)
+
+@app.route('/return', methods=['GET'])
+def return_to_index():
+    # Delete temporary files before returning to index
+    try:
+        os.remove("temp_audio.wav")  # Remove the temporary audio file
+        # If you have any other temporary files (like plots), remove them here too.
+        # Example: os.remove("temp_plot.html") if you save plots as HTML files.
+    except OSError as e:
+        print(f"Error deleting temporary files: {e}")
+
+    return redirect(url_for('index'))  # Redirect back to the main page
+
+def get_fake_frame_intervals(prediction, frame_duration=20):
+    """
+    Calculate start and end times in seconds for each consecutive fake frame interval.
+    """
+    intervals = []
+    start_time = None
+
+    for i, is_fake in enumerate(prediction):
+        if is_fake == 1:
+            if start_time is None:
+                start_time = i * (frame_duration / 1000)  # Convert ms to seconds
+        else:
+            if start_time is not None:
+                end_time = i * (frame_duration / 1000)  # End time of fake segment
+                intervals.append((round(start_time, 4), round(end_time, 4)))
+                start_time = None
+
+    # Append last interval if it ended on the last frame
+    if start_time is not None:
+        end_time = len(prediction) * (frame_duration / 1000)  # Final end time calculation
+        intervals.append((round(start_time, 4), round(end_time, 4)))
+
+    return intervals
+
+def plot_fake_frames_waveform(output, prediction_flat, waveform, fake_frame_intervals, frame_duration=20, sample_rate=16000):
+    # Get actual audio duration from waveform for accurate x-axis scaling
+    actual_duration = waveform.shape[1] / sample_rate
+    num_samples = waveform.shape[1]  # Get number of samples from the actual waveform
+    time = np.linspace(0, actual_duration, num_samples)
+
+    # Plotly trace for the waveform with different colors for fake and real frames
+    frame_length = int(sample_rate * frame_duration / 1000)  # Samples per frame
+
+    traces = []
+    for i in range(len(prediction_flat)):
+        start = i * frame_length
+        end = min(start + frame_length, num_samples)  # Ensure we do not exceed the samples
+        color = 'rgba(255,0,0,0.8)' if prediction_flat[i] == 1 else 'rgba(0,128,0,0.5)'
+
+        traces.append(go.Scatter(
+            x=time[start:end],
+            y=waveform[0][start:end],
+            mode='lines',
+            line=dict(color=color),
+            showlegend=False
+        ))
+
+    # Full waveform view to show all fake and real segments
+    min_time, max_time = 0, actual_duration
+
+    # Layout settings for the plot
+    layout = go.Layout(
+        title="Audio Waveform with Fake Frames Highlighted",
+        xaxis=dict(title="Time (seconds)", range=[min_time, max_time]),
+        yaxis=dict(title="Amplitude"),
+        autosize=True,
+        template="plotly_white"
+    )
+
+    fig = go.Figure(data=traces, layout=layout)
+
+    # Convert Plotly figure to HTML
+    plot_html = fig.to_html(full_html=False)
+    return plot_html
+
+if __name__ == '__main__':
+    app.run()
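A minimal sketch of how get_fake_frame_intervals groups consecutive fake frames into time spans (the toy prediction array below is hypothetical; with frame_duration=20 ms, frame index i starts at i * 0.02 s). Note that importing from app also builds the Flask app and loads the checkpoint at import time:

import numpy as np
from app import get_fake_frame_intervals

# Hypothetical frame-level prediction: frames 2-4 and frame 8 flagged as fake
toy_prediction = np.array([0, 0, 1, 1, 1, 0, 0, 0, 1, 0])
print(get_fake_frame_intervals(toy_prediction, frame_duration=20))
# [(0.04, 0.1), (0.16, 0.18)]  -- one (start_s, end_s) tuple per consecutive fake run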
audio_dataset.py
ADDED
@@ -0,0 +1,106 @@
+import os
+import random
+import torch
+from torch.utils.data import Dataset
+import torchaudio
+import numpy as np
+
+# Modify to handle dynamic target duration (8s in this case)
+# def pad_audio(audio, sample_rate=16000, target_duration=8.0):
+#     target_length = int(sample_rate * target_duration)  # Calculate target length for 8 seconds
+#     current_length = audio.shape[1]
+
+#     if current_length < target_length:
+#         padding = target_length - current_length
+#         audio = torch.cat((audio, torch.zeros(audio.shape[0], padding)), dim=1)
+#     else:
+#         audio = audio[:, :target_length]
+
+#     return audio
+def pad_audio(audio, sample_rate=16000, target_duration=7.98):
+    target_length = int(sample_rate * target_duration)  # Target length in samples (7.98 s by default)
+    current_length = audio.shape[1]
+
+    if current_length < target_length:
+        padding = target_length - current_length
+        audio = torch.cat((audio, torch.zeros(audio.shape[0], padding)), dim=1)
+    elif current_length > target_length:
+        # Append a single zero sample if the clip is exactly one sample over the target
+        if current_length - target_length == 1:
+            audio = torch.cat((audio, torch.zeros(audio.shape[0], 1)), dim=1)
+        else:
+            audio = audio[:, :target_length]
+
+    return audio
+
+# Parse labels with 10ms frame intervals for 8-second audio
+def parse_labels(file_path, audio_length, sample_rate, frame_duration=0.010):
+    frames_per_audio = int(audio_length / frame_duration)
+    labels = np.zeros(frames_per_audio, dtype=np.float32)
+
+    with open(file_path, 'r') as f:
+        lines = f.readlines()[1:]  # Skip header
+        for line in lines:
+            start, end, authenticity = line.strip().split('-')
+            start_time = float(start)
+            end_time = float(end)
+
+            if authenticity == 'F':
+                start_frame = int(start_time / frame_duration)
+                end_frame = int(end_time / frame_duration)
+                labels[start_frame:end_frame] = 1
+
+                # Mark 4 closest frames to boundaries
+                for offset in range(1, 5):
+                    if start_frame - offset >= 0:
+                        labels[start_frame - offset] = 1
+                    if end_frame + offset < frames_per_audio:
+                        labels[end_frame + offset] = 1
+
+    return labels
+
+class AudioDataset(Dataset):
+    def __init__(self, audio_files, label_dir, sample_rate=16000, target_length=7.98):
+        self.audio_files = audio_files
+        self.label_dir = label_dir
+        self.sample_rate = sample_rate
+        self.target_length = target_length * sample_rate
+        self.raw_target_length = target_length
+
+    def __len__(self):
+        return len(self.audio_files)
+
+    def __getitem__(self, idx):
+        audio_path = self.audio_files[idx]
+        try:
+            waveform, sr = torchaudio.load(audio_path)
+            waveform = torchaudio.transforms.Resample(sr, self.sample_rate)(waveform)
+            waveform = pad_audio(waveform, self.sample_rate, self.raw_target_length)
+
+            audio_filename = os.path.basename(audio_path).replace(".wav", "")
+            if audio_filename.startswith("RFP_R"):
+                labels = np.zeros(int(self.raw_target_length / 0.010), dtype=np.float32)
+            else:
+                label_path = os.path.join(self.label_dir, f"{audio_filename}.wav_labels.txt")
+                labels = parse_labels(label_path, self.raw_target_length, self.sample_rate).astype(np.float32)
+
+            return waveform, torch.tensor(labels, dtype=torch.float32)
+
+        except (OSError, IOError) as e:
+            print(f"Error opening file {audio_path}: {e}")
+            new_idx = random.randint(0, len(self.audio_files) - 1)
+            return self.__getitem__(new_idx)
+
+
+def get_audio_file_paths(extrinsic_dir, intrinsic_dir, real_dir):
+    extrinsic_files = [os.path.join(extrinsic_dir, f) for f in os.listdir(extrinsic_dir)
+                       if f.endswith(".wav") and not f.startswith("partial_fake")]
+    intrinsic_files = [os.path.join(intrinsic_dir, f) for f in os.listdir(intrinsic_dir)
+                       if f.endswith(".wav") and not f.startswith("partial_fake")]
+    real_files = [os.path.join(real_dir, f) for f in os.listdir(real_dir)
+                  if f.endswith(".wav") and not f.startswith("partial_fake")]
+
+    # Combine extrinsic and real files, keeping only names that start with "extrinsic"
+    audio_files = [f for f in extrinsic_files + real_files
+                   if os.path.basename(f).startswith(("extrinsic"))]
+    return audio_files
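A small sketch of pad_audio's behaviour with the 7.98 s default (the tensors are dummies for illustration): clips shorter than 127,680 samples at 16 kHz are zero-padded and longer clips are truncated to that length.

import torch
from audio_dataset import pad_audio

short_clip = torch.zeros(1, 16000 * 3)   # 3 s of audio -> zero-padded
long_clip = torch.zeros(1, 16000 * 10)   # 10 s of audio -> truncated
print(pad_audio(short_clip).shape)  # torch.Size([1, 127680])
print(pad_audio(long_clip).shape)   # torch.Size([1, 127680])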
checkpoint_epoch_16_eer_0.25.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75f0fc179f4f1bc0074dd874953ac233db9f86b58a0ca97d1e75472fefd29893
+size 55028923
checkpoint_epoch_21_eer_0.24.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2a3294c037664c8bd16cb9f3fefb15b8527538e2c185d66e5f365ad0e5199b0
+size 55028923
checkpoint_epoch_24_eer_0.23.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:194d1b7e6bd18d8e059a833d59cc096f1693034383b9e45043b4dc57196adaa3
+size 55028923
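The three .pth entries are Git LFS pointer files for the trained weights; app.py and inference.py read the "model_state_dict" key from them. A quick way to inspect a checkpoint once it has been pulled via git lfs (a sketch, not part of the repo):

import torch

ckpt = torch.load("checkpoint_epoch_24_eer_0.23.pth", map_location="cpu")
print(list(ckpt.keys()))  # expected to include "model_state_dict"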
inference.py
ADDED
@@ -0,0 +1,44 @@
+import torch
+import torchaudio
+import numpy as np
+from model import BoundaryDetectionModel  # Assume the model definition is in model.py
+from audio_dataset import pad_audio  # Use the provided padding function
+
+
+def load_model(checkpoint_path, device):
+    model = BoundaryDetectionModel().to(device)
+    model.load_state_dict(torch.load(checkpoint_path, map_location=device)["model_state_dict"])
+    model.eval()
+    return model
+
+
+def preprocess_audio(audio_path, sample_rate=16000, target_length=8):
+    waveform, sr = torchaudio.load(audio_path)
+    waveform = torchaudio.transforms.Resample(sr, sample_rate)(waveform)
+    waveform = pad_audio(waveform, sample_rate, target_length)
+    print(waveform.shape)
+    return waveform
+
+def infer_single_audio(model, audio_path, device):
+    audio_tensor = preprocess_audio(audio_path).to(device)
+
+    with torch.no_grad():
+        output = model(audio_tensor).squeeze(-1).cpu().numpy()  # Remove extra dimensions
+        prediction = (output > 0.5).astype(int)  # Round outputs for binary prediction if needed
+    return output, prediction
+
+
+def main_inference(audio_path, checkpoint_path):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = load_model(checkpoint_path, device)
+
+    print(f"Running inference on: {audio_path}")
+    output, prediction = infer_single_audio(model, audio_path, device)
+
+    print(f"Model Output: {output}")
+    print(f"Binary Prediction: {prediction}")
+
+if __name__ == "__main__":
+    audio_path = "Real/RFP_R_24918.wav"  # Path to the audio file for inference
+    checkpoint_path = "checkpoint_epoch_21_eer_0.24.pth"  # Path to the trained model checkpoint
+    main_inference(audio_path, checkpoint_path)
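The binary prediction returned by infer_single_audio can be summarised the same way app.py does, by counting fake frames and applying its five-frame threshold. A sketch, assuming the checkpoint and a local wav file are available:

import numpy as np
import torch
from inference import load_model, infer_single_audio

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_model("checkpoint_epoch_21_eer_0.24.pth", device)
output, prediction = infer_single_audio(model, "Real/RFP_R_24918.wav", device)  # path from the script above

fake_frames = int(np.sum(prediction))
print(f"{fake_frames} fake frames ->", "Fake" if fake_frames >= 5 else "Real")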
model.py
ADDED
@@ -0,0 +1,95 @@
+import torch
+import torch.nn as nn
+import torchaudio
+from torchaudio.transforms import MelSpectrogram
+
+class FeatureExtractor(nn.Module):
+    def __init__(self, n_mels=13, sample_rate=16000, frame_size_ms=20):
+        super(FeatureExtractor, self).__init__()
+        self.mel_spec = MelSpectrogram(
+            sample_rate=sample_rate,
+            n_mels=n_mels,
+            win_length=int(sample_rate * frame_size_ms / 2000),
+            hop_length=int(sample_rate * frame_size_ms / 2000),
+            normalized=True
+        )
+
+    def forward(self, audio):
+        # Convert to Mel spectrogram
+        mel_features = self.mel_spec(audio)
+        # Transpose to (batch_size, sequence_length, n_mels); FrameLevelEmbedding transposes back for Conv1d
+        mel_features = mel_features.transpose(1, 2)
+        return mel_features
+
+
+# FrameLevelEmbedding and FrameLevelClassifier remain the same
+class FrameLevelEmbedding(nn.Module):
+    def __init__(self):
+        super(FrameLevelEmbedding, self).__init__()
+        self.cnn1 = nn.Conv1d(in_channels=13, out_channels=512, kernel_size=5, padding=2)
+        self.res_blocks = nn.Sequential(*[ResBlock(512) for _ in range(6)])
+        self.cnn2 = nn.Conv1d(in_channels=512, out_channels=240, kernel_size=1)
+
+    def forward(self, x):
+        x = x.transpose(1, 2)  # (batch_size, seq_len, features) -> (batch_size, features, seq_len)
+        x = self.cnn1(x)
+        x = self.res_blocks(x)
+        x = self.cnn2(x)
+        x = x.transpose(1, 2)  # (batch_size, features, seq_len) -> (batch_size, seq_len, features)
+        return x
+
+# Keep the other parts of the model unchanged (e.g., ResBlock, FrameLevelClassifier, BoundaryDetectionModel)
+class ResBlock(nn.Module):
+    def __init__(self, channels):
+        super(ResBlock, self).__init__()
+        self.conv1 = nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=1, bias=False)
+        self.conv2 = nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm1d(channels)
+        self.bn2 = nn.BatchNorm1d(channels)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        identity = x
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        out += identity
+        return self.relu(out)
+
+class FrameLevelClassifier(nn.Module):
+    def __init__(self):
+        super(FrameLevelClassifier, self).__init__()
+        self.transformer = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(d_model=240, nhead=4, dim_feedforward=1024), num_layers=2
+        )
+        self.bilstm = nn.LSTM(input_size=240, hidden_size=128, num_layers=2, bidirectional=True, batch_first=True)
+        self.fc = nn.Linear(256, 1)  # Bidirectional LSTM -> 2 * hidden_size
+
+    def forward(self, x):
+        # x = self.transformer(x)
+        x, _ = self.bilstm(x)
+        x = self.fc(x)
+        return torch.sigmoid(x)
+
+
+class BoundaryDetectionModel(nn.Module):
+    def __init__(self):
+        super(BoundaryDetectionModel, self).__init__()
+        self.feature_extractor = FeatureExtractor()
+        self.frame_embedding = FrameLevelEmbedding()
+        self.classifier = FrameLevelClassifier()
+
+    def forward(self, audio):
+        features = self.feature_extractor(audio)
+        embeddings = self.frame_embedding(features)
+        output = self.classifier(embeddings)
+        return output
+
+
+# model = BoundaryDetectionModel()
+# audio, sr = torchaudio.load("new_files/Extrinsic_Partial_Fakes/extrinsic_partial_fake_RFP_R_00001.wav")
+# if sr != 16000:
+#     resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
+#     audio = resampler(audio)
+# # audio = audio.mean(dim=0).unsqueeze(0)  # Convert to mono and add batch dimension
+# output = model(audio)
+# print(output.squeeze(2).shape)
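A quick shape check for BoundaryDetectionModel, in the spirit of the commented-out snippet above but using a random tensor instead of a real file: a mono batch of 7.98 s at 16 kHz (127,680 samples) yields one sigmoid score per mel frame.

import torch
from model import BoundaryDetectionModel

model = BoundaryDetectionModel()
model.eval()

dummy = torch.randn(1, int(16000 * 7.98))  # (batch, samples) mono waveform
with torch.no_grad():
    out = model(dummy)
print(out.shape)  # (batch, n_frames, 1) -- one score per ~10 ms hop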
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+torch
+torchaudio
+librosa
+flask
+gunicorn
+uvicorn
static/prediction_plot.png
ADDED
Binary file
static/prediction_waveform.png
ADDED
Binary file
static/styles.css
ADDED
@@ -0,0 +1,121 @@
+/* General Reset */
+* {
+  margin: 0;
+  padding: 0;
+  box-sizing: border-box;
+}
+
+body {
+  font-family: Arial, sans-serif;
+  color: #333;
+  background-color: #f9f9f9;
+  display: flex;
+  justify-content: center;
+  align-items: center;
+  min-height: 100vh;
+  height: 100%;
+}
+
+.container {
+  width: 100%;
+  height: 100vh;
+  display: grid;
+  place-items: center;
+  gap: 0;
+  padding: 20px;
+  background-color: #fff;
+  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+  border-radius: 8px;
+  text-align: center;
+  box-sizing: border-box; /* Ensure padding doesn't affect width */
+}
+
+.title {
+  font-size: 2em;
+  color: #333;
+  margin-bottom: 1rem;
+}
+
+.upload-form {
+  display: flex;
+  flex-direction: column;
+  gap: 1rem;
+  margin-bottom: 2rem;
+}
+
+.file-label {
+  font-size: 1.1em;
+  color: #555;
+}
+
+.file-input {
+  padding: 8px;
+  border-radius: 4px;
+  border: 1px solid #ccc;
+}
+
+.submit-button {
+  padding: 10px 20px;
+  font-size: 1em;
+  font-weight: bold;
+  color: #fff;
+  background-color: #4caf50;
+  border: none;
+  border-radius: 5px;
+  cursor: pointer;
+  transition: background-color 0.3s ease;
+}
+
+.submit-button:hover {
+  background-color: #45a049;
+}
+
+.result-section {
+  margin-top: 1.5rem;
+  padding: 1.5rem;
+  background-color: #f1f1f1;
+  border-radius: 8px;
+  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+}
+
+.result-title {
+  font-size: 1.5em;
+  margin-bottom: 1rem;
+  color: #333;
+}
+
+.result-text {
+  font-size: 1.1em;
+  color: #666;
+  margin: 0.5rem 0;
+}
+
+.result-image {
+  margin-top: 1rem;
+  max-width: 100%;
+  height: auto;
+  border-radius: 8px;
+  box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+}
+
+.return-button {
+  padding: 10px 20px;
+  font-size: 1em;
+  font-weight: bold;
+  color: #fff;
+  background-color: #4caf50;
+  border: none;
+  border-radius: 5px;
+  cursor: pointer;
+  transition: background-color 0.3s ease;
+  text-decoration: none;
+}
+
+.intervals-list {
+  margin-bottom: 2rem;
+}
+
+ul {
+  list-style-type: none;
+  padding-left: 0; /* Optional: removes default left padding */
+}
templates/index.html
ADDED
@@ -0,0 +1,34 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <title>Audio Boundary Detection</title>
+    <link
+      rel="stylesheet"
+      href="{{ url_for('static', filename='styles.css') }}"
+    />
+  </head>
+  <body class="page">
+    <div class="container">
+      <div>
+        <h1 class="title">Audio Boundary Detection</h1>
+        <form
+          action="/predict"
+          method="post"
+          enctype="multipart/form-data"
+          class="upload-form"
+        >
+          <label for="file" class="file-label">Upload an audio file:</label>
+          <input
+            type="file"
+            name="file"
+            accept=".wav"
+            required
+            class="file-input"
+          />
+          <button type="submit" class="submit-button">Analyze</button>
+        </form>
+      </div>
+    </div>
+  </body>
+</html>
templates/result.html
ADDED
@@ -0,0 +1,36 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <title>Prediction Results</title>
+    <link
+      rel="stylesheet"
+      href="{{ url_for('static', filename='styles.css') }}"
+    />
+  </head>
+  <body class="page">
+    <div class="container">
+      <h1 class="title">Prediction Results</h1>
+      <p class="result-text">Fake Percentage: {{ fake_percentage }}%</p>
+      <p class="result-text">Result Type: {{ result_type }}</p>
+      <p class="result-text">Fake Frame Count: {{ fake_frame_count }}</p>
+      <div class="plot-container">
+        {{ plot_html|safe }}
+        <!-- Embed Plotly plot here -->
+      </div>
+      <div class="intervals-list">
+        <h2>Fake Frame Intervals:</h2>
+        {% if fake_frame_intervals == "No Frame" %}
+        <p>No Frame</p>
+        {% else %}
+        <ul>
+          {% for start, end in fake_frame_intervals %}
+          <li>{{ start }}s - {{ end }}s</li>
+          {% endfor %}
+        </ul>
+        {% endif %}
+      </div>
+      <a href="/return" class="return-button">Analyze Another File</a>
+    </div>
+  </body>
+</html>
uploads/RFP_R_24918.wav
ADDED
Binary file (320 kB)