rafaaa2105 committed
Commit f418fac · verified · 1 Parent(s): 8590d97

Update app.py

Files changed (1)
  1. app.py +56 -19
app.py CHANGED
@@ -2,6 +2,10 @@ import gradio as gr
 from pyannote.audio import Pipeline
 import torch
 import os
+import numpy as np
+from pydub import AudioSegment
+import io
+import zipfile
 
 hf_token = os.getenv("HF_TOKEN")
 
@@ -9,26 +13,59 @@ hf_token = os.getenv("HF_TOKEN")
 pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=hf_token)
 pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
-def diarize(audio):
-    diarization = pipeline({"waveform": audio, "sample_rate": 16000})
-    speaker1_segments = []
-    speaker2_segments = []
-    for segment, _, speaker in diarization.itertracks(yield_label=True):
-        if speaker == 'SPEAKER_1':
-            speaker1_segments.append((segment.start, segment.end))
-        elif speaker == 'SPEAKER_2':
-            speaker2_segments.append((segment.start, segment.end))
-    return speaker1_segments, speaker2_segments
+def diarize_and_split(audio, sr):
+    # Convert to mono if stereo
+    if len(audio.shape) > 1:
+        audio = np.mean(audio, axis=1)
+
+    # Perform diarization
+    diarization = pipeline({"waveform": torch.from_numpy(audio), "sample_rate": sr})
+
+    # Create an AudioSegment from the numpy array
+    audio_segment = AudioSegment(
+        audio.tobytes(),
+        frame_rate=sr,
+        sample_width=audio.dtype.itemsize,
+        channels=1
+    )
+
+    speaker_segments = {}
+
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        start_ms = int(turn.start * 1000)
+        end_ms = int(turn.end * 1000)
+        segment = audio_segment[start_ms:end_ms]
+
+        if speaker not in speaker_segments:
+            speaker_segments[speaker] = []
+        speaker_segments[speaker].append(segment)
+
+    # Create zip files for each speaker
+    zip_files = {}
+    for speaker, segments in speaker_segments.items():
+        zip_buffer = io.BytesIO()
+        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+            for i, segment in enumerate(segments):
+                segment_buffer = io.BytesIO()
+                segment.export(segment_buffer, format="wav")
+                zip_file.writestr(f"{speaker}_segment_{i}.wav", segment_buffer.getvalue())
+
+        zip_buffer.seek(0)
+        zip_files[f"{speaker}.zip"] = zip_buffer.getvalue()
+
+    return zip_files
 
-interface = gr.Interface(
-    fn=diarize,
+def process_audio(audio):
+    sr, audio_data = audio
+    zip_files = diarize_and_split(audio_data, sr)
+    return list(zip_files.values())
+
+iface = gr.Interface(
+    fn=process_audio,
     inputs=gr.Audio(type="numpy"),
-    outputs=[
-        gr.Textbox(label="Speaker 1 Segments (start, end)"),
-        gr.Textbox(label="Speaker 2 Segments (start, end)")
-    ],
-    title="Speaker Diarization",
-    description="Upload an audio file and get the segments where each speaker talks."
+    outputs=[gr.File(label="Speaker Zip Files") for _ in range(10)],  # Assuming max 10 speakers
+    title="Speaker Diarization and Audio Splitting",
+    description="Upload an audio file to split it into separate files for each speaker."
 )
 
-interface.launch()
+iface.launch()
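
For reference, a minimal standalone sketch of running the same pyannote pipeline outside Gradio, assuming a hypothetical sample.wav test file and an HF_TOKEN variable in the environment:

import os
import torch
import torchaudio
from pyannote.audio import Pipeline

# Load the same diarization pipeline used in app.py (the token must grant access to the model).
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("HF_TOKEN"),
)
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# pyannote accepts a file path or an in-memory dict; the waveform must be a
# float tensor of shape (channel, num_samples), which torchaudio.load returns.
waveform, sr = torchaudio.load("sample.wav")  # hypothetical test file
diarization = pipeline({"waveform": waveform, "sample_rate": sr})

# Print each speaker turn, mirroring the itertracks loop in the updated app.py.
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{speaker}: {turn.start:.2f}s - {turn.end:.2f}s")

In the Space itself, gr.Audio(type="numpy") hands process_audio a (sample_rate, numpy array) tuple instead, which is why the new wrapper unpacks it before calling diarize_and_split.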