SoulAbi committed on
Commit
9d42a5f
1 Parent(s): e102373

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py CHANGED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Upload an audio file through the Colab file picker; `path` becomes the
# name of the first (and only expected) uploaded file.
from google.colab import files
uploaded = files.upload()
path = next(iter(uploaded))

# --- Notebook form parameters -------------------------------------------
num_speakers = 2 #@param {type:"integer"}

language = 'English' #@param ['any', 'English']

model_size = 'large' #@param ['tiny', 'base', 'small', 'medium', 'large']


# Whisper ships English-only variants ("<size>.en") for every size EXCEPT
# 'large' — there is no 'large.en' checkpoint.  The original excluded
# 'tiny' instead, which both skipped the existing 'tiny.en' model and
# produced the nonexistent 'large.en' for English + large.
model_name = model_size
if language == 'English' and model_size != 'large':
    model_name += '.en'
# Install Whisper (speech recognition) and pyannote.audio (speaker
# diarization tooling) straight from GitHub.  NOTE: the leading "!" is
# IPython/Colab shell magic — these lines only run inside a notebook,
# not as a plain Python script.
!pip install -q git+https://github.com/openai/whisper.git > /dev/null
!pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null

import whisper
import datetime

import subprocess

import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Pretrained speaker-embedding model: maps a speech excerpt to a vector so
# segments by the same speaker cluster together.
# NOTE(review): device is hard-coded to CUDA — this cell fails on a
# CPU-only runtime; confirm a GPU runtime is always used.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda"))

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np
# Whisper/pyannote and the `wave` reader below need WAV input; convert
# anything else with ffmpeg.  Match the suffix case-insensitively and
# require the actual ".wav" extension — the original compared only the
# last three characters, so "clip.WAV" was needlessly re-encoded and a
# file named e.g. "mywav" skipped conversion entirely.
if not path.lower().endswith('.wav'):
    subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
    path = 'audio.wav'

# NOTE(review): `model_name` (the English-only variant chosen above) is
# never used — the generic `model_size` checkpoint is loaded instead.
# Confirm which checkpoint is intended before changing this.
model = whisper.load_model(model_size)

# Transcribe the whole file; `segments` is Whisper's list of dicts with
# "start", "end" and "text" keys.
result = model.transcribe(path)
segments = result["segments"]

# Total audio duration in seconds, used later to clamp segment end times.
with contextlib.closing(wave.open(path, 'r')) as f:
    frames = f.getnframes()
    rate = f.getframerate()
    duration = frames / float(rate)

audio = Audio()
def segment_embedding(segment):
    """Return the speaker-embedding vector for one Whisper segment.

    Crops the audio at `path` to the segment's time span and feeds the
    excerpt to the pretrained speaker-embedding model.
    """
    seg_start = segment["start"]
    # Whisper can report an end time slightly past EOF; clamp to the file.
    seg_end = min(duration, segment["end"])
    waveform, sample_rate = audio.crop(path, Segment(seg_start, seg_end))
    # Prepend a batch dimension before calling the embedding model.
    return embedding_model(waveform[None])
# One 192-dimensional ECAPA embedding per transcription segment.
embeddings = np.zeros(shape=(len(segments), 192))
for i, segment in enumerate(segments):
    embeddings[i] = segment_embedding(segment)

# Guard against NaNs from degenerate (e.g. near-silent) segments so the
# clustering below does not blow up.
embeddings = np.nan_to_num(embeddings)

# Group the embeddings into `num_speakers` clusters and tag each segment
# with a 1-based human-readable label ("SPEAKER 1", "SPEAKER 2", ...).
clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
labels = clustering.labels_
for label, segment in zip(labels, segments):
    segment["speaker"] = 'SPEAKER ' + str(label + 1)
def time(secs):
    """Render a second count as an H:MM:SS timestamp for the transcript.

    NOTE: this shadows the stdlib `time` module name; the script never
    imports that module, so it is harmless here.
    """
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
# Write the transcript, inserting a "SPEAKER n  H:MM:SS" header each time
# the speaker changes.  A context manager guarantees the file is closed
# even if a write raises (the original bare open/close leaked the handle
# on error); encoding is pinned so output is platform-independent.
with open("transcript.txt", "w", encoding="utf-8") as f:
    for i, segment in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
        # Whisper segment texts start with a space; drop that first character.
        f.write(segment["text"][1:] + ' ')
97
+ # with open('transcript.txt', 'r') as file:
98
+ # text = file.read()
99
+ # words = text.split()
100
+ # i = words.index('name')
101
+ # if (words[i-1] == 'My') or (words[i-1] == 'my') and (words[i+1] == 'is'):
102
+ # name1 = words[i+2]
103
+ # print(name1)
104
+
105
+ # with open('transcript.txt', 'r') as file:
106
+ # text = file.read()
107
+ # new_text = text.replace('SPEAKER 1', name1)
108
+ # with open('transcript.txt', 'w') as file:
109
+ # file.write(new_text)
110
+
111
+
112
+
113
+ # with open('transcript.txt', 'r') as file:
114
+
115
+ # text = file.read()
116
+ # words = text.split()
117
+ # i = words.index('name')
118
+ # if (words[i+3] == 'What') or (1<2) and (words[i+1] == 'is') or 1<2:
119
+ # name2 = words[i+22]
120
+ # print(name2)
121
+ # with open('transcript.txt', 'r') as file:
122
+ # text = file.read()
123
+ # new_text = text.replace('SPEAKER 2', name2)
124
+ # with open('transcript.txt', 'w') as file:
125
+ # file.write(new_text)
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+