HoneyTian committed on
Commit
382cf0c
1 Parent(s): ccd188f
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ .git/
3
+ .idea/
4
+
5
+ data/
6
+ dotenv/
7
+ logs/
8
+ **/__pycache__/
9
+
10
+ **/*.wav
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image that builds the vad_go Go package and serves it through main.py.
FROM golang:1.18

WORKDIR /data/GolandProjects/vad_go

COPY . /data/GolandProjects/vad_go

# Refresh the package index and install in a single layer so a cached,
# outdated index is never paired with a fresh install; drop the index afterwards.
RUN apt-get update \
    && apt-get install -y python3-pip \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir --upgrade -r /data/GolandProjects/vad_go/requirements.txt

RUN bash build_vad_go.sh

USER root

RUN chmod -R 777 .

CMD ["python3", "main.py"]
build_vad_go.sh ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Build the vad_go package into ./build.

# Stop at the first failing command instead of carrying on with a broken build.
set -euo pipefail

# -p: do not fail when the directory is already there (repeated builds).
mkdir -p build

go build -o build vad_go
data/examples/b07ae20f-247d-4e96-9c32-4ea27addcd79.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b59f9910c50776eb704ead2360fdf3a0330da3cf693073575a12c800f6316a9a
3
+ size 78284
dsp/audio/wav.go ADDED
@@ -0,0 +1,1010 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package audio
2
+
3
+ import (
4
+ "bytes"
5
+ "encoding/binary"
6
+ "errors"
7
+ "fmt"
8
+ "io"
9
+ "math"
10
+ "os"
11
+ "path/filepath"
12
+
13
+ )
14
+
15
+ /*
16
+ -----RIFF-----
17
+ RIFF 4 标头字母
18
+ CHUNK_SIZE 4 整个RIFF文件的长度(不包含RIFF和CHUNK_SIZE这两个字段的长度)
19
+ FORMAT 4 格式,WAVE代表是wav文件,WAVE格式要求带有标头为fmt和data的子chunk
20
+ -----FMT -----
21
+ SUB_CHUNK_ID 4 子chunk的标头字母,此处为"fmt "(注意,fmt后面是带一个空格的),其相当于wav的属性字段
22
+ SUB_CHUNK_SIZE 4 此subchunk的长度(不包含SUB_CHUNK_ID和SUB_CHUNK_SIZE两个字段的长度)
23
+ AUDIO_FORMAT 2 音频格式,pcm为1
24
+ NUM_CHANNELS 2 声道数量,理论上可以n声道,一般我们用单声道mono,或者双声道stereo(双声道也叫立体声)
25
+ SAMPLE_RATE 4 采样率,每秒采样多少次,通常都有固定的采样选择(8000, 11025,12000,16000,22050,24000,32000,44100,48000)
26
+ BYTE_RATE 4 码率,即每秒播放多少byte数据,计算公式=SAMPLE_RATE*NUM_CHANNELS*BITS_PER_SAMPLE/8(不明白为何需要这个字段)
27
+ BLOCK_ALIGN 2 块对其,其值=BITS_PER_SAMPLE*NUM_CHANNELS/8
28
+ BITS_PER_SAMPLE 2 每个采样多少bit,通常为8,16,32(为8时候代表的是uint8,16代表的是int16,32代表float32)
29
+ -----DATA-----
30
+ SUB_CHUNK_ID 4 子chunk的标头字母,此处为"data"
31
+ SUB_CHUNK_SIZE 4 此subchunk的长度(不包含SUB_CHUNK_ID和SUB_CHUNK_SIZE两个字段的长度)
32
+ DATA pcm的数据
33
+ -------------
34
+ */
35
+
36
// MaxChannelNum is the maximum number of channels a sample can hold
// (only 2 are allowed here).
const MaxChannelNum = 2

// Channel indexes; presumably meant to be passed as the channel argument of
// the per-channel helpers in this package.
const (
	LeftChannel = iota
	RightChannel
)

// tag is a four-byte chunk identifier.
type tag [4]byte

// Chunk identifiers recognised by this package.
var (
	tagRIFF = tag{'R', 'I', 'F', 'F'} // "RIFF"
	tagWAVE = tag{'W', 'A', 'V', 'E'} // "WAVE"
	tagFmt  = tag{'f', 'm', 't', ' '} // "fmt " (note the trailing space)
	tagData = tag{'d', 'a', 't', 'a'} // "data"
)
54
+
55
+ //WavHeaderType wav子部头结构
56
+ type WavHeaderType struct {
57
+ ID tag
58
+ Size uint32
59
+ }
60
+
61
+ //String 打印
62
+ func (wavHeader *WavHeaderType) String() string {
63
+ return fmt.Sprintf("ID=%s,Size=%d", string(wavHeader.ID[:]), wavHeader.Size)
64
+ }
65
+
66
+ //头部size
67
+ var (
68
+ sizeHeader = binary.Size(WavHeaderType{})
69
+ )
70
+
71
+ //chunkLoc ...
72
+ type chunkLoc struct {
73
+ pos int64
74
+ size int64
75
+ }
76
+
77
+ //RiffType ...
78
+ type RiffType struct {
79
+ WavHeaderType
80
+ Fmt tag
81
+ }
82
+
83
+ //String ...
84
+ func (riff *RiffType) String() string {
85
+ return fmt.Sprintf("ID=%s,Size=%d,Fmt=%s", string(riff.ID[:]), riff.Size, string(riff.Fmt[:]))
86
+ }
87
+
88
+ //WavFmtType wav格式结构(头部)
89
+ type WavFmtType struct {
90
+ WavHeaderType
91
+ AudioFormat uint16
92
+ Channels uint16
93
+ SampleRate uint32
94
+ BytesPerSec uint32
95
+ BytesPerBlock uint16
96
+ BitsPerSample uint16
97
+ }
98
+
99
+ //String ...
100
+ func (wavFmt *WavFmtType) String() string {
101
+ return fmt.Sprintf(
102
+ "ID=%s,Size=%d,AudioFormat=%d,Channels=%d,SampleRate=%d,BytesPerSec=%d,BytesPerBlock=%d,BitsPerSample=%d",
103
+ string(wavFmt.ID[:]), wavFmt.Size, wavFmt.AudioFormat, wavFmt.Channels, wavFmt.SampleRate,
104
+ wavFmt.BytesPerSec, wavFmt.BytesPerBlock, wavFmt.BitsPerSample)
105
+ }
106
+
107
+ //SampleType 采样结构
108
+ type SampleType struct {
109
+ val8s [MaxChannelNum]uint8
110
+ val16s [MaxChannelNum]int16
111
+ val32s [MaxChannelNum]float32
112
+ }
113
+
114
+ //WavDataType wav整体结构(头部+采样数据结构)
115
+ type WavDataType struct {
116
+ WavHeaderType
117
+ Sample []SampleType
118
+ }
119
+
120
+ //String ...
121
+ func (wavData *WavDataType) String() string {
122
+ blockNum := len(wavData.Sample)
123
+ return fmt.Sprintf("ID=%s,Size=%d,BlockNum=%d", string(wavData.ID[:]), wavData.Size, blockNum)
124
+ }
125
+
126
+ //WavInfoType wav操作实例
127
+ type WavInfoType struct {
128
+ Riff RiffType
129
+ Fmt WavFmtType
130
+ Data WavDataType
131
+
132
+ //create info
133
+ createMs int64
134
+ }
135
+
136
+ //String ...
137
+ func (wavInfo *WavInfoType) String() string {
138
+ f := &wavInfo.Fmt
139
+ blockNum := len(wavInfo.Data.Sample)
140
+ return fmt.Sprintf("SampleRate=%d,BitsPerSample=%d,Channels=%d,BlockNum=%d",
141
+ f.SampleRate, f.BitsPerSample, f.Channels, blockNum)
142
+
143
+ }
144
+
145
+ //SetCreateTs ...
146
+ func (wavInfo *WavInfoType) SetCreateTs(timestampMs int64) {
147
+ wavInfo.createMs = timestampMs
148
+ }
149
+
150
+ //CopyFormat 复制头部结构
151
+ func (wavInfo *WavInfoType) CopyFormat(w *WavInfoType) (err error) {
152
+ wavInfo.Riff.ID = w.Riff.ID
153
+ wavInfo.Riff.Size = w.Riff.Size
154
+ wavInfo.Riff.Fmt = w.Riff.Fmt
155
+ wavInfo.Fmt.ID = w.Fmt.ID
156
+ wavInfo.Fmt.Size = w.Fmt.Size
157
+ wavInfo.Fmt.AudioFormat = w.Fmt.AudioFormat
158
+ wavInfo.Fmt.Channels = w.Fmt.Channels
159
+ wavInfo.Fmt.SampleRate = w.Fmt.SampleRate
160
+ wavInfo.Fmt.BytesPerSec = w.Fmt.BytesPerSec
161
+ wavInfo.Fmt.BytesPerBlock = w.Fmt.BytesPerBlock
162
+ wavInfo.Fmt.BitsPerSample = w.Fmt.BitsPerSample
163
+ wavInfo.Data.ID = w.Data.ID
164
+ wavInfo.Data.Size = w.Data.Size
165
+ return
166
+ }
167
+
168
+ //ParseFromFile 从文件中导入
169
+ func (wavInfo *WavInfoType) ParseFromFile(absFile string) (err error) {
170
+ absFile, err = filepath.Abs(absFile) //#nosec
171
+ if err != nil {
172
+ return err
173
+ }
174
+ fileHandler, err := os.Open(absFile) //#nosec
175
+ if err != nil {
176
+ return err
177
+ }
178
+ defer fileHandler.Close()
179
+
180
+ _, err = fileHandler.Seek(0, os.SEEK_SET)
181
+ if err != nil {
182
+ return err
183
+ }
184
+
185
+ var pos int64
186
+ var ch WavHeaderType
187
+
188
+ //-----------------------------------------------------
189
+ // RIFF header
190
+ err = binary.Read(fileHandler, binary.LittleEndian, &wavInfo.Riff)
191
+ if err != nil {
192
+ return err
193
+ }
194
+ pos += int64(sizeHeader) + int64(len(tagWAVE))
195
+ if wavInfo.Riff.ID != tagRIFF {
196
+ return errors.New("File Format Not Riff")
197
+ }
198
+ if wavInfo.Riff.Fmt != tagWAVE {
199
+ return errors.New("File Format Not Wave")
200
+ }
201
+ fileSize := int64(sizeHeader) + int64(wavInfo.Riff.Size)
202
+ _ = fileSize
203
+
204
+ //r := &wavInfo.Riff
205
+
206
+ //-----------------------------------------------------
207
+ // read all chunks
208
+
209
+ var chunks = make(map[tag]*chunkLoc)
210
+
211
+ for {
212
+ err = binary.Read(fileHandler, binary.LittleEndian, &ch)
213
+ if err != nil {
214
+ if err == io.EOF {
215
+ break
216
+ }
217
+ return err
218
+ }
219
+ pos += int64(sizeHeader)
220
+
221
+ loc := chunkLoc{
222
+ pos: pos,
223
+ size: int64(ch.Size),
224
+ }
225
+
226
+ _, err = fileHandler.Seek(loc.size, os.SEEK_CUR)
227
+ if err != nil {
228
+ return err
229
+ }
230
+ pos += loc.size // chunk data
231
+
232
+ chunks[ch.ID] = &loc
233
+ }
234
+
235
+ // check fileHandler size
236
+ if pos != fileSize {
237
+ return errors.New("pos != fileSize")
238
+ }
239
+
240
+ //-----------------------------------------------------
241
+ // chunk fmt_
242
+ loc, ok := chunks[tagFmt]
243
+ if !ok {
244
+ return errors.New("wav: has not chunk \"fmt \"")
245
+ }
246
+ _, err = fileHandler.Seek(loc.pos-int64(sizeHeader), os.SEEK_SET)
247
+ if err != nil {
248
+ return err
249
+ }
250
+ err = binary.Read(fileHandler, binary.LittleEndian, &wavInfo.Fmt)
251
+ if err != nil {
252
+ return err
253
+ }
254
+
255
+ //-----------------------------------------------------
256
+ // chunk data
257
+ loc, ok = chunks[tagData]
258
+ if !ok {
259
+ return errors.New("wav: has not chunk \"data\"")
260
+ }
261
+ _, err = fileHandler.Seek(loc.pos, os.SEEK_SET)
262
+ if err != nil {
263
+ return err
264
+ }
265
+
266
+ channel := wavInfo.Fmt.Channels
267
+ bytePerSample := wavInfo.Fmt.BitsPerSample / 8
268
+ blockSize := channel * bytePerSample
269
+
270
+ wavInfo.Data.ID = tagData
271
+ wavInfo.Data.Size = uint32(loc.size)
272
+
273
+ blockNum := wavInfo.Data.Size / uint32(blockSize)
274
+ wavInfo.Data.Sample = make([]SampleType, blockNum)
275
+
276
+ _, err = fileHandler.Seek(loc.pos, os.SEEK_SET)
277
+ if err != nil {
278
+ return err
279
+ }
280
+ blockIdx := 0
281
+ for i := 0; i < int(wavInfo.Data.Size); i += int(blockSize) {
282
+ sample := &wavInfo.Data.Sample[blockIdx]
283
+ blockIdx++
284
+ for c := 0; c < int(channel); c++ {
285
+ switch bytePerSample {
286
+ case 1:
287
+ var val uint8
288
+ err = binary.Read(fileHandler, binary.LittleEndian, &val)
289
+ //sample.val8s = append(sample.val8s,val)
290
+ sample.val8s[c] = val
291
+ case 2:
292
+ var val int16
293
+ err = binary.Read(fileHandler, binary.LittleEndian, &val)
294
+ //sample.val16s = append(sample.val16s,val)
295
+ sample.val16s[c] = val
296
+ //fmt.Printf("pos=%d,val=%d\n",i,sample.val16s)
297
+ case 4:
298
+ var val float32
299
+ err = binary.Read(fileHandler, binary.LittleEndian, &val)
300
+ //sample.val32s = append(sample.val32s,val)
301
+ sample.val32s[c] = val
302
+
303
+ }
304
+ if err != nil {
305
+ return err
306
+ }
307
+ }
308
+ //wavInfo.Data.Sample = append(wavInfo.Data.Sample,sample)
309
+ }
310
+
311
+ //for i:=0;i<len(wavInfo.Data.Sample);i++{
312
+ // sample := &wavInfo.Data.Sample[i]
313
+ // fmt.Printf("idx=%d,left=%d,right=%d\n",i,sample.val16s[0],sample.val16s[1])
314
+ //}
315
+
316
+ for id, chunk := range chunks {
317
+ if id != tagData && id != tagFmt {
318
+ //util.MainLogger.Error(fmt.Sprintf("unkonw chunk=%s,size=%d\n", string(id[:]), chunk.size))
319
+ fmt.Printf("unkonw chunk=%s,size=%d\n", string(id[:]), chunk.size)
320
+ }
321
+ }
322
+ return
323
+ }
324
+
325
+ //ParseFromBuffer 从buffer中导入
326
+ func (wavInfo *WavInfoType) ParseFromBuffer(buffer []byte) (err error) {
327
+ var ch WavHeaderType
328
+
329
+ bufferReader := bytes.NewBuffer(buffer)
330
+ //-----------------------------------------------------
331
+ // RIFF header
332
+ err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Riff)
333
+ if err != nil {
334
+ return err
335
+ }
336
+
337
+ //pos := int64(sizeHeader) + int64(len(tagWAVE))
338
+ if wavInfo.Riff.ID != tagRIFF {
339
+ return errors.New("File Format Not Riff")
340
+ }
341
+ if wavInfo.Riff.Fmt != tagWAVE {
342
+ return errors.New("File Format Not Wave")
343
+ }
344
+ fileSize := int64(sizeHeader) + int64(wavInfo.Riff.Size)
345
+ _ = fileSize
346
+
347
+ //r := &wavInfo.Riff
348
+
349
+ //-----------------------------------------------------
350
+ // read all chunks
351
+ for {
352
+ err = binary.Read(bufferReader, binary.LittleEndian, &ch)
353
+ if err != nil {
354
+ if err == io.EOF {
355
+ //文件读取结束
356
+ err = nil
357
+ return
358
+ }
359
+ return err
360
+ }
361
+
362
+ //fmt格式的chunk
363
+ if ch.ID == tagFmt {
364
+ wavInfo.Fmt.WavHeaderType = ch
365
+ err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.AudioFormat)
366
+ err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.Channels)
367
+ err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.SampleRate)
368
+ err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.BytesPerSec)
369
+ err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.BytesPerBlock)
370
+ err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.BitsPerSample)
371
+ if err != nil {
372
+ return err
373
+ }
374
+ continue
375
+ }
376
+
377
+ //data格式的chunk
378
+ if ch.ID == tagData {
379
+ channel := wavInfo.Fmt.Channels
380
+ bytePerSample := wavInfo.Fmt.BitsPerSample / 8
381
+ blockSize := channel * bytePerSample
382
+
383
+ wavInfo.Data.ID = tagData
384
+ wavInfo.Data.Size = ch.Size
385
+
386
+ blockNum := wavInfo.Data.Size / uint32(blockSize)
387
+ wavInfo.Data.Sample = make([]SampleType, blockNum)
388
+
389
+ blockIdx := 0
390
+ for i := 0; i < int(wavInfo.Data.Size); i += int(blockSize) {
391
+ sample := &wavInfo.Data.Sample[blockIdx]
392
+ blockIdx++
393
+ for c := 0; c < int(channel); c++ {
394
+ switch bytePerSample {
395
+ case 1:
396
+ var val uint8
397
+ err = binary.Read(bufferReader, binary.LittleEndian, &val)
398
+ //sample.val8s = append(sample.val8s,val)
399
+ sample.val8s[c] = val
400
+ case 2:
401
+ var val int16
402
+ err = binary.Read(bufferReader, binary.LittleEndian, &val)
403
+ //sample.val16s = append(sample.val16s,val)
404
+ sample.val16s[c] = val
405
+ //fmt.Printf("pos=%d,val=%d\n",i,sample.val16s)
406
+ case 4:
407
+ var val float32
408
+ err = binary.Read(bufferReader, binary.LittleEndian, &val)
409
+ //sample.val32s = append(sample.val32s,val)
410
+ sample.val32s[c] = val
411
+
412
+ }
413
+ if err != nil {
414
+ return err
415
+ }
416
+ }
417
+ //wavInfo.Data.Sample = append(wavInfo.Data.Sample,sample)
418
+ }
419
+ continue
420
+ }
421
+
422
+ //其他格式的chunk
423
+ byteData := make([]byte, ch.Size)
424
+ err = binary.Read(bufferReader, binary.LittleEndian, byteData)
425
+ }
426
+ return
427
+ }
428
+
429
+ //SaveToFile 保存到文件中
430
+ func (wavInfo *WavInfoType) SaveToFile(absFile string) (err error) {
431
+ file, err := os.Create(absFile)
432
+ if err != nil {
433
+ return
434
+ }
435
+ defer file.Close()
436
+ _, err = file.Seek(0, os.SEEK_SET)
437
+ if err != nil {
438
+ return
439
+ }
440
+
441
+ //calc size
442
+ wavInfo.Data.Size = (uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample) / 8) *
443
+ uint32(len(wavInfo.Data.Sample))
444
+ wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
445
+ wavInfo.Riff.Size = uint32(len(wavInfo.Riff.Fmt)) +
446
+ uint32(sizeHeader) +
447
+ wavInfo.Fmt.Size +
448
+ uint32(sizeHeader) +
449
+ wavInfo.Data.Size
450
+
451
+ //----------------------------------------
452
+ // RIFF header
453
+ err = binary.Write(file, binary.LittleEndian, wavInfo.Riff)
454
+ if err != nil {
455
+ return err
456
+ }
457
+
458
+ //----------------------------------------
459
+ // chunk fmt_
460
+ wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
461
+ err = binary.Write(file, binary.LittleEndian, wavInfo.Fmt)
462
+ if err != nil {
463
+ return err
464
+ }
465
+
466
+ //----------------------------------------
467
+ // chunk data
468
+ ch := WavHeaderType{
469
+ ID: tagData,
470
+ Size: wavInfo.Data.Size,
471
+ }
472
+ err = binary.Write(file, binary.LittleEndian, ch)
473
+ if err != nil {
474
+ return err
475
+ }
476
+
477
+ channel := wavInfo.Fmt.Channels
478
+ bytePerSample := wavInfo.Fmt.BitsPerSample / 8
479
+ //blockSize := channel*bytePerSample
480
+
481
+ for i := 0; i < len(wavInfo.Data.Sample); i++ {
482
+ sample := &wavInfo.Data.Sample[i]
483
+ switch bytePerSample {
484
+ case 1:
485
+ for c := 0; c < int(channel); c++ {
486
+ err = binary.Write(file, binary.LittleEndian, sample.val8s[c])
487
+ if err != nil {
488
+ return
489
+ }
490
+
491
+ }
492
+ case 2:
493
+ for c := 0; c < int(channel); c++ {
494
+ err = binary.Write(file, binary.LittleEndian, sample.val16s[c])
495
+ if err != nil {
496
+ return
497
+ }
498
+ }
499
+ case 4:
500
+ for c := 0; c < int(channel); c++ {
501
+ err = binary.Write(file, binary.LittleEndian, sample.val32s[c])
502
+ if err != nil {
503
+ return
504
+ }
505
+ }
506
+ }
507
+ }
508
+ return
509
+ }
510
+
511
+ //SaveToBuffer 保存到buffer中
512
+ func (wavInfo *WavInfoType) SaveToBuffer() (buffer []byte, err error) {
513
+
514
+ //var bufferWriter bytes.Buffer
515
+ buffer = make([]byte, 0)
516
+ bufferWriter := bytes.NewBuffer(buffer)
517
+
518
+ //calc size
519
+ wavInfo.Data.Size = (uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample) / 8) *
520
+ uint32(len(wavInfo.Data.Sample))
521
+ wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
522
+ wavInfo.Riff.Size = uint32(len(wavInfo.Riff.Fmt)) +
523
+ uint32(sizeHeader) +
524
+ wavInfo.Fmt.Size +
525
+ uint32(sizeHeader) +
526
+ wavInfo.Data.Size
527
+
528
+ //----------------------------------------
529
+ // RIFF header
530
+ err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Riff)
531
+ if err != nil {
532
+ return
533
+ }
534
+
535
+ //----------------------------------------
536
+ // chunk fmt_
537
+ wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
538
+ err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Fmt)
539
+ if err != nil {
540
+ return
541
+ }
542
+
543
+ //----------------------------------------
544
+ // chunk data
545
+ ch := WavHeaderType{
546
+ ID: tagData,
547
+ Size: wavInfo.Data.Size,
548
+ }
549
+ err = binary.Write(bufferWriter, binary.LittleEndian, ch)
550
+ if err != nil {
551
+ return
552
+ }
553
+
554
+ channel := wavInfo.Fmt.Channels
555
+ bytePerSample := wavInfo.Fmt.BitsPerSample / 8
556
+ //blockSize := channel*bytePerSample
557
+
558
+ for i := 0; i < len(wavInfo.Data.Sample); i++ {
559
+ sample := &wavInfo.Data.Sample[i]
560
+ switch bytePerSample {
561
+ case 1:
562
+ for c := 0; c < int(channel); c++ {
563
+ err = binary.Write(bufferWriter, binary.LittleEndian, sample.val8s[c])
564
+ if err != nil {
565
+ return
566
+ }
567
+ }
568
+ case 2:
569
+ for c := 0; c < int(channel); c++ {
570
+ err = binary.Write(bufferWriter, binary.LittleEndian, sample.val16s[c])
571
+ if err != nil {
572
+ return
573
+ }
574
+ }
575
+ case 4:
576
+ for c := 0; c < int(channel); c++ {
577
+ err = binary.Write(bufferWriter, binary.LittleEndian, sample.val32s[c])
578
+ if err != nil {
579
+ return
580
+ }
581
+ }
582
+ }
583
+ }
584
+ buffer = bufferWriter.Bytes()
585
+ return
586
+ }
587
+
588
+ //SaveToBuffer 保存到buffer中
589
+ func (wavInfo *WavInfoType) SaveToBufferWithChannel(voiceChannel int) (buffer []byte, err error) {
590
+
591
+ //var bufferWriter bytes.Buffer
592
+ buffer = make([]byte, 0)
593
+ bufferWriter := bytes.NewBuffer(buffer)
594
+
595
+ //calc size
596
+ channel := wavInfo.Fmt.Channels
597
+ wavInfo.Fmt.Channels = 1
598
+ wavInfo.Data.Size = (uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample) / 8) *
599
+ uint32(len(wavInfo.Data.Sample))
600
+ wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
601
+ wavInfo.Riff.Size = uint32(len(wavInfo.Riff.Fmt)) +
602
+ uint32(sizeHeader) +
603
+ wavInfo.Fmt.Size +
604
+ uint32(sizeHeader) +
605
+ wavInfo.Data.Size
606
+
607
+ //----------------------------------------
608
+ // RIFF header
609
+ err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Riff)
610
+ if err != nil {
611
+ return
612
+ }
613
+
614
+ //----------------------------------------
615
+ // chunk fmt_
616
+ wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
617
+ err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Fmt)
618
+ if err != nil {
619
+ return
620
+ }
621
+
622
+ //----------------------------------------
623
+ // chunk data
624
+ ch := WavHeaderType{
625
+ ID: tagData,
626
+ Size: wavInfo.Data.Size,
627
+ }
628
+ err = binary.Write(bufferWriter, binary.LittleEndian, ch)
629
+ if err != nil {
630
+ return
631
+ }
632
+
633
+
634
+ bytePerSample := wavInfo.Fmt.BitsPerSample / 8
635
+ //blockSize := channel*bytePerSample
636
+
637
+ for i := 0; i < len(wavInfo.Data.Sample); i++ {
638
+ sample := &wavInfo.Data.Sample[i]
639
+ switch bytePerSample {
640
+ case 1:
641
+ for c := 0; c < int(channel); c++ {
642
+ if voiceChannel == c {
643
+ err = binary.Write(bufferWriter, binary.LittleEndian, sample.val8s[c])
644
+ if err != nil {
645
+ return
646
+ }
647
+ }
648
+ }
649
+ case 2:
650
+ for c := 0; c < int(channel); c++ {
651
+ if voiceChannel == c {
652
+ err = binary.Write(bufferWriter, binary.LittleEndian, sample.val16s[c])
653
+ if err != nil {
654
+ return
655
+ }
656
+ }
657
+ }
658
+ case 4:
659
+ for c := 0; c < int(channel); c++ {
660
+ if voiceChannel == c {
661
+ err = binary.Write(bufferWriter, binary.LittleEndian, sample.val32s[c])
662
+ if err != nil {
663
+ return
664
+ }
665
+ }
666
+ }
667
+ }
668
+ }
669
+ buffer = bufferWriter.Bytes()
670
+ wavInfo.Fmt.Channels = 2
671
+ return
672
+ }
673
+
674
+ //SaveToFile 保存到文件中
675
+ func (wavInfo *WavInfoType) SaveToFileWithChannel(absFile string, voiceChannel int) (err error) {
676
+ file, err := os.Create(absFile)
677
+ if err != nil {
678
+ return
679
+ }
680
+ defer file.Close()
681
+ _, err = file.Seek(0, io.SeekStart)
682
+ if err != nil {
683
+ return
684
+ }
685
+
686
+ //calc size
687
+ channel := wavInfo.Fmt.Channels
688
+ wavInfo.Fmt.Channels = 1
689
+ wavInfo.Data.Size = (uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample) / 8) *
690
+ uint32(len(wavInfo.Data.Sample))
691
+ wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
692
+ wavInfo.Riff.Size = uint32(len(wavInfo.Riff.Fmt)) +
693
+ uint32(sizeHeader) +
694
+ wavInfo.Fmt.Size +
695
+ uint32(sizeHeader) +
696
+ wavInfo.Data.Size
697
+
698
+ //----------------------------------------
699
+ // RIFF header
700
+ err = binary.Write(file, binary.LittleEndian, wavInfo.Riff)
701
+ if err != nil {
702
+ return err
703
+ }
704
+
705
+ //----------------------------------------
706
+ // chunk fmt_
707
+ wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
708
+ err = binary.Write(file, binary.LittleEndian, wavInfo.Fmt)
709
+ if err != nil {
710
+ return err
711
+ }
712
+
713
+ //----------------------------------------
714
+ // chunk data
715
+ ch := WavHeaderType{
716
+ ID: tagData,
717
+ Size: wavInfo.Data.Size,
718
+ }
719
+ err = binary.Write(file, binary.LittleEndian, ch)
720
+ if err != nil {
721
+ return err
722
+ }
723
+
724
+ bytePerSample := wavInfo.Fmt.BitsPerSample / 8
725
+ //blockSize := channel*bytePerSample
726
+
727
+ for i := 0; i < len(wavInfo.Data.Sample); i++ {
728
+ sample := &wavInfo.Data.Sample[i]
729
+ switch bytePerSample {
730
+ case 1:
731
+ for c := 0; c < int(channel); c++ {
732
+ if voiceChannel == c {
733
+ err = binary.Write(file, binary.LittleEndian, sample.val8s[c])
734
+ if err != nil {
735
+ return
736
+ }
737
+ }
738
+ }
739
+ case 2:
740
+ for c := 0; c < int(channel); c++ {
741
+ if voiceChannel == c {
742
+ err = binary.Write(file, binary.LittleEndian, sample.val16s[c])
743
+ if err != nil {
744
+ return
745
+ }
746
+ }
747
+ }
748
+ case 4:
749
+ for c := 0; c < int(channel); c++ {
750
+ if voiceChannel == c {
751
+ err = binary.Write(file, binary.LittleEndian, sample.val32s[c])
752
+ if err != nil {
753
+ return
754
+ }
755
+ }
756
+ }
757
+ }
758
+ }
759
+ wavInfo.Fmt.Channels = 2
760
+ return
761
+ }
762
+
763
+ //AdjusterVolume 调整音量
764
+ func (wavInfo *WavInfoType) AdjusterVolume(rateAsRaw float32) {
765
+ channel := wavInfo.Fmt.Channels
766
+ bytePerSample := wavInfo.Fmt.BitsPerSample / 8
767
+ //blockSize := channel*bytePerSample
768
+
769
+ const MaxUint8 = math.MaxUint8
770
+ const MinUint8 = 0
771
+ const MaxInt16 = math.MaxInt16
772
+ const MinInt16 = math.MinInt16
773
+
774
+ for i := 0; i < len(wavInfo.Data.Sample); i++ {
775
+ switch bytePerSample {
776
+ case 1:
777
+ for c := 0; c < int(channel); c++ {
778
+ val := wavInfo.Data.Sample[i].val8s[c]
779
+ clip := float64(val) * float64(rateAsRaw)
780
+ if clip < MinUint8 {
781
+ clip = MinUint8
782
+ }
783
+ if clip > MaxUint8 {
784
+ clip = MaxUint8
785
+ }
786
+ //wavInfo.Data.Sample[i].val8s[c] = uint8((float32(val) * rateAsRaw))
787
+ wavInfo.Data.Sample[i].val8s[c] = uint8(clip)
788
+ }
789
+ case 2:
790
+ for c := 0; c < int(channel); c++ {
791
+ //val := wavInfo.Data.Sample[i].val16s[c]
792
+ //wavInfo.Data.Sample[i].val16s[c] = int16((float32(val) * rateAsRaw))
793
+ val := wavInfo.Data.Sample[i].val16s[c]
794
+ clip := float64(val) * float64(rateAsRaw)
795
+ if clip < MinInt16 {
796
+ clip = MinInt16
797
+ }
798
+ if clip > MaxInt16 {
799
+ clip = MaxInt16
800
+ }
801
+ wavInfo.Data.Sample[i].val16s[c] = int16(clip)
802
+ }
803
+ case 4:
804
+ for c := 0; c < int(channel); c++ {
805
+ val := wavInfo.Data.Sample[i].val32s[c]
806
+ wavInfo.Data.Sample[i].val32s[c] = val * rateAsRaw
807
+ }
808
+ }
809
+ }
810
+ }
811
+
812
+ //Resample 重置采样率
813
+ func (wavInfo *WavInfoType) Resample(resampleRate uint32) {
814
+ sampleRate := wavInfo.Fmt.SampleRate
815
+ rate := float64(sampleRate) / float64(resampleRate)
816
+ channel := wavInfo.Fmt.Channels
817
+ bytePerSample := wavInfo.Fmt.BitsPerSample / 8
818
+ rawLen := len(wavInfo.Data.Sample)
819
+
820
+ resampleData := make([]SampleType, 0)
821
+
822
+ resampleIdx := 0
823
+ for {
824
+ rawIdx := int(float64(resampleIdx) * rate)
825
+ if rawIdx < rawLen {
826
+ sample := SampleType{}
827
+ for c := 0; c < int(channel); c++ {
828
+ switch bytePerSample {
829
+ case 1:
830
+ val := wavInfo.Data.Sample[rawIdx].val8s[c]
831
+ //sample.val8s = append(sample.val8s,val)
832
+ sample.val8s[c] = val
833
+ case 2:
834
+ val := wavInfo.Data.Sample[rawIdx].val16s[c]
835
+ //sample.val16s = append(sample.val16s,val)
836
+ sample.val16s[c] = val
837
+ case 4:
838
+ val := wavInfo.Data.Sample[rawIdx].val32s[c]
839
+ //sample.val32s = append(sample.val32s,val)
840
+ sample.val32s[c] = val
841
+ }
842
+ }
843
+ resampleData = append(resampleData, sample)
844
+ } else {
845
+ break
846
+ }
847
+ resampleIdx++
848
+ }
849
+
850
+ wavInfo.Data.Sample = resampleData
851
+ wavInfo.Fmt.SampleRate = resampleRate
852
+
853
+ }
854
+
855
+ //ConvertToFloat32 将采样值转换到 0 到 1 之间
856
+ func (wavInfo *WavInfoType) GetFloat32Samples(channel int, bytePerSample int) []float32 {
857
+ //fmt.Println(wavInfo.cha)
858
+
859
+ var floatSamples []float32
860
+
861
+ var point float32
862
+ for i := 0; i < len(wavInfo.Data.Sample); i++ {
863
+ sample := &wavInfo.Data.Sample[i]
864
+ switch bytePerSample {
865
+ case 1:
866
+ point = float32(sample.val8s[channel]) / (1 << 8)
867
+ case 2:
868
+ point = float32(sample.val16s[channel]) / (1 << 15)
869
+ case 4:
870
+ point = sample.val32s[channel]
871
+ }
872
+ floatSamples = append(floatSamples, point)
873
+ }
874
+ return floatSamples
875
+ }
876
+
877
+ //Trim 切头切尾
878
+ func (wavInfo *WavInfoType) Trim(dbPercent float32) {
879
+ channel := wavInfo.Fmt.Channels
880
+ bytePerSample := wavInfo.Fmt.BitsPerSample / 8
881
+ //blockSize := channel*bytePerSample
882
+
883
+ const MaxUint8 = math.MaxUint8
884
+ const MinUint8 = 0
885
+ const MaxInt16 = math.MaxInt16
886
+ const MinInt16 = math.MinInt16
887
+
888
+ //trim head
889
+ silenceHeadIdx := 0 //头部静音截止位置
890
+ done := false
891
+ for i := 0; i < len(wavInfo.Data.Sample); i++ {
892
+ switch bytePerSample {
893
+ case 1:
894
+ for c := 0; c < int(channel); c++ {
895
+ val := wavInfo.Data.Sample[i].val8s[c]
896
+ if float32(val) > MaxUint8*dbPercent {
897
+ silenceHeadIdx = i
898
+ done = true
899
+ break
900
+ }
901
+ }
902
+ case 2:
903
+ for c := 0; c < int(channel); c++ {
904
+ val := wavInfo.Data.Sample[i].val16s[c]
905
+ if float32(val) > MaxInt16*dbPercent {
906
+ silenceHeadIdx = i
907
+ done = true
908
+ break
909
+ }
910
+ if float32(val) < MinInt16*dbPercent {
911
+ silenceHeadIdx = i
912
+ done = true
913
+ break
914
+ }
915
+ }
916
+ case 4:
917
+ for c := 0; c < int(channel); c++ {
918
+
919
+ val := wavInfo.Data.Sample[i].val32s[c]
920
+ if float32(val) > math.MaxFloat32*dbPercent {
921
+ silenceHeadIdx = i
922
+ done = true
923
+ break
924
+ }
925
+ if val < 0 && (-val > math.MaxFloat32*dbPercent) {
926
+ silenceHeadIdx = i
927
+ done = true
928
+ break
929
+ }
930
+ }
931
+ }
932
+
933
+ if done {
934
+ break
935
+ }
936
+ }
937
+
938
+ //trim tail,截断尾部
939
+ silenceTailIdx := len(wavInfo.Data.Sample) - 1 //尾部静音截止位置
940
+ done = false
941
+ for i := len(wavInfo.Data.Sample) - 1; i >= 0; i-- {
942
+ switch bytePerSample {
943
+ case 1:
944
+ for c := 0; c < int(channel); c++ {
945
+ val := wavInfo.Data.Sample[i].val8s[c]
946
+ if float32(val) > MaxUint8*dbPercent {
947
+ silenceTailIdx = i
948
+ done = true
949
+ break
950
+ }
951
+ }
952
+ case 2:
953
+ for c := 0; c < int(channel); c++ {
954
+ val := wavInfo.Data.Sample[i].val16s[c]
955
+ if val >= 0 && float32(val) > float32(MaxInt16*dbPercent) {
956
+ silenceTailIdx = i
957
+ done = true
958
+ break
959
+ }
960
+ if val < 0 && float32(val) < float32(MinInt16*dbPercent) {
961
+ silenceTailIdx = i
962
+ done = true
963
+ break
964
+ }
965
+ }
966
+ case 4:
967
+ for c := 0; c < int(channel); c++ {
968
+
969
+ val := wavInfo.Data.Sample[i].val32s[c]
970
+ if float32(val) > math.MaxFloat32*dbPercent {
971
+ silenceTailIdx = i
972
+ done = true
973
+ break
974
+ }
975
+ if val < 0 && (-val > math.MaxFloat32*dbPercent) {
976
+ silenceTailIdx = i
977
+ done = true
978
+ break
979
+ }
980
+ }
981
+ }
982
+
983
+ if done {
984
+ break
985
+ }
986
+ }
987
+
988
+ wavInfo.Data.Sample = wavInfo.Data.Sample[:silenceTailIdx]
989
+ wavInfo.Data.Sample = wavInfo.Data.Sample[silenceHeadIdx:]
990
+ }
991
+
992
+ func (wavInfo *WavInfoType) TrimFirstWithTime(milliseconds int64) error{
993
+ sampleRate := wavInfo.Fmt.SampleRate
994
+ sizeToTrim := int64(sampleRate) * milliseconds / 1000
995
+ if int(sizeToTrim) >= len(wavInfo.Data.Sample) {
996
+ return errors.New("check time err")
997
+ }
998
+ wavInfo.Data.Sample = wavInfo.Data.Sample[sizeToTrim:]
999
+ return nil
1000
+ }
1001
+
1002
+ func (wavInfo *WavInfoType) GetWavTime() int {
1003
+ return int(math.Ceil(float64(len(wavInfo.Data.Sample)) / float64(wavInfo.Fmt.SampleRate)))
1004
+ }
1005
+
1006
+ //NewWavInfo 新建一个wav操作实例
1007
+ func NewWavInfo() *WavInfoType {
1008
+ w := &WavInfoType{}
1009
+ return w
1010
+ }
dsp/streaming_vad/streaming_vad.go ADDED
@@ -0,0 +1,656 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package streaming_vad
2
+
3
+ import (
4
+ "fmt"
5
+ "math"
6
+ )
7
+
8
// Per-frame labels held in FrameDecisionType.thisFrameFlag / lastFrameFlag.
const (
	// FrameFlagSpeechPre is the initial label, and the label returned to after an end is confirmed.
	FrameFlagSpeechPre = iota
	// FrameFlagSpeechStart marks a candidate speech start awaiting confirmation.
	FrameFlagSpeechStart
	// FrameFlagSpeechPresent marks confirmed, ongoing speech.
	FrameFlagSpeechPresent
	// FrameFlagSpeechEnd marks a candidate speech end awaiting confirmation.
	FrameFlagSpeechEnd
	// FrameFlagSpeechPost is declared but not assigned anywhere in this file.
	FrameFlagSpeechPost
)

// Event labels emitted by StreamingVadType in VadEventMarkerType.VadFlag.
const (
	VadFlagPrepare  = "VadFlagPrepare"  // preparing (lead-in before speech)
	VadFlagSpeaking = "VadFlagSpeaking" // speaking
	VadFlagPause    = "VadFlagPause"    // comma-like pause
	VadFlagNoSpeech = "VadFlagNoSpeech" // full-stop-like pause
	VadFlagUnknown  = "VadFlagUnknown"  // unknown state
)
23
+
24
+
25
// ParametersForFdType bundles the tunable settings consumed by FrameDecisionType.Init.
type ParametersForFdType struct {
	SampleRate   uint32
	Threshold    float32 // initial energy threshold
	MinThreshold float32 // lower bound applied to the adaptive threshold

	FrameLengthInSecond                     float32
	StartRejectUpdateNoiseLevelTimeInSecond float32
	StartRejectSpeechTimeInSecond           float32

	SpeechStartWindowLengthInSecond      float32 // window length used to detect a speech start
	SpeechPresentWindowLengthInSecond    float32 // window length used to detect speech activity
	SpeechEndConfirmWindowLengthInSecond float32 // window length used to confirm a speech end

	SpeechStartRequiredLengthInSecond           float32
	SpeechStartConfirmRequiredLengthInSecond    float32
	SpeechPresentMaintainRequiredLengthInSecond float32
	SpeechEndConfirmRequiredLengthInSecond      float32
}

// Init overwrites every field of pd with the built-in defaults.
func (pd *ParametersForFdType) Init() {
	*pd = ParametersForFdType{
		SampleRate:   8000,
		Threshold:    150.0,
		MinThreshold: 50.0,

		FrameLengthInSecond: 0.01,

		// Start reject
		StartRejectUpdateNoiseLevelTimeInSecond: 0.2,
		StartRejectSpeechTimeInSecond:           0.25,

		// Window length
		SpeechStartWindowLengthInSecond:      0.15,
		SpeechPresentWindowLengthInSecond:    0.4,
		SpeechEndConfirmWindowLengthInSecond: 0.15,

		// Required length
		SpeechStartRequiredLengthInSecond:           0.09,
		SpeechStartConfirmRequiredLengthInSecond:    0.075,
		SpeechPresentMaintainRequiredLengthInSecond: 0.1,
		SpeechEndConfirmRequiredLengthInSecond:      0.12,
	}
}
66
+
67
// DecisionStateType is one slot of the decision ring buffer: the raw
// active/inactive verdict for a frame and that frame's start time.
type DecisionStateType struct {
	decisionFlag      bool   // true when the frame was judged active
	timeInMilliSecond uint32 // frame start time in milliseconds
}
71
+
72
+ type FrameDecisionType struct {
73
+ params ParametersForFdType
74
+
75
+ sampleRate uint32
76
+ threshold float32
77
+ minThreshold float32
78
+ adaptFactor float32
79
+
80
+ //
81
+ frameLengthInSecond float32
82
+
83
+ noiseLevelValue float32
84
+ startRejectUpdateNoiseLevelTimeInSecond float32
85
+ startRejectUpdateNoiseLevelFrameNumber uint32
86
+ startRejectSpeechTimeInSecond float32
87
+ startRejectSpeechTimeInMilliSecond uint32
88
+
89
+ speechStartWindowLengthInSecond float32 //检测语音开始,所需的窗口长度.
90
+ speechPresentWindowLengthInSecond float32 //检测语音活动,所需的窗口长度.
91
+ speechEndConfirmWindowLengthInSecond float32 //确认语音结束,所需的窗口长度.
92
+
93
+ speechStartRequiredLengthInSecond float32
94
+ speechStartConfirmRequiredLengthInSecond float32
95
+ speechPresentMaintainRequiredLengthInSecond float32
96
+ speechEndConfirmRequiredLengthInSecond float32
97
+
98
+ decisionStateDeque []DecisionStateType
99
+ decisionStateDequeSize uint32
100
+ decisionStateDequeIndex uint32
101
+
102
+ processedFramesNumber uint32
103
+ lastFrameFlag int
104
+ thisFrameFlag int
105
+
106
+ }
107
+
108
+ func (fd *FrameDecisionType) Init (params ParametersForFdType) {
109
+ fd.params = params
110
+
111
+ fd.sampleRate = params.SampleRate
112
+ fd.threshold = params.Threshold
113
+ fd.minThreshold = params.MinThreshold
114
+ fd.adaptFactor = fd.threshold
115
+
116
+ fd.frameLengthInSecond = params.FrameLengthInSecond
117
+
118
+ fd.noiseLevelValue = fd.threshold / 2.0
119
+ fd.startRejectUpdateNoiseLevelTimeInSecond = params.StartRejectUpdateNoiseLevelTimeInSecond
120
+ fd.startRejectUpdateNoiseLevelFrameNumber = uint32(fd.startRejectUpdateNoiseLevelTimeInSecond / fd.frameLengthInSecond)
121
+
122
+ fd.startRejectSpeechTimeInSecond = params.StartRejectSpeechTimeInSecond
123
+ fd.startRejectSpeechTimeInMilliSecond = uint32(fd.startRejectSpeechTimeInSecond * 1e3 + 0.5)
124
+ fd.speechStartWindowLengthInSecond = params.SpeechStartWindowLengthInSecond
125
+ fd.speechPresentWindowLengthInSecond = params.SpeechPresentWindowLengthInSecond
126
+ fd.speechEndConfirmWindowLengthInSecond = params.SpeechEndConfirmWindowLengthInSecond
127
+
128
+ fd.speechStartRequiredLengthInSecond = params.SpeechStartRequiredLengthInSecond
129
+ fd.speechStartConfirmRequiredLengthInSecond = params.SpeechStartConfirmRequiredLengthInSecond
130
+ fd.speechPresentMaintainRequiredLengthInSecond = params.SpeechPresentMaintainRequiredLengthInSecond
131
+ fd.speechEndConfirmRequiredLengthInSecond = params.SpeechEndConfirmRequiredLengthInSecond
132
+
133
+ //initialize: decisionStateDeque, decisionStateDequeSize, decisionStateDequeIndex
134
+ largestWindowLengthInSecond := fd.speechStartWindowLengthInSecond
135
+ if largestWindowLengthInSecond < fd.speechPresentWindowLengthInSecond {
136
+ largestWindowLengthInSecond = fd.speechPresentWindowLengthInSecond
137
+ }
138
+ if largestWindowLengthInSecond < fd.speechEndConfirmWindowLengthInSecond {
139
+ largestWindowLengthInSecond = fd.speechEndConfirmWindowLengthInSecond
140
+ }
141
+ decisionStateDequeSize := uint32(largestWindowLengthInSecond / fd.frameLengthInSecond + 0.5)
142
+ fd.RefreshDecisionStateDeque(decisionStateDequeSize)
143
+
144
+ fd.processedFramesNumber = 0
145
+ fd.lastFrameFlag = FrameFlagSpeechPre
146
+ fd.thisFrameFlag = FrameFlagSpeechPre
147
+
148
+ }
149
+
150
+ /*
151
+ ProcessStart 当连续语音太长被强制截断时, 就需要有一个方法来重置状态
152
+ */
153
+ func (fd *FrameDecisionType) ProcessStart(resetThreshold bool) {
154
+ fd.RefreshDecisionStateDeque(fd.decisionStateDequeSize)
155
+
156
+ if resetThreshold {
157
+ fd.threshold = fd.params.Threshold
158
+ fd.adaptFactor = fd.threshold
159
+ fd.noiseLevelValue = fd.threshold / 2.0
160
+ fd.processedFramesNumber = 0
161
+ }
162
+
163
+ fd.lastFrameFlag = FrameFlagSpeechPre
164
+ fd.thisFrameFlag = FrameFlagSpeechPre
165
+ }
166
+
167
+ func (fd *FrameDecisionType) RefreshDecisionStateDeque(decisionStateDequeSize uint32) {
168
+ fd.decisionStateDeque = make([]DecisionStateType, decisionStateDequeSize)
169
+ fd.decisionStateDequeSize = decisionStateDequeSize
170
+ fd.decisionStateDequeIndex = 0
171
+ }
172
+
173
+ func (fd *FrameDecisionType) UpdateDecisionState (frameStartTimeInMilliSecond uint32, decisionFlag bool) {
174
+ fd.decisionStateDeque[fd.decisionStateDequeIndex].decisionFlag = decisionFlag
175
+ fd.decisionStateDeque[fd.decisionStateDequeIndex].timeInMilliSecond = frameStartTimeInMilliSecond
176
+ fd.decisionStateDequeIndex = (fd.decisionStateDequeIndex + 1) % fd.decisionStateDequeSize
177
+ }
178
+
179
+ func (fd *FrameDecisionType) SumDecisionTrue (durationInSecond float32) (activeDurationInSecond float32) {
180
+ if len(fd.decisionStateDeque) == 0 {
181
+ return 0.0
182
+ }
183
+
184
+ indexTemp := int64(fd.decisionStateDequeIndex) - 1
185
+ if indexTemp < 0 {
186
+ indexTemp = int64(fd.decisionStateDequeSize) - 1
187
+ }
188
+
189
+ decisionFlag := fd.decisionStateDeque[indexTemp].decisionFlag
190
+ endInMilliSecond := int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond)
191
+ beginInMilliSecond := endInMilliSecond - int64(durationInSecond * 1e3)
192
+ if beginInMilliSecond < 0 {
193
+ beginInMilliSecond = 0
194
+ }
195
+
196
+ var timeSum uint32 = 0
197
+ for i := uint32(1); i < fd.decisionStateDequeSize; i++ {
198
+ if fd.decisionStateDeque[indexTemp].timeInMilliSecond < uint32(beginInMilliSecond) {
199
+ break
200
+ }
201
+ indexTemp--
202
+ if indexTemp < 0 {
203
+ indexTemp = int64(fd.decisionStateDequeSize) - 1
204
+ }
205
+ if decisionFlag {
206
+ timeSum += uint32(endInMilliSecond) - fd.decisionStateDeque[indexTemp].timeInMilliSecond
207
+ }
208
+ decisionFlag = fd.decisionStateDeque[indexTemp].decisionFlag
209
+ endInMilliSecond = int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond)
210
+ }
211
+
212
+ activeDurationInSecond = float32(timeSum) * 1e-3
213
+ return activeDurationInSecond
214
+ }
215
+
216
+ /*
217
+ SpeechFrameProcess 处理一帧音频, 给该帧音频配一个标签.
218
+ */
219
+ func (fd *FrameDecisionType) SpeechFrameProcess (frameStartTimeInMilliSecond uint32, buffer []int16) {
220
+ bufferSize := uint32(len(buffer))
221
+
222
+ /**************************Calculate the RMS***************************/
223
+ sumTemp := int64(0)
224
+ ssqTemp := int64(0)
225
+ for i := uint32(0); i < bufferSize; i++ {
226
+ sumTemp = sumTemp + int64(buffer[i])
227
+ ssqTemp = ssqTemp + int64(buffer[i]) * int64(buffer[i])
228
+ }
229
+
230
+ sum := float64(sumTemp)
231
+ sum /= float64(bufferSize)
232
+
233
+ ssq := float64(ssqTemp)
234
+ rms := float32(math.Sqrt((ssq / float64(bufferSize)) - (sum * sum)))
235
+
236
+ //fmt.Printf("rms %f\n", rms)
237
+ /**********************************************************************/
238
+ var decisionFlag bool
239
+ if frameStartTimeInMilliSecond < fd.startRejectSpeechTimeInMilliSecond {
240
+ decisionFlag = false
241
+ } else {
242
+ decisionFlag = rms > fd.threshold && rms > 400
243
+ }
244
+ //fmt.Printf("decisionFlag %t\n", decisionFlag)
245
+
246
+ fd.UpdateDecisionState(frameStartTimeInMilliSecond, decisionFlag)
247
+
248
+ if fd.thisFrameFlag == FrameFlagSpeechPre {
249
+ if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartRequiredLengthInSecond {
250
+ fd.thisFrameFlag = FrameFlagSpeechStart
251
+ }
252
+ } else if fd.thisFrameFlag == FrameFlagSpeechStart {
253
+ if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartConfirmRequiredLengthInSecond {
254
+ fd.thisFrameFlag = FrameFlagSpeechPresent
255
+ } else {
256
+ //TODO: 感觉这一部分是不会触发的吧.
257
+ if fd.speechStartConfirmRequiredLengthInSecond != 0 {
258
+ fd.thisFrameFlag = FrameFlagSpeechPre
259
+ }
260
+ }
261
+ } else if fd.thisFrameFlag == FrameFlagSpeechPresent {
262
+ if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) < fd.speechPresentMaintainRequiredLengthInSecond {
263
+ fd.thisFrameFlag = FrameFlagSpeechEnd
264
+ }
265
+ } else if fd.thisFrameFlag == FrameFlagSpeechEnd {
266
+ if fd.SumDecisionTrue(fd.speechEndConfirmWindowLengthInSecond) <= fd.speechEndConfirmRequiredLengthInSecond {
267
+ fd.thisFrameFlag = FrameFlagSpeechPre
268
+ } else if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) >= fd.speechPresentMaintainRequiredLengthInSecond {
269
+ //fd.thisFrameFlag = FrameFlagSpeechPre
270
+ //我感觉这里的条件判断应该是 < 而不是 >=.
271
+ //有可能他是想在这里添加一个短暂的停顿,用于添加逗号.
272
+ fd.thisFrameFlag = FrameFlagSpeechPre
273
+ }
274
+ }
275
+
276
+ //
277
+ if fd.thisFrameFlag == FrameFlagSpeechPre && !decisionFlag {
278
+ fd.threshold = (0.02 * rms * 2) + (0.98 * fd.threshold)
279
+ fd.adaptFactor = fd.threshold
280
+ } else if decisionFlag && fd.thisFrameFlag == FrameFlagSpeechPresent {
281
+ if rms < fd.adaptFactor {
282
+ fd.adaptFactor = 0.01 * rms + 0.99 * fd.adaptFactor
283
+ } else {
284
+ fd.adaptFactor = 0.05 * rms + 0.95 * fd.adaptFactor
285
+ }
286
+
287
+ thresholdTemp := fd.noiseLevelValue + 0.3 * fd.adaptFactor
288
+ fd.threshold = (0.1 * thresholdTemp) + 0.9 * fd.threshold
289
+ }
290
+
291
+ //
292
+ if fd.threshold < fd.minThreshold {
293
+ fd.threshold = fd.minThreshold
294
+ }
295
+
296
+ // Update the Threshold
297
+ if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber {
298
+ alphaAdapt := float32(fd.processedFramesNumber) / float32(fd.startRejectUpdateNoiseLevelFrameNumber)
299
+ fd.noiseLevelValue = (alphaAdapt * fd.noiseLevelValue) + ((1 - alphaAdapt) * rms)
300
+ } else {
301
+ if rms > fd.noiseLevelValue {
302
+ fd.noiseLevelValue = (0.001 * rms) + (0.999 * fd.noiseLevelValue)
303
+ } else {
304
+ fd.noiseLevelValue = (0.05 * rms) + (0.95 * fd.noiseLevelValue)
305
+ }
306
+ }
307
+
308
+ if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber {
309
+ if fd.noiseLevelValue > 400 {
310
+ fd.noiseLevelValue = fd.noiseLevelValue * 0.1
311
+ }
312
+ fd.threshold = fd.noiseLevelValue * 2
313
+
314
+ if fd.threshold < fd.minThreshold {
315
+ fd.threshold = fd.minThreshold
316
+ }
317
+ }
318
+
319
+ fd.processedFramesNumber++
320
+ }
321
+
322
// VadEventMarkerType is a single VAD event: which state was entered and when.
type VadEventMarkerType struct {
	VadFlag string // one of the VadFlag* constants
	Time    uint32 // event time in milliseconds
}
326
+
327
+ type StreamingVadType struct{
328
+ sampleRate uint32 //采样率
329
+ silenceTime float32 //判断语音结束时需要的静音时长
330
+ timeout float32 //单个语音的最大长度. 语音活动时长超过时, 将被强制判断定为结束
331
+ timeoutInMilliSecond uint32
332
+
333
+ //VAD检测按帧指定, 每一次接收到音频信号时, 需要将 frameLength 的余数部分保存起来, 下一次则将要将余下的部分拼接到信号的开始位置.
334
+ frameLength uint32 //每一帖的长度
335
+ unfinishedFrame []int16 //剩余帧
336
+ unfinishedFrameSize uint32 //剩余帧长度
337
+
338
+ frameDecision FrameDecisionType
339
+
340
+ //
341
+ startRejectSpeechTimeInMilliSecond uint32
342
+
343
+ allowedSilenceTimeInSpeechInMilliSecond uint32
344
+ allowedLongestSpeechDurationInMilliSecond uint32
345
+ minDurationOfLongSpeechInMilliSecond uint32
346
+ endOfLongSpeechRequiredSilenceTimeInMilliSecond uint32
347
+ endOfNormalSpeechRequiredSilenceTimeInMilliSecond uint32
348
+ minDurationOfSpeechToAddCommaInMilliSecond uint32 //where to add comma if speech
349
+
350
+ //只有检测到 VadFlagSpeaking 标签时, 才知道语音已经开始了,
351
+ //此时向前推 prepareDurationInMilliSecond 的时间, 将其定义为 VadFlagPrepare 的位置.
352
+ prepareDurationInMilliSecond uint32
353
+
354
+ //检测到语音结束时, 并不会马上判断语音结束,
355
+ //而是需要一定时间 SpeechEndConfirmWindowLengthInSecond 的静音后再确认它,
356
+ //语音结束的事件节点是`语音结束`后 nonSpeechPadInInMilliSecond 的时间位置.
357
+ nonSpeechPadInInMilliSecond uint32
358
+
359
+ speechFrameGlobalTimeInMilliSecond uint32
360
+
361
+ speechDetectedStartTimeInMilliSecond uint32
362
+ speechDetectedStartTimeIsValid bool
363
+ speechDetectedEndTimeInMilliSecond uint32
364
+ speechDetectedEndTimeIsValid bool
365
+ speechDetectedEndTimeIsValidPossible bool
366
+ speechDetectedStartAndEnd bool
367
+
368
+ //
369
+ lastVadEndTimeInMilliSecond uint32
370
+ thisDetectedState string //VadFlag
371
+ VadEventMarkerDeque []VadEventMarkerType
372
+ }
373
+
374
+ /*
375
+ silenceTime: 0.4
376
+ timeout: 3.0
377
+
378
+ 以下条件应满足:
379
+ minDurationOfSpeechToAddCommaInMilliSecond < endOfNormalSpeechRequiredSilenceTimeInMilliSecond
380
+ endOfNormalSpeechRequiredSilenceTimeInMilliSecond < endOfLongSpeechRequiredSilenceTimeInMilliSecond
381
+
382
+ */
383
+ func (sv *StreamingVadType) Init (sampleRate uint32, silenceTime float32, timeout float32) {
384
+ sv.sampleRate = sampleRate
385
+ sv.timeout = timeout
386
+ sv.timeoutInMilliSecond = uint32(timeout * 1e3)
387
+
388
+ sv.frameLength = uint32(0.02 * float32(sampleRate))
389
+ sv.unfinishedFrameSize = 0
390
+
391
+ var params ParametersForFdType
392
+ params.Init()
393
+ params.SampleRate = sampleRate
394
+ sv.frameDecision.Init(params)
395
+
396
+ //
397
+ sv.startRejectSpeechTimeInMilliSecond = uint32(0.7 * 1e3)
398
+ sv.allowedSilenceTimeInSpeechInMilliSecond = uint32(0.2 * 1e3)
399
+ sv.minDurationOfLongSpeechInMilliSecond = 0
400
+ sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond = 0
401
+ sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond = uint32(silenceTime * 1e3)
402
+ sv.minDurationOfSpeechToAddCommaInMilliSecond = uint32(0.3 * 1e3)
403
+ sv.allowedLongestSpeechDurationInMilliSecond = 0
404
+
405
+ sv.prepareDurationInMilliSecond = uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3) * 2
406
+ sv.nonSpeechPadInInMilliSecond = uint32(silenceTime * 1e3 * 0.5)
407
+ sv.speechFrameGlobalTimeInMilliSecond = 0
408
+
409
+ //
410
+ sv.speechDetectedStartTimeInMilliSecond = 0
411
+ sv.speechDetectedStartTimeIsValid = false
412
+ sv.speechDetectedEndTimeInMilliSecond = 0
413
+ sv.speechDetectedEndTimeIsValid = false
414
+ sv.speechDetectedEndTimeIsValidPossible = false
415
+ sv.speechDetectedStartAndEnd = false
416
+
417
+ //
418
+ sv.lastVadEndTimeInMilliSecond = 0
419
+ sv.thisDetectedState = VadFlagNoSpeech
420
+ fmt.Println("do StreamingVad Init...")
421
+ }
422
+
423
+
424
+ func (sv *StreamingVadType) ProcessSpeechByChunk(buffer []int16) (err error) {
425
+ var validBuffer []int16
426
+
427
+ //unfinished frame
428
+ unfinishedFrameSize := uint32(len(sv.unfinishedFrame))
429
+ for i := uint32(0); i < unfinishedFrameSize; i++ {
430
+ validBuffer = append(validBuffer, sv.unfinishedFrame[i])
431
+ }
432
+
433
+ //buffer
434
+ bufferSize := uint32(len(buffer))
435
+ for i := uint32(0); i < bufferSize; i++ {
436
+ validBuffer = append(validBuffer, buffer[i])
437
+ }
438
+
439
+ //remainder
440
+ remainderSize := uint32(len(validBuffer)) % sv.frameLength
441
+ boundary := uint32(len(validBuffer)) - remainderSize
442
+ sv.unfinishedFrame = validBuffer[boundary:]
443
+ validBuffer = validBuffer[:boundary]
444
+ if uint32(len(validBuffer)) > sv.frameLength {
445
+ sv.ProcessSpeech(validBuffer)
446
+ }
447
+ return nil
448
+ }
449
+
450
+
451
+ //ProcessSpeechByChunk 需要将 buffer 更新成指定 frameLength 的倍数, 多余的部分保存起来以供下次使用.
452
+ func (sv *StreamingVadType) DeprecatedProcessSpeechByChunk(buffer []int16) (err error) {
453
+ bufferSize := uint32(len(buffer))
454
+
455
+ var validBuffer []int16
456
+ var unfinishedFrame []int16
457
+ var point int16
458
+ validSize := (uint32(len(buffer)) + sv.unfinishedFrameSize) / sv.frameLength * sv.frameLength
459
+ if validSize >= sv.frameLength {
460
+ if sv.unfinishedFrameSize != 0 {
461
+ for i := uint32(0); i < sv.unfinishedFrameSize; i++ {
462
+ point = sv.unfinishedFrame[i]
463
+ validBuffer = append(validBuffer, point)
464
+ }
465
+ for i := uint32(0); i < validSize - sv.unfinishedFrameSize; i++ {
466
+ point = buffer[i]
467
+ validBuffer = append(validBuffer, point)
468
+ }
469
+ } else {
470
+ for i := uint32(0); i < validSize; i++ {
471
+ point = buffer[i]
472
+ validBuffer = append(validBuffer, point)
473
+ }
474
+ }
475
+ sv.ProcessSpeech(validBuffer)
476
+ }
477
+
478
+ //fmt.Printf("validBuffer size: %d\n", len(validBuffer))
479
+ //fmt.Printf("validSize: %d\n", validSize)
480
+ //fmt.Printf("last unfinishedFrameSize: %d\n", sv.unfinishedFrameSize)
481
+
482
+ sv.unfinishedFrameSize = (bufferSize + sv.unfinishedFrameSize) - validSize
483
+ begin := bufferSize - sv.unfinishedFrameSize - 1
484
+ for i := uint32(0); i < sv.unfinishedFrameSize; i++ {
485
+ point = buffer[begin + i]
486
+ unfinishedFrame = append(unfinishedFrame, point)
487
+ }
488
+ sv.unfinishedFrame = unfinishedFrame
489
+
490
+ fmt.Println("do StreamingVad ProcessSpeechByChunk...")
491
+ return nil
492
+ }
493
+
494
+ //ProcessSpeech 根据上一帧的语音标签和当前帧的语音标签来判断VAD状态.
495
+ func (sv *StreamingVadType) ProcessSpeech(buffer []int16) {
496
+ bufferLength := uint32(len(buffer))
497
+ if bufferLength % sv.frameLength != 0 {
498
+ panic(fmt.Sprintf("bufferLength (%d) should be a multiple of B frameLength (%d)", bufferLength, sv.frameLength))
499
+ }
500
+
501
+ var frameBuffer []int16
502
+ for begin := uint32(0); begin + sv.frameLength <= bufferLength; {
503
+ frameBuffer = buffer[begin: begin + sv.frameLength]
504
+ sv.frameDecision.SpeechFrameProcess(sv.speechFrameGlobalTimeInMilliSecond, frameBuffer)
505
+ begin += sv.frameLength
506
+
507
+ if sv.frameDecision.lastFrameFlag == FrameFlagSpeechStart && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPresent {
508
+ if sv.thisDetectedState == VadFlagNoSpeech {
509
+ //start
510
+ var prepareTime uint32 = 0
511
+ if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
512
+ prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
513
+ }
514
+ if prepareTime < sv.lastVadEndTimeInMilliSecond {
515
+ prepareTime = sv.lastVadEndTimeInMilliSecond
516
+ }
517
+ vadEventMarker := VadEventMarkerType{
518
+ VadFlag: VadFlagPrepare,
519
+ Time: prepareTime,
520
+ }
521
+ sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
522
+
523
+ sv.thisDetectedState = VadFlagSpeaking
524
+
525
+ vadEventMarker = VadEventMarkerType{
526
+ VadFlag: VadFlagSpeaking,
527
+ Time: sv.speechFrameGlobalTimeInMilliSecond,
528
+ }
529
+ sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
530
+
531
+ //
532
+ sv.speechDetectedEndTimeInMilliSecond = uint32(0)
533
+ sv.speechDetectedEndTimeIsValid = false
534
+ //sv.speechDetectedEndTimeIsValidPossible = false
535
+ sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3)
536
+ sv.speechDetectedStartTimeIsValid = true
537
+ } else if sv.thisDetectedState == VadFlagSpeaking && sv.speechDetectedEndTimeIsValid &&
538
+ sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond > sv.minDurationOfSpeechToAddCommaInMilliSecond &&
539
+ sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond < sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond {
540
+
541
+ //pause
542
+ vadEventMarker := VadEventMarkerType{
543
+ VadFlag: VadFlagPause,
544
+ Time: sv.speechDetectedEndTimeInMilliSecond,
545
+ }
546
+ sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
547
+
548
+ sv.thisDetectedState = VadFlagSpeaking
549
+
550
+ vadEventMarker = VadEventMarkerType{
551
+ VadFlag: VadFlagSpeaking,
552
+ Time: sv.speechFrameGlobalTimeInMilliSecond,
553
+ }
554
+ sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
555
+
556
+ //
557
+ sv.speechDetectedEndTimeInMilliSecond = uint32(0)
558
+ sv.speechDetectedEndTimeIsValid = false
559
+ //sv.speechDetectedEndTimeIsValidPossible = false
560
+ } else if sv.thisDetectedState == VadFlagSpeaking &&
561
+ sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond <= sv.minDurationOfSpeechToAddCommaInMilliSecond {
562
+ //
563
+ sv.speechDetectedEndTimeInMilliSecond = uint32(0)
564
+ sv.speechDetectedEndTimeIsValid = false
565
+ //sv.speechDetectedEndTimeIsValidPossible = false
566
+ } else {}
567
+ }
568
+
569
+ //end
570
+ if sv.frameDecision.lastFrameFlag == FrameFlagSpeechEnd && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPre {
571
+ sv.speechDetectedEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
572
+ sv.speechDetectedEndTimeIsValid = true
573
+ //sv.speechDetectedEndTimeIsValidPossible = true
574
+ }
575
+
576
+ //只在开始一定时间后, 才能检测到 Vad 结束.
577
+ if sv.speechFrameGlobalTimeInMilliSecond > sv.startRejectSpeechTimeInMilliSecond {
578
+ if sv.speechDetectedEndTimeIsValid {
579
+ var endOfSpeechRequiredSilenceTime uint32
580
+ if sv.minDurationOfLongSpeechInMilliSecond > 0 &&
581
+ sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond > 0 &&
582
+ (sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond) > sv.minDurationOfLongSpeechInMilliSecond {
583
+ endOfSpeechRequiredSilenceTime = sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond
584
+ } else {
585
+ endOfSpeechRequiredSilenceTime = sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond
586
+ }
587
+
588
+ if (sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond) >= endOfSpeechRequiredSilenceTime {
589
+ endTime := sv.speechDetectedEndTimeInMilliSecond + sv.nonSpeechPadInInMilliSecond
590
+ sv.speechDetectedEndTimeInMilliSecond = uint32(0)
591
+ sv.speechDetectedEndTimeIsValid = false
592
+ sv.speechDetectedStartTimeInMilliSecond = uint32(0)
593
+ sv.speechDetectedStartTimeIsValid = false
594
+ sv.thisDetectedState = VadFlagNoSpeech
595
+
596
+ sv.lastVadEndTimeInMilliSecond = endTime
597
+ vadEventMarker := VadEventMarkerType{
598
+ VadFlag: VadFlagNoSpeech,
599
+ Time: endTime,
600
+ }
601
+ sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
602
+ }
603
+ }
604
+ }
605
+
606
+ //当语音时长超过时, 强制切断
607
+ if sv.speechDetectedStartTimeIsValid && sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond > sv.timeoutInMilliSecond {
608
+ //end
609
+ sv.speechDetectedEndTimeInMilliSecond = uint32(0)
610
+ sv.speechDetectedEndTimeIsValid = false
611
+ sv.speechDetectedStartTimeInMilliSecond = uint32(0)
612
+ sv.speechDetectedStartTimeIsValid = false
613
+ sv.thisDetectedState = VadFlagNoSpeech
614
+
615
+ sv.lastVadEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
616
+ vadEventMarker := VadEventMarkerType{
617
+ VadFlag: VadFlagNoSpeech,
618
+ Time: sv.speechFrameGlobalTimeInMilliSecond,
619
+ }
620
+ sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
621
+
622
+ //start
623
+ var prepareTime uint32 = 0
624
+ if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
625
+ prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
626
+ }
627
+ if prepareTime < sv.lastVadEndTimeInMilliSecond {
628
+ prepareTime = sv.lastVadEndTimeInMilliSecond
629
+ }
630
+ vadEventMarker = VadEventMarkerType{
631
+ VadFlag: VadFlagPrepare,
632
+ Time: prepareTime,
633
+ }
634
+ sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
635
+
636
+ sv.thisDetectedState = VadFlagSpeaking
637
+
638
+ vadEventMarker = VadEventMarkerType{
639
+ VadFlag: VadFlagSpeaking,
640
+ Time: sv.speechFrameGlobalTimeInMilliSecond,
641
+ }
642
+ sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
643
+
644
+ //
645
+ sv.speechDetectedEndTimeInMilliSecond = uint32(0)
646
+ sv.speechDetectedEndTimeIsValid = false
647
+ //sv.speechDetectedEndTimeIsValidPossible = false
648
+ sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3)
649
+ sv.speechDetectedStartTimeIsValid = true
650
+ }
651
+
652
+ //loop
653
+ sv.frameDecision.lastFrameFlag = sv.frameDecision.thisFrameFlag
654
+ sv.speechFrameGlobalTimeInMilliSecond += uint32(float32(sv.frameLength) / float32(sv.sampleRate) * 1e3)
655
+ }
656
+ }
go.mod ADDED
@@ -0,0 +1 @@
 
 
1
+ module vad_go
log.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import logging
4
+ from logging.handlers import TimedRotatingFileHandler
5
+ import os
6
+
7
+
8
+ def setup(log_directory: str):
9
+ fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"
10
+
11
+ stream_handler = logging.StreamHandler()
12
+ stream_handler.setLevel(logging.INFO)
13
+ stream_handler.setFormatter(logging.Formatter(fmt))
14
+
15
+ # main
16
+ main_logger = logging.getLogger("main")
17
+ main_logger.addHandler(stream_handler)
18
+ main_info_file_handler = TimedRotatingFileHandler(
19
+ filename=os.path.join(log_directory, "main.log"),
20
+ encoding="utf-8",
21
+ when="midnight",
22
+ interval=1,
23
+ backupCount=30
24
+ )
25
+ main_info_file_handler.setLevel(logging.INFO)
26
+ main_info_file_handler.setFormatter(logging.Formatter(fmt))
27
+ main_logger.addHandler(main_info_file_handler)
28
+
29
+ # http
30
+ http_logger = logging.getLogger("http")
31
+ http_file_handler = TimedRotatingFileHandler(
32
+ filename=os.path.join(log_directory, "http.log"),
33
+ encoding='utf-8',
34
+ when="midnight",
35
+ interval=1,
36
+ backupCount=30
37
+ )
38
+ http_file_handler.setLevel(logging.DEBUG)
39
+ http_file_handler.setFormatter(logging.Formatter(fmt))
40
+ http_logger.addHandler(http_file_handler)
41
+
42
+ # api
43
+ api_logger = logging.getLogger("api")
44
+ api_file_handler = TimedRotatingFileHandler(
45
+ filename=os.path.join(log_directory, "api.log"),
46
+ encoding='utf-8',
47
+ when="midnight",
48
+ interval=1,
49
+ backupCount=30
50
+ )
51
+ api_file_handler.setLevel(logging.DEBUG)
52
+ api_file_handler.setFormatter(logging.Formatter(fmt))
53
+ api_logger.addHandler(api_file_handler)
54
+
55
+ # alarm
56
+ alarm_logger = logging.getLogger("alarm")
57
+ alarm_file_handler = TimedRotatingFileHandler(
58
+ filename=os.path.join(log_directory, "alarm.log"),
59
+ encoding="utf-8",
60
+ when="midnight",
61
+ interval=1,
62
+ backupCount=30
63
+ )
64
+ alarm_file_handler.setLevel(logging.DEBUG)
65
+ alarm_file_handler.setFormatter(logging.Formatter(fmt))
66
+ alarm_logger.addHandler(alarm_file_handler)
67
+
68
+ debug_file_handler = TimedRotatingFileHandler(
69
+ filename=os.path.join(log_directory, "debug.log"),
70
+ encoding="utf-8",
71
+ when="D",
72
+ interval=1,
73
+ backupCount=7
74
+ )
75
+ debug_file_handler.setLevel(logging.DEBUG)
76
+ debug_file_handler.setFormatter(logging.Formatter(fmt))
77
+
78
+ info_file_handler = TimedRotatingFileHandler(
79
+ filename=os.path.join(log_directory, "info.log"),
80
+ encoding="utf-8",
81
+ when="D",
82
+ interval=1,
83
+ backupCount=7
84
+ )
85
+ info_file_handler.setLevel(logging.INFO)
86
+ info_file_handler.setFormatter(logging.Formatter(fmt))
87
+
88
+ error_file_handler = TimedRotatingFileHandler(
89
+ filename=os.path.join(log_directory, "error.log"),
90
+ encoding="utf-8",
91
+ when="D",
92
+ interval=1,
93
+ backupCount=7
94
+ )
95
+ error_file_handler.setLevel(logging.ERROR)
96
+ error_file_handler.setFormatter(logging.Formatter(fmt))
97
+
98
+ logging.basicConfig(
99
+ level=logging.DEBUG,
100
+ datefmt="%a, %d %b %Y %H:%M:%S",
101
+ handlers=[
102
+ debug_file_handler,
103
+ info_file_handler,
104
+ error_file_handler,
105
+ ]
106
+ )
107
+
108
+
109
+ if __name__ == "__main__":
110
+ pass
main.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import logging
5
+ from pathlib import Path
6
+ import platform
7
+ import re
8
+
9
+ from project_settings import project_path, log_directory
10
+ import log
11
+
12
+ log.setup(log_directory=log_directory)
13
+
14
+ import gradio as gr
15
+
16
+ from toolbox.os.command import Command
17
+
18
+ main_logger = logging.getLogger("main")
19
+
20
+
21
def get_args():
    """Parse the command-line options.

    ``--example_wav_dir`` is the directory scanned for example ``*.wav`` files;
    it defaults to ``data/examples`` under the project root.
    """
    default_example_wav_dir = (project_path / "data/examples").as_posix()

    parser = argparse.ArgumentParser()
    parser.add_argument("--example_wav_dir", default=default_example_wav_dir, type=str)
    return parser.parse_args()
31
+
32
+
33
def process_uploaded_file(filename: str) -> str:
    """Run the external recognizer on an uploaded audio file and return its text.

    :param filename: path of the uploaded file.
    :return: the text captured from the ``text: ...`` section of the tool output.
    :raises AssertionError: when the tool output does not match the expected layout.
    """
    # Local import: the path comes from a user upload and is interpolated into
    # a command string, so it must be quoted. Paths made only of safe
    # characters are left unchanged by shlex.quote.
    import shlex

    filename = Path(filename).as_posix()

    main_logger.info("asr recognize: {}".format(filename))

    # NOTE(review): the build script appears to produce `build/vad_go`, not
    # `build/asr_id` — confirm that this binary exists in the deployment.
    # NOTE(review): assumes Command.popen hands the string to a POSIX shell — confirm.
    cmd = "build/asr_id --filename {}".format(
        shlex.quote(filename)
    )
    asr_result = Command.popen(cmd)

    pattern = "text: (.*)textSize: (.*)wordSize: (.*)timeCost: (.+)"
    match = re.search(pattern, asr_result, flags=re.IGNORECASE | re.DOTALL)

    if match is None:
        raise AssertionError("run asr recognize failed: \n{}".format(asr_result))

    return match.group(1)
52
+
53
+
54
def shell(cmd: str):
    """Execute ``cmd`` through ``Command.popen`` and return its output.

    SECURITY NOTE(review): this function is wired to a text box in the web UI,
    so anyone who can reach the UI can run arbitrary commands on the host.
    Confirm this is intended before exposing the service.
    """
    output = Command.popen(cmd)
    return output
56
+
57
+
58
def main():
    """Build the Gradio UI (an upload tab and a shell tab) and launch it on port 7860."""
    args = get_args()

    title = "## 针对电话场景的印尼语ASR."

    # examples: one single-element row per *.wav file in the example directory
    example_wav_dir = Path(args.example_wav_dir)
    examples = [[wav_path.as_posix()] for wav_path in example_wav_dir.glob("*.wav")]

    # Bind to localhost on Windows, to all interfaces elsewhere.
    on_windows = platform.system() == "Windows"

    # blocks
    with gr.Blocks() as blocks:
        gr.Markdown(value=title)

        with gr.Tabs():
            with gr.TabItem("Upload from disk"):
                uploaded_file = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload from disk",
                )
                upload_button = gr.Button("Submit for recognition")
                uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")

                gr.Examples(
                    examples=examples,
                    inputs=[uploaded_file],
                    outputs=[uploaded_output],
                    fn=process_uploaded_file
                )

                upload_button.click(
                    process_uploaded_file,
                    inputs=[uploaded_file],
                    outputs=[uploaded_output],
                )
            with gr.TabItem("shell"):
                shell_text = gr.Textbox(label="cmd")
                shell_button = gr.Button("run")
                shell_output = gr.Textbox(label="output")

                shell_button.click(
                    shell,
                    inputs=[shell_text],
                    outputs=[shell_output],
                )

    blocks.queue().launch(
        # Sharing is disabled on every platform.
        share=False,
        server_name="127.0.0.1" if on_windows else "0.0.0.0",
        server_port=7860
    )

    return
130
+
131
+
132
+ if __name__ == "__main__":
133
+ main()
project_settings.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import os
4
+ from pathlib import Path
5
+
6
+ from toolbox.os.environment import EnvironmentManager
7
+
8
+
9
# Absolute directory that contains this settings module; every other project
# path is derived from it.
project_path = Path(os.path.abspath(os.path.dirname(__file__)))

# Runtime directories are created on import so later code can write to them.
log_directory = project_path / "logs"
log_directory.mkdir(exist_ok=True, parents=True)

temp_directory = project_path / "temp"
temp_directory.mkdir(exist_ok=True, parents=True)

# The dotenv file is selected by the "environment" variable, defaulting to "dev".
environment = EnvironmentManager(
    env=os.environ.get("environment", "dev"),
    path=os.path.join(project_path, "dotenv"),
)
22
+
23
+
24
+ if __name__ == '__main__':
25
+ pass
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
gradio==4.36.1
# toolbox/os/environment.py imports `dotenv`
python-dotenv
toolbox/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ if __name__ == '__main__':
5
+ pass
toolbox/json/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/json/misc.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Callable
4
+
5
+
6
def traverse(js, callback: Callable, *args, **kwargs):
    """Rebuild a nested structure, passing each ``int``/``str`` leaf through ``callback``.

    Lists, tuples and dicts are rebuilt recursively (dict keys are traversed
    as well as values).  ``int`` and ``str`` values are replaced by
    ``callback(value, *args, **kwargs)``; any other value is returned as is.
    """
    def recurse(node):
        return traverse(node, callback, *args, **kwargs)

    if isinstance(js, list):
        return [recurse(item) for item in js]
    if isinstance(js, tuple):
        return tuple(recurse(item) for item in js)
    if isinstance(js, dict):
        rebuilt = dict()
        for key, value in js.items():
            # Key first, then value, matching the order of the callback calls.
            new_key = recurse(key)
            rebuilt[new_key] = recurse(value)
        return rebuilt
    if isinstance(js, (int, str)):
        return callback(js, *args, **kwargs)
    return js
32
+
33
+
34
def demo1():
    """Print the result of stripping the leading ``$`` from every string in a sample config.

    The sample uses placeholder hosts and passwords only; real credentials
    must never be committed in demo data.
    """
    d = {
        "env": "ppe",
        "mysql_connect": {
            "host": "$mysql_connect_host",
            "port": 3306,
            "user": "callbot",
            "password": "<mysql-password>",
            "database": "callbot_ppe",
            "charset": "utf8"
        },
        "es_connect": {
            "hosts": ["127.0.0.1"],
            "http_auth": ["elastic", "<elastic-password>"],
            "port": 9200
        }
    }

    def callback(s):
        # ``traverse`` also passes ints here; only "$"-prefixed strings change.
        if isinstance(s, str) and s.startswith('$'):
            return s[1:]
        return s

    result = traverse(d, callback=callback)
    print(result)
    return
60
+
61
+
62
+ if __name__ == '__main__':
63
+ demo1()
toolbox/os/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/os/command.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import os
4
+
5
+
6
class Command(object):
    """Run shell command lines, handling ``cd`` inside this process.

    Commands named in ``custom_command`` are dispatched to the classmethod of
    the same name instead of being sent to a child shell.
    """
    custom_command = [
        "cd"
    ]

    @staticmethod
    def _get_cmd(command):
        """Split a command line into ``(cmd, args)`` at the first space.

        Returns ``None`` when the command is empty or only whitespace.
        """
        command = str(command).strip()
        if command == "":
            return None
        cmd, _, args = command.partition(" ")
        return cmd, args

    @classmethod
    def popen(cls, command):
        """Run ``command`` and return its standard output as text.

        An empty or whitespace-only command returns ``""`` instead of failing.
        A custom command returns whatever its handler returns.
        """
        parsed = cls._get_cmd(command)
        if parsed is None:
            # Nothing to run; previously this crashed while unpacking None.
            return ""
        cmd, args = parsed
        if cmd in cls.custom_command:
            method = getattr(cls, cmd)
            return method(args)
        # The context manager closes the pipe even if read() raises.
        with os.popen(command) as resp:
            return resp.read()

    @classmethod
    def cd(cls, args):
        """Change this process's working directory to ``args``.

        Relative paths are resolved against the current directory.
        ``os.path.join`` discards the current directory when the target is
        absolute, so one call covers both cases.
        """
        # Extra spaces after "cd" end up in ``args``; strip them so
        # "cd  /tmp" is not mistaken for a relative path named " /tmp".
        target = args.strip()
        os.chdir(os.path.join(os.getcwd(), target))

    @classmethod
    def system(cls, command):
        """Run ``command`` with ``os.system`` and return its result."""
        return os.system(command)

    def __init__(self):
        pass
48
+
49
+
50
def ps_ef_grep(keyword: str):
    """Return the ``ps -ef`` output lines that contain ``keyword``.

    Lines containing ``grep`` are dropped, which removes the grep process
    itself from the result.
    """
    import shlex

    # Quote the keyword so spaces or shell metacharacters in it cannot alter
    # the command line that is executed.
    cmd = "ps -ef | grep {}".format(shlex.quote(keyword))
    output = Command.popen(cmd)
    return [
        row for row in str(output).split("\n")
        if keyword in row and "grep" not in row
    ]
56
+
57
+
58
+ if __name__ == "__main__":
59
+ pass
toolbox/os/environment.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import json
4
+ import os
5
+
6
+ from dotenv import load_dotenv
7
+ from dotenv.main import DotEnv
8
+
9
+ from toolbox.json.misc import traverse
10
+
11
+
12
class EnvironmentManager(object):
    """Reads typed settings from ``os.environ`` after passing ``<path>/<env>.env`` to ``load_dotenv``."""

    def __init__(self, path, env, override=False):
        """Remember the dotenv file name, load it, and start an empty lookup record."""
        self.filename = os.path.join(path, '{}.env'.format(env))
        load_dotenv(dotenv_path=self.filename, override=override)

        # Every key requested through ``get`` mapped to the value returned.
        self._environ = dict()

    def open_dotenv(self, filename: str = None):
        """Parse a dotenv file and return its contents as a dict.

        Falls back to the file chosen at construction when ``filename`` is
        not given.
        """
        return DotEnv(
            dotenv_path=filename or self.filename,
            stream=None,
            verbose=False,
            interpolate=False,
            override=False,
            encoding="utf-8",
        ).dict()

    def get(self, key, default=None, dtype=str):
        """Return ``dtype(os.environ[key])``, or ``default`` when the key is unset.

        ``default`` is returned as given (it is not passed through ``dtype``).
        The returned value is also recorded in ``self._environ``.
        """
        raw = os.environ.get(key)
        value = default if raw is None else dtype(raw)
        self._environ[key] = value
        return value
48
+
49
+
50
+ _DEFAULT_DTYPE_MAP = {
51
+ 'int': int,
52
+ 'float': float,
53
+ 'str': str,
54
+ 'json.loads': json.loads
55
+ }
56
+
57
+
58
class JsonConfig(object):
    """
    Resolve JSON values written as `$float:threshold`: look `threshold` up in
    the environment, then convert the result to `float`.
    """
    def __init__(self, dtype_map: dict = None, environment: "EnvironmentManager" = None):
        """Store the dtype-name-to-converter map and the environment to read from.

        ``dtype_map`` falls back to ``_DEFAULT_DTYPE_MAP`` and ``environment``
        to ``os.environ`` when not given.
        """
        self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP
        self.environment = environment or os.environ

    def sanitize_by_filename(self, filename: str):
        """Load a UTF-8 JSON file and return it with all placeholders resolved."""
        with open(filename, 'r', encoding='utf-8') as f:
            js = json.load(f)

        return self.sanitize_by_json(js)

    def sanitize_by_json(self, js):
        """Return ``js`` with every ``$dtype:key`` string replaced by its environment value."""
        js = traverse(
            js,
            callback=self.sanitize,
            environment=self.environment
        )
        return js

    def sanitize(self, string, environment):
        """Resolve one value; only ``$dtype:key`` strings are changed.

        Anything that is not a string starting with ``$`` is returned as is,
        and so is a ``$`` string without a colon (for example ``"$5.00"``).
        Only the first colon separates dtype from key, so the key itself may
        contain colons.

        Raises ``KeyError`` when the dtype name is not in ``dtype_map`` and
        ``AssertionError`` when the key is missing from ``environment``.
        """
        if not (isinstance(string, str) and string.startswith('$')):
            return string

        dtype_name, separator, key = string[1:].partition(':')
        if not separator:
            # Not in placeholder form; leave the literal untouched instead of
            # failing with an unpacking error.
            return string
        dtype = self.dtype_map[dtype_name]

        value = environment.get(key)
        if value is None:
            raise AssertionError('environment not exist. key: {}'.format(key))

        return dtype(value)
96
+
97
+
98
def demo1():
    """Manual check: load the ``dev`` dotenv file and print one JSON-decoded setting."""
    import json

    from project_settings import project_path

    dotenv_dir = os.path.join(project_path, 'server/callbot_server/dotenv')
    manager = EnvironmentManager(path=dotenv_dir, env='dev')

    print(manager.get(key='init_scenes', dtype=json.loads))
    # Show what the manager recorded for the lookup above.
    print(manager._environ)
111
+
112
+
113
+ if __name__ == '__main__':
114
+ demo1()
toolbox/os/other.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import inspect
3
+
4
+
5
def pwd():
    """Return the directory of the file that called this function.

    The caller's module ``__file__`` is used when available.  When the caller
    has no module or the module has no ``__file__`` (for example code run
    through ``exec``), the file name recorded on the caller's frame is used
    instead.
    """
    caller = inspect.stack()[1]
    module = inspect.getmodule(caller[0])
    # caller[1] is the frame's file name; fall back to it rather than failing
    # on a missing module or a module without __file__.
    source = getattr(module, "__file__", None) or caller[1]
    return os.path.dirname(os.path.abspath(source))
vad_go.go ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package main
2
+
3
+ import (
4
+ "flag"
5
+ "vad_go/dsp/audio"
6
+ "vad_go/dsp/streaming_vad"
7
+ "fmt"
8
+ "os"
9
+ "path"
10
+ )
11
+
12
+ func main () {
13
+ args := flag.String("filename", "", "input wav audio file")
14
+ flag.Parse()
15
+
16
+ var filename string = *args
17
+
18
+ pwd, _ := os.Getwd()
19
+ filename = path.Join(pwd, filename)
20
+ fmt.Println(filename)
21
+
22
+ var wavInfo audio.WavInfoType
23
+ err := wavInfo.ParseFromFile(filename)
24
+ if err != nil {
25
+ fmt.Println("wavInfo.ParseFromFile failed.")
26
+ }
27
+ fmt.Printf("sample rate: %d\n", wavInfo.Fmt.SampleRate)
28
+
29
+ float32Samples := wavInfo.GetFloat32Samples(0, 2)
30
+
31
+ var int16Samples []int16
32
+ var size uint32
33
+ var point int16
34
+ for i := 0; i < len(wavInfo.Data.Sample); i++ {
35
+ point = int16(float32Samples[i] * (1 << 15)) + 1
36
+ int16Samples = append(int16Samples, point)
37
+ }
38
+ size = uint32(len(int16Samples))
39
+
40
+ fmt.Printf("sample number: %d\n", size)
41
+
42
+ winSize := uint32(7000)
43
+ winStep := uint32(7000)
44
+ count := uint32(0)
45
+
46
+ sv := streaming_vad.StreamingVadType{}
47
+ sv.Init(8000, 0.4, 3.0)
48
+
49
+ var begin uint32
50
+ var end uint32
51
+ //var bufferSize uint32
52
+
53
+ for true {
54
+ begin = count * winStep
55
+ end = begin + winSize
56
+ //fmt.Println(end)
57
+
58
+ if begin >= size {
59
+ break
60
+ }
61
+
62
+ if end >= size {
63
+ end = size
64
+ }
65
+ buffer := int16Samples[begin:end]
66
+
67
+ //fmt.Printf("bufferSize: %d\n", bufferSize)
68
+ //fmt.Printf("buffer: %d\n", buffer)
69
+
70
+ count++
71
+
72
+ err = sv.ProcessSpeechByChunk(buffer)
73
+ if err != nil {
74
+ fmt.Println(err)
75
+ break
76
+ }
77
+ //if count > 3 {
78
+ // break
79
+ //}
80
+ }
81
+
82
+ fmt.Println(len(sv.VadEventMarkerDeque))
83
+ var marker streaming_vad.VadEventMarkerType
84
+ for i := 0; i < len(sv.VadEventMarkerDeque); i++ {
85
+ marker = sv.VadEventMarkerDeque[i]
86
+
87
+ fmt.Println(marker.Time)
88
+ fmt.Println(marker.VadFlag)
89
+
90
+ }
91
+ }