Spaces:
Running
Running
update
Browse files- .gitattributes +1 -0
- .gitignore +10 -0
- Dockerfile +19 -0
- build_vad_go.sh +5 -0
- data/examples/b07ae20f-247d-4e96-9c32-4ea27addcd79.wav +3 -0
- dsp/audio/wav.go +1010 -0
- dsp/streaming_vad/streaming_vad.go +656 -0
- go.mod +1 -0
- log.py +110 -0
- main.py +133 -0
- project_settings.py +25 -0
- requirements.txt +1 -0
- toolbox/__init__.py +5 -0
- toolbox/json/__init__.py +6 -0
- toolbox/json/misc.py +63 -0
- toolbox/os/__init__.py +6 -0
- toolbox/os/command.py +59 -0
- toolbox/os/environment.py +114 -0
- toolbox/os/other.py +9 -0
- vad_go.go +91 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
.git/
|
3 |
+
.idea/
|
4 |
+
|
5 |
+
data/
|
6 |
+
dotenv/
|
7 |
+
logs/
|
8 |
+
**/__pycache__/
|
9 |
+
|
10 |
+
**/*.wav
|
Dockerfile
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM golang:1.18

WORKDIR /data/GolandProjects/vad_go

COPY . /data/GolandProjects/vad_go

# Refresh the package index and install in the same layer so a cached,
# stale index can never be paired with a newer install step.
RUN apt-get update \
    && apt-get install -y python3-pip \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --upgrade pip
RUN pip install --no-cache-dir --upgrade -r /data/GolandProjects/vad_go/requirements.txt

# Compile the Go VAD binary that main.py invokes.
RUN bash build_vad_go.sh

USER root

RUN chmod -R 777 .

CMD ["python3", "main.py"]
|
build_vad_go.sh
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash
# Build the vad_go package into the ./build directory.
# Fail fast on any error so a broken build is never mistaken for success.
set -euo pipefail

# -p keeps the script re-runnable when ./build already exists.
mkdir -p build

go build -o build vad_go
|
data/examples/b07ae20f-247d-4e96-9c32-4ea27addcd79.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b59f9910c50776eb704ead2360fdf3a0330da3cf693073575a12c800f6316a9a
|
3 |
+
size 78284
|
dsp/audio/wav.go
ADDED
@@ -0,0 +1,1010 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package audio
|
2 |
+
|
3 |
+
import (
|
4 |
+
"bytes"
|
5 |
+
"encoding/binary"
|
6 |
+
"errors"
|
7 |
+
"fmt"
|
8 |
+
"io"
|
9 |
+
"math"
|
10 |
+
"os"
|
11 |
+
"path/filepath"
|
12 |
+
|
13 |
+
)
|
14 |
+
|
15 |
+
/*
|
16 |
+
-----RIFF-----
|
17 |
+
RIFF 4 标头字母
|
18 |
+
CHUNK_SIZE 4 整个RIFF文件的长度(不包含RIFF和CHUNK_SIZE这两个字段的长度)
|
19 |
+
FORMAT 4 格式,WAVE代表是wav文件,WAVE格式要求带有标头为fmt和data的子chunk
|
20 |
+
-----FMT -----
|
21 |
+
SUB_CHUNK_ID 4 子chunk的标头字母,此处为"fmt "(注意,fmt后面是带一个空格的),其相当于wav的属性字段
|
22 |
+
SUB_CHUNK_SIZE 4 此subchunk的长度(不包含SUB_CHUNK_ID和SUB_CHUNK_SIZE两个字段的长度)
|
23 |
+
AUDIO_FORMAT 2 音频格式,pcm为1
|
24 |
+
NUM_CHANNELS 2 声道数量,理论上可以n声道,一般我们用单声道mono,或者双声道stereo(双声道也叫立体声)
|
25 |
+
SAMPLE_RATE 4 采样率,每秒采样多少次,通常都有固定的采样选择(8000, 11025,12000,16000,22050,24000,32000,44100,48000)
|
26 |
+
BYTE_RATE 4 码率,即每秒播放多少byte数据,计算公式=SAMPLE_RATE*NUM_CHANNELS*BITS_PER_SAMPLE/8(不明白为何需要这个字段)
|
27 |
+
BLOCK_ALIGN 2 块对齐,其值=BITS_PER_SAMPLE*NUM_CHANNELS/8
|
28 |
+
BITS_PER_SAMPLE 2 每个采样多少bit,通常为8,16,32(为8时候代表的是uint8,16代表的是int16,32代表float32)
|
29 |
+
-----DATA-----
|
30 |
+
SUB_CHUNK_ID 4 子chunk的标头字母,此处为"data"
|
31 |
+
SUB_CHUNK_SIZE 4 此subchunk的长度(不包含SUB_CHUNK_ID和SUB_CHUNK_SIZE两个字段的长度)
|
32 |
+
DATA pcm的数据
|
33 |
+
-------------
|
34 |
+
*/
|
35 |
+
|
36 |
+
// MaxChannelNum is the maximum number of channels a sample can hold
// (only mono and stereo are supported here).
const MaxChannelNum = 2

// Indices of the left and right channel inside a sample.
const (
	LeftChannel  = 0
	RightChannel = 1
)

// tag is a four-byte RIFF chunk identifier.
type tag [4]byte

// Chunk identifiers recognised by this package.
var (
	tagRIFF = tag{'R', 'I', 'F', 'F'} // "RIFF"
	tagWAVE = tag{'W', 'A', 'V', 'E'} // "WAVE"
	tagFmt  = tag{'f', 'm', 't', ' '} // "fmt "
	tagData = tag{'d', 'a', 't', 'a'} // "data"
)

// WavHeaderType is the header shared by every chunk: a four-byte identifier
// followed by the chunk size (the size excludes these two header fields).
type WavHeaderType struct {
	ID   tag
	Size uint32
}

// String formats the chunk identifier and size for logging.
func (wavHeader *WavHeaderType) String() string {
	id := string(wavHeader.ID[:])
	return fmt.Sprintf("ID=%s,Size=%d", id, wavHeader.Size)
}

// sizeHeader is the encoded size in bytes of a chunk header.
var sizeHeader = binary.Size(WavHeaderType{})
|
70 |
+
|
71 |
+
//chunkLoc ...
|
72 |
+
type chunkLoc struct {
|
73 |
+
pos int64
|
74 |
+
size int64
|
75 |
+
}
|
76 |
+
|
77 |
+
//RiffType ...
|
78 |
+
type RiffType struct {
|
79 |
+
WavHeaderType
|
80 |
+
Fmt tag
|
81 |
+
}
|
82 |
+
|
83 |
+
//String ...
|
84 |
+
func (riff *RiffType) String() string {
|
85 |
+
return fmt.Sprintf("ID=%s,Size=%d,Fmt=%s", string(riff.ID[:]), riff.Size, string(riff.Fmt[:]))
|
86 |
+
}
|
87 |
+
|
88 |
+
//WavFmtType wav格式结构(头部)
|
89 |
+
type WavFmtType struct {
|
90 |
+
WavHeaderType
|
91 |
+
AudioFormat uint16
|
92 |
+
Channels uint16
|
93 |
+
SampleRate uint32
|
94 |
+
BytesPerSec uint32
|
95 |
+
BytesPerBlock uint16
|
96 |
+
BitsPerSample uint16
|
97 |
+
}
|
98 |
+
|
99 |
+
//String ...
|
100 |
+
func (wavFmt *WavFmtType) String() string {
|
101 |
+
return fmt.Sprintf(
|
102 |
+
"ID=%s,Size=%d,AudioFormat=%d,Channels=%d,SampleRate=%d,BytesPerSec=%d,BytesPerBlock=%d,BitsPerSample=%d",
|
103 |
+
string(wavFmt.ID[:]), wavFmt.Size, wavFmt.AudioFormat, wavFmt.Channels, wavFmt.SampleRate,
|
104 |
+
wavFmt.BytesPerSec, wavFmt.BytesPerBlock, wavFmt.BitsPerSample)
|
105 |
+
}
|
106 |
+
|
107 |
+
//SampleType 采样结构
|
108 |
+
type SampleType struct {
|
109 |
+
val8s [MaxChannelNum]uint8
|
110 |
+
val16s [MaxChannelNum]int16
|
111 |
+
val32s [MaxChannelNum]float32
|
112 |
+
}
|
113 |
+
|
114 |
+
//WavDataType wav整体结构(头部+采样数据结构)
|
115 |
+
type WavDataType struct {
|
116 |
+
WavHeaderType
|
117 |
+
Sample []SampleType
|
118 |
+
}
|
119 |
+
|
120 |
+
//String ...
|
121 |
+
func (wavData *WavDataType) String() string {
|
122 |
+
blockNum := len(wavData.Sample)
|
123 |
+
return fmt.Sprintf("ID=%s,Size=%d,BlockNum=%d", string(wavData.ID[:]), wavData.Size, blockNum)
|
124 |
+
}
|
125 |
+
|
126 |
+
//WavInfoType wav操作实例
|
127 |
+
type WavInfoType struct {
|
128 |
+
Riff RiffType
|
129 |
+
Fmt WavFmtType
|
130 |
+
Data WavDataType
|
131 |
+
|
132 |
+
//create info
|
133 |
+
createMs int64
|
134 |
+
}
|
135 |
+
|
136 |
+
//String ...
|
137 |
+
func (wavInfo *WavInfoType) String() string {
|
138 |
+
f := &wavInfo.Fmt
|
139 |
+
blockNum := len(wavInfo.Data.Sample)
|
140 |
+
return fmt.Sprintf("SampleRate=%d,BitsPerSample=%d,Channels=%d,BlockNum=%d",
|
141 |
+
f.SampleRate, f.BitsPerSample, f.Channels, blockNum)
|
142 |
+
|
143 |
+
}
|
144 |
+
|
145 |
+
//SetCreateTs ...
|
146 |
+
func (wavInfo *WavInfoType) SetCreateTs(timestampMs int64) {
|
147 |
+
wavInfo.createMs = timestampMs
|
148 |
+
}
|
149 |
+
|
150 |
+
//CopyFormat 复制头部结构
|
151 |
+
func (wavInfo *WavInfoType) CopyFormat(w *WavInfoType) (err error) {
|
152 |
+
wavInfo.Riff.ID = w.Riff.ID
|
153 |
+
wavInfo.Riff.Size = w.Riff.Size
|
154 |
+
wavInfo.Riff.Fmt = w.Riff.Fmt
|
155 |
+
wavInfo.Fmt.ID = w.Fmt.ID
|
156 |
+
wavInfo.Fmt.Size = w.Fmt.Size
|
157 |
+
wavInfo.Fmt.AudioFormat = w.Fmt.AudioFormat
|
158 |
+
wavInfo.Fmt.Channels = w.Fmt.Channels
|
159 |
+
wavInfo.Fmt.SampleRate = w.Fmt.SampleRate
|
160 |
+
wavInfo.Fmt.BytesPerSec = w.Fmt.BytesPerSec
|
161 |
+
wavInfo.Fmt.BytesPerBlock = w.Fmt.BytesPerBlock
|
162 |
+
wavInfo.Fmt.BitsPerSample = w.Fmt.BitsPerSample
|
163 |
+
wavInfo.Data.ID = w.Data.ID
|
164 |
+
wavInfo.Data.Size = w.Data.Size
|
165 |
+
return
|
166 |
+
}
|
167 |
+
|
168 |
+
//ParseFromFile 从文件中导入
|
169 |
+
func (wavInfo *WavInfoType) ParseFromFile(absFile string) (err error) {
|
170 |
+
absFile, err = filepath.Abs(absFile) //#nosec
|
171 |
+
if err != nil {
|
172 |
+
return err
|
173 |
+
}
|
174 |
+
fileHandler, err := os.Open(absFile) //#nosec
|
175 |
+
if err != nil {
|
176 |
+
return err
|
177 |
+
}
|
178 |
+
defer fileHandler.Close()
|
179 |
+
|
180 |
+
_, err = fileHandler.Seek(0, os.SEEK_SET)
|
181 |
+
if err != nil {
|
182 |
+
return err
|
183 |
+
}
|
184 |
+
|
185 |
+
var pos int64
|
186 |
+
var ch WavHeaderType
|
187 |
+
|
188 |
+
//-----------------------------------------------------
|
189 |
+
// RIFF header
|
190 |
+
err = binary.Read(fileHandler, binary.LittleEndian, &wavInfo.Riff)
|
191 |
+
if err != nil {
|
192 |
+
return err
|
193 |
+
}
|
194 |
+
pos += int64(sizeHeader) + int64(len(tagWAVE))
|
195 |
+
if wavInfo.Riff.ID != tagRIFF {
|
196 |
+
return errors.New("File Format Not Riff")
|
197 |
+
}
|
198 |
+
if wavInfo.Riff.Fmt != tagWAVE {
|
199 |
+
return errors.New("File Format Not Wave")
|
200 |
+
}
|
201 |
+
fileSize := int64(sizeHeader) + int64(wavInfo.Riff.Size)
|
202 |
+
_ = fileSize
|
203 |
+
|
204 |
+
//r := &wavInfo.Riff
|
205 |
+
|
206 |
+
//-----------------------------------------------------
|
207 |
+
// read all chunks
|
208 |
+
|
209 |
+
var chunks = make(map[tag]*chunkLoc)
|
210 |
+
|
211 |
+
for {
|
212 |
+
err = binary.Read(fileHandler, binary.LittleEndian, &ch)
|
213 |
+
if err != nil {
|
214 |
+
if err == io.EOF {
|
215 |
+
break
|
216 |
+
}
|
217 |
+
return err
|
218 |
+
}
|
219 |
+
pos += int64(sizeHeader)
|
220 |
+
|
221 |
+
loc := chunkLoc{
|
222 |
+
pos: pos,
|
223 |
+
size: int64(ch.Size),
|
224 |
+
}
|
225 |
+
|
226 |
+
_, err = fileHandler.Seek(loc.size, os.SEEK_CUR)
|
227 |
+
if err != nil {
|
228 |
+
return err
|
229 |
+
}
|
230 |
+
pos += loc.size // chunk data
|
231 |
+
|
232 |
+
chunks[ch.ID] = &loc
|
233 |
+
}
|
234 |
+
|
235 |
+
// check fileHandler size
|
236 |
+
if pos != fileSize {
|
237 |
+
return errors.New("pos != fileSize")
|
238 |
+
}
|
239 |
+
|
240 |
+
//-----------------------------------------------------
|
241 |
+
// chunk fmt_
|
242 |
+
loc, ok := chunks[tagFmt]
|
243 |
+
if !ok {
|
244 |
+
return errors.New("wav: has not chunk \"fmt \"")
|
245 |
+
}
|
246 |
+
_, err = fileHandler.Seek(loc.pos-int64(sizeHeader), os.SEEK_SET)
|
247 |
+
if err != nil {
|
248 |
+
return err
|
249 |
+
}
|
250 |
+
err = binary.Read(fileHandler, binary.LittleEndian, &wavInfo.Fmt)
|
251 |
+
if err != nil {
|
252 |
+
return err
|
253 |
+
}
|
254 |
+
|
255 |
+
//-----------------------------------------------------
|
256 |
+
// chunk data
|
257 |
+
loc, ok = chunks[tagData]
|
258 |
+
if !ok {
|
259 |
+
return errors.New("wav: has not chunk \"data\"")
|
260 |
+
}
|
261 |
+
_, err = fileHandler.Seek(loc.pos, os.SEEK_SET)
|
262 |
+
if err != nil {
|
263 |
+
return err
|
264 |
+
}
|
265 |
+
|
266 |
+
channel := wavInfo.Fmt.Channels
|
267 |
+
bytePerSample := wavInfo.Fmt.BitsPerSample / 8
|
268 |
+
blockSize := channel * bytePerSample
|
269 |
+
|
270 |
+
wavInfo.Data.ID = tagData
|
271 |
+
wavInfo.Data.Size = uint32(loc.size)
|
272 |
+
|
273 |
+
blockNum := wavInfo.Data.Size / uint32(blockSize)
|
274 |
+
wavInfo.Data.Sample = make([]SampleType, blockNum)
|
275 |
+
|
276 |
+
_, err = fileHandler.Seek(loc.pos, os.SEEK_SET)
|
277 |
+
if err != nil {
|
278 |
+
return err
|
279 |
+
}
|
280 |
+
blockIdx := 0
|
281 |
+
for i := 0; i < int(wavInfo.Data.Size); i += int(blockSize) {
|
282 |
+
sample := &wavInfo.Data.Sample[blockIdx]
|
283 |
+
blockIdx++
|
284 |
+
for c := 0; c < int(channel); c++ {
|
285 |
+
switch bytePerSample {
|
286 |
+
case 1:
|
287 |
+
var val uint8
|
288 |
+
err = binary.Read(fileHandler, binary.LittleEndian, &val)
|
289 |
+
//sample.val8s = append(sample.val8s,val)
|
290 |
+
sample.val8s[c] = val
|
291 |
+
case 2:
|
292 |
+
var val int16
|
293 |
+
err = binary.Read(fileHandler, binary.LittleEndian, &val)
|
294 |
+
//sample.val16s = append(sample.val16s,val)
|
295 |
+
sample.val16s[c] = val
|
296 |
+
//fmt.Printf("pos=%d,val=%d\n",i,sample.val16s)
|
297 |
+
case 4:
|
298 |
+
var val float32
|
299 |
+
err = binary.Read(fileHandler, binary.LittleEndian, &val)
|
300 |
+
//sample.val32s = append(sample.val32s,val)
|
301 |
+
sample.val32s[c] = val
|
302 |
+
|
303 |
+
}
|
304 |
+
if err != nil {
|
305 |
+
return err
|
306 |
+
}
|
307 |
+
}
|
308 |
+
//wavInfo.Data.Sample = append(wavInfo.Data.Sample,sample)
|
309 |
+
}
|
310 |
+
|
311 |
+
//for i:=0;i<len(wavInfo.Data.Sample);i++{
|
312 |
+
// sample := &wavInfo.Data.Sample[i]
|
313 |
+
// fmt.Printf("idx=%d,left=%d,right=%d\n",i,sample.val16s[0],sample.val16s[1])
|
314 |
+
//}
|
315 |
+
|
316 |
+
for id, chunk := range chunks {
|
317 |
+
if id != tagData && id != tagFmt {
|
318 |
+
//util.MainLogger.Error(fmt.Sprintf("unkonw chunk=%s,size=%d\n", string(id[:]), chunk.size))
|
319 |
+
fmt.Printf("unkonw chunk=%s,size=%d\n", string(id[:]), chunk.size)
|
320 |
+
}
|
321 |
+
}
|
322 |
+
return
|
323 |
+
}
|
324 |
+
|
325 |
+
//ParseFromBuffer 从buffer中导入
|
326 |
+
func (wavInfo *WavInfoType) ParseFromBuffer(buffer []byte) (err error) {
|
327 |
+
var ch WavHeaderType
|
328 |
+
|
329 |
+
bufferReader := bytes.NewBuffer(buffer)
|
330 |
+
//-----------------------------------------------------
|
331 |
+
// RIFF header
|
332 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Riff)
|
333 |
+
if err != nil {
|
334 |
+
return err
|
335 |
+
}
|
336 |
+
|
337 |
+
//pos := int64(sizeHeader) + int64(len(tagWAVE))
|
338 |
+
if wavInfo.Riff.ID != tagRIFF {
|
339 |
+
return errors.New("File Format Not Riff")
|
340 |
+
}
|
341 |
+
if wavInfo.Riff.Fmt != tagWAVE {
|
342 |
+
return errors.New("File Format Not Wave")
|
343 |
+
}
|
344 |
+
fileSize := int64(sizeHeader) + int64(wavInfo.Riff.Size)
|
345 |
+
_ = fileSize
|
346 |
+
|
347 |
+
//r := &wavInfo.Riff
|
348 |
+
|
349 |
+
//-----------------------------------------------------
|
350 |
+
// read all chunks
|
351 |
+
for {
|
352 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &ch)
|
353 |
+
if err != nil {
|
354 |
+
if err == io.EOF {
|
355 |
+
//文件读取结束
|
356 |
+
err = nil
|
357 |
+
return
|
358 |
+
}
|
359 |
+
return err
|
360 |
+
}
|
361 |
+
|
362 |
+
//fmt格式的chunk
|
363 |
+
if ch.ID == tagFmt {
|
364 |
+
wavInfo.Fmt.WavHeaderType = ch
|
365 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.AudioFormat)
|
366 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.Channels)
|
367 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.SampleRate)
|
368 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.BytesPerSec)
|
369 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.BytesPerBlock)
|
370 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Fmt.BitsPerSample)
|
371 |
+
if err != nil {
|
372 |
+
return err
|
373 |
+
}
|
374 |
+
continue
|
375 |
+
}
|
376 |
+
|
377 |
+
//data格式的chunk
|
378 |
+
if ch.ID == tagData {
|
379 |
+
channel := wavInfo.Fmt.Channels
|
380 |
+
bytePerSample := wavInfo.Fmt.BitsPerSample / 8
|
381 |
+
blockSize := channel * bytePerSample
|
382 |
+
|
383 |
+
wavInfo.Data.ID = tagData
|
384 |
+
wavInfo.Data.Size = ch.Size
|
385 |
+
|
386 |
+
blockNum := wavInfo.Data.Size / uint32(blockSize)
|
387 |
+
wavInfo.Data.Sample = make([]SampleType, blockNum)
|
388 |
+
|
389 |
+
blockIdx := 0
|
390 |
+
for i := 0; i < int(wavInfo.Data.Size); i += int(blockSize) {
|
391 |
+
sample := &wavInfo.Data.Sample[blockIdx]
|
392 |
+
blockIdx++
|
393 |
+
for c := 0; c < int(channel); c++ {
|
394 |
+
switch bytePerSample {
|
395 |
+
case 1:
|
396 |
+
var val uint8
|
397 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &val)
|
398 |
+
//sample.val8s = append(sample.val8s,val)
|
399 |
+
sample.val8s[c] = val
|
400 |
+
case 2:
|
401 |
+
var val int16
|
402 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &val)
|
403 |
+
//sample.val16s = append(sample.val16s,val)
|
404 |
+
sample.val16s[c] = val
|
405 |
+
//fmt.Printf("pos=%d,val=%d\n",i,sample.val16s)
|
406 |
+
case 4:
|
407 |
+
var val float32
|
408 |
+
err = binary.Read(bufferReader, binary.LittleEndian, &val)
|
409 |
+
//sample.val32s = append(sample.val32s,val)
|
410 |
+
sample.val32s[c] = val
|
411 |
+
|
412 |
+
}
|
413 |
+
if err != nil {
|
414 |
+
return err
|
415 |
+
}
|
416 |
+
}
|
417 |
+
//wavInfo.Data.Sample = append(wavInfo.Data.Sample,sample)
|
418 |
+
}
|
419 |
+
continue
|
420 |
+
}
|
421 |
+
|
422 |
+
//其他格式的chunk
|
423 |
+
byteData := make([]byte, ch.Size)
|
424 |
+
err = binary.Read(bufferReader, binary.LittleEndian, byteData)
|
425 |
+
}
|
426 |
+
return
|
427 |
+
}
|
428 |
+
|
429 |
+
//SaveToFile 保存到文件中
|
430 |
+
func (wavInfo *WavInfoType) SaveToFile(absFile string) (err error) {
|
431 |
+
file, err := os.Create(absFile)
|
432 |
+
if err != nil {
|
433 |
+
return
|
434 |
+
}
|
435 |
+
defer file.Close()
|
436 |
+
_, err = file.Seek(0, os.SEEK_SET)
|
437 |
+
if err != nil {
|
438 |
+
return
|
439 |
+
}
|
440 |
+
|
441 |
+
//calc size
|
442 |
+
wavInfo.Data.Size = (uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample) / 8) *
|
443 |
+
uint32(len(wavInfo.Data.Sample))
|
444 |
+
wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
|
445 |
+
wavInfo.Riff.Size = uint32(len(wavInfo.Riff.Fmt)) +
|
446 |
+
uint32(sizeHeader) +
|
447 |
+
wavInfo.Fmt.Size +
|
448 |
+
uint32(sizeHeader) +
|
449 |
+
wavInfo.Data.Size
|
450 |
+
|
451 |
+
//----------------------------------------
|
452 |
+
// RIFF header
|
453 |
+
err = binary.Write(file, binary.LittleEndian, wavInfo.Riff)
|
454 |
+
if err != nil {
|
455 |
+
return err
|
456 |
+
}
|
457 |
+
|
458 |
+
//----------------------------------------
|
459 |
+
// chunk fmt_
|
460 |
+
wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
|
461 |
+
err = binary.Write(file, binary.LittleEndian, wavInfo.Fmt)
|
462 |
+
if err != nil {
|
463 |
+
return err
|
464 |
+
}
|
465 |
+
|
466 |
+
//----------------------------------------
|
467 |
+
// chunk data
|
468 |
+
ch := WavHeaderType{
|
469 |
+
ID: tagData,
|
470 |
+
Size: wavInfo.Data.Size,
|
471 |
+
}
|
472 |
+
err = binary.Write(file, binary.LittleEndian, ch)
|
473 |
+
if err != nil {
|
474 |
+
return err
|
475 |
+
}
|
476 |
+
|
477 |
+
channel := wavInfo.Fmt.Channels
|
478 |
+
bytePerSample := wavInfo.Fmt.BitsPerSample / 8
|
479 |
+
//blockSize := channel*bytePerSample
|
480 |
+
|
481 |
+
for i := 0; i < len(wavInfo.Data.Sample); i++ {
|
482 |
+
sample := &wavInfo.Data.Sample[i]
|
483 |
+
switch bytePerSample {
|
484 |
+
case 1:
|
485 |
+
for c := 0; c < int(channel); c++ {
|
486 |
+
err = binary.Write(file, binary.LittleEndian, sample.val8s[c])
|
487 |
+
if err != nil {
|
488 |
+
return
|
489 |
+
}
|
490 |
+
|
491 |
+
}
|
492 |
+
case 2:
|
493 |
+
for c := 0; c < int(channel); c++ {
|
494 |
+
err = binary.Write(file, binary.LittleEndian, sample.val16s[c])
|
495 |
+
if err != nil {
|
496 |
+
return
|
497 |
+
}
|
498 |
+
}
|
499 |
+
case 4:
|
500 |
+
for c := 0; c < int(channel); c++ {
|
501 |
+
err = binary.Write(file, binary.LittleEndian, sample.val32s[c])
|
502 |
+
if err != nil {
|
503 |
+
return
|
504 |
+
}
|
505 |
+
}
|
506 |
+
}
|
507 |
+
}
|
508 |
+
return
|
509 |
+
}
|
510 |
+
|
511 |
+
//SaveToBuffer 保存到buffer中
|
512 |
+
func (wavInfo *WavInfoType) SaveToBuffer() (buffer []byte, err error) {
|
513 |
+
|
514 |
+
//var bufferWriter bytes.Buffer
|
515 |
+
buffer = make([]byte, 0)
|
516 |
+
bufferWriter := bytes.NewBuffer(buffer)
|
517 |
+
|
518 |
+
//calc size
|
519 |
+
wavInfo.Data.Size = (uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample) / 8) *
|
520 |
+
uint32(len(wavInfo.Data.Sample))
|
521 |
+
wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
|
522 |
+
wavInfo.Riff.Size = uint32(len(wavInfo.Riff.Fmt)) +
|
523 |
+
uint32(sizeHeader) +
|
524 |
+
wavInfo.Fmt.Size +
|
525 |
+
uint32(sizeHeader) +
|
526 |
+
wavInfo.Data.Size
|
527 |
+
|
528 |
+
//----------------------------------------
|
529 |
+
// RIFF header
|
530 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Riff)
|
531 |
+
if err != nil {
|
532 |
+
return
|
533 |
+
}
|
534 |
+
|
535 |
+
//----------------------------------------
|
536 |
+
// chunk fmt_
|
537 |
+
wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
|
538 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Fmt)
|
539 |
+
if err != nil {
|
540 |
+
return
|
541 |
+
}
|
542 |
+
|
543 |
+
//----------------------------------------
|
544 |
+
// chunk data
|
545 |
+
ch := WavHeaderType{
|
546 |
+
ID: tagData,
|
547 |
+
Size: wavInfo.Data.Size,
|
548 |
+
}
|
549 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, ch)
|
550 |
+
if err != nil {
|
551 |
+
return
|
552 |
+
}
|
553 |
+
|
554 |
+
channel := wavInfo.Fmt.Channels
|
555 |
+
bytePerSample := wavInfo.Fmt.BitsPerSample / 8
|
556 |
+
//blockSize := channel*bytePerSample
|
557 |
+
|
558 |
+
for i := 0; i < len(wavInfo.Data.Sample); i++ {
|
559 |
+
sample := &wavInfo.Data.Sample[i]
|
560 |
+
switch bytePerSample {
|
561 |
+
case 1:
|
562 |
+
for c := 0; c < int(channel); c++ {
|
563 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, sample.val8s[c])
|
564 |
+
if err != nil {
|
565 |
+
return
|
566 |
+
}
|
567 |
+
}
|
568 |
+
case 2:
|
569 |
+
for c := 0; c < int(channel); c++ {
|
570 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, sample.val16s[c])
|
571 |
+
if err != nil {
|
572 |
+
return
|
573 |
+
}
|
574 |
+
}
|
575 |
+
case 4:
|
576 |
+
for c := 0; c < int(channel); c++ {
|
577 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, sample.val32s[c])
|
578 |
+
if err != nil {
|
579 |
+
return
|
580 |
+
}
|
581 |
+
}
|
582 |
+
}
|
583 |
+
}
|
584 |
+
buffer = bufferWriter.Bytes()
|
585 |
+
return
|
586 |
+
}
|
587 |
+
|
588 |
+
//SaveToBuffer 保存到buffer中
|
589 |
+
func (wavInfo *WavInfoType) SaveToBufferWithChannel(voiceChannel int) (buffer []byte, err error) {
|
590 |
+
|
591 |
+
//var bufferWriter bytes.Buffer
|
592 |
+
buffer = make([]byte, 0)
|
593 |
+
bufferWriter := bytes.NewBuffer(buffer)
|
594 |
+
|
595 |
+
//calc size
|
596 |
+
channel := wavInfo.Fmt.Channels
|
597 |
+
wavInfo.Fmt.Channels = 1
|
598 |
+
wavInfo.Data.Size = (uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample) / 8) *
|
599 |
+
uint32(len(wavInfo.Data.Sample))
|
600 |
+
wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
|
601 |
+
wavInfo.Riff.Size = uint32(len(wavInfo.Riff.Fmt)) +
|
602 |
+
uint32(sizeHeader) +
|
603 |
+
wavInfo.Fmt.Size +
|
604 |
+
uint32(sizeHeader) +
|
605 |
+
wavInfo.Data.Size
|
606 |
+
|
607 |
+
//----------------------------------------
|
608 |
+
// RIFF header
|
609 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Riff)
|
610 |
+
if err != nil {
|
611 |
+
return
|
612 |
+
}
|
613 |
+
|
614 |
+
//----------------------------------------
|
615 |
+
// chunk fmt_
|
616 |
+
wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
|
617 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Fmt)
|
618 |
+
if err != nil {
|
619 |
+
return
|
620 |
+
}
|
621 |
+
|
622 |
+
//----------------------------------------
|
623 |
+
// chunk data
|
624 |
+
ch := WavHeaderType{
|
625 |
+
ID: tagData,
|
626 |
+
Size: wavInfo.Data.Size,
|
627 |
+
}
|
628 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, ch)
|
629 |
+
if err != nil {
|
630 |
+
return
|
631 |
+
}
|
632 |
+
|
633 |
+
|
634 |
+
bytePerSample := wavInfo.Fmt.BitsPerSample / 8
|
635 |
+
//blockSize := channel*bytePerSample
|
636 |
+
|
637 |
+
for i := 0; i < len(wavInfo.Data.Sample); i++ {
|
638 |
+
sample := &wavInfo.Data.Sample[i]
|
639 |
+
switch bytePerSample {
|
640 |
+
case 1:
|
641 |
+
for c := 0; c < int(channel); c++ {
|
642 |
+
if voiceChannel == c {
|
643 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, sample.val8s[c])
|
644 |
+
if err != nil {
|
645 |
+
return
|
646 |
+
}
|
647 |
+
}
|
648 |
+
}
|
649 |
+
case 2:
|
650 |
+
for c := 0; c < int(channel); c++ {
|
651 |
+
if voiceChannel == c {
|
652 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, sample.val16s[c])
|
653 |
+
if err != nil {
|
654 |
+
return
|
655 |
+
}
|
656 |
+
}
|
657 |
+
}
|
658 |
+
case 4:
|
659 |
+
for c := 0; c < int(channel); c++ {
|
660 |
+
if voiceChannel == c {
|
661 |
+
err = binary.Write(bufferWriter, binary.LittleEndian, sample.val32s[c])
|
662 |
+
if err != nil {
|
663 |
+
return
|
664 |
+
}
|
665 |
+
}
|
666 |
+
}
|
667 |
+
}
|
668 |
+
}
|
669 |
+
buffer = bufferWriter.Bytes()
|
670 |
+
wavInfo.Fmt.Channels = 2
|
671 |
+
return
|
672 |
+
}
|
673 |
+
|
674 |
+
//SaveToFile 保存到文件中
|
675 |
+
func (wavInfo *WavInfoType) SaveToFileWithChannel(absFile string, voiceChannel int) (err error) {
|
676 |
+
file, err := os.Create(absFile)
|
677 |
+
if err != nil {
|
678 |
+
return
|
679 |
+
}
|
680 |
+
defer file.Close()
|
681 |
+
_, err = file.Seek(0, io.SeekStart)
|
682 |
+
if err != nil {
|
683 |
+
return
|
684 |
+
}
|
685 |
+
|
686 |
+
//calc size
|
687 |
+
channel := wavInfo.Fmt.Channels
|
688 |
+
wavInfo.Fmt.Channels = 1
|
689 |
+
wavInfo.Data.Size = (uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample) / 8) *
|
690 |
+
uint32(len(wavInfo.Data.Sample))
|
691 |
+
wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
|
692 |
+
wavInfo.Riff.Size = uint32(len(wavInfo.Riff.Fmt)) +
|
693 |
+
uint32(sizeHeader) +
|
694 |
+
wavInfo.Fmt.Size +
|
695 |
+
uint32(sizeHeader) +
|
696 |
+
wavInfo.Data.Size
|
697 |
+
|
698 |
+
//----------------------------------------
|
699 |
+
// RIFF header
|
700 |
+
err = binary.Write(file, binary.LittleEndian, wavInfo.Riff)
|
701 |
+
if err != nil {
|
702 |
+
return err
|
703 |
+
}
|
704 |
+
|
705 |
+
//----------------------------------------
|
706 |
+
// chunk fmt_
|
707 |
+
wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
|
708 |
+
err = binary.Write(file, binary.LittleEndian, wavInfo.Fmt)
|
709 |
+
if err != nil {
|
710 |
+
return err
|
711 |
+
}
|
712 |
+
|
713 |
+
//----------------------------------------
|
714 |
+
// chunk data
|
715 |
+
ch := WavHeaderType{
|
716 |
+
ID: tagData,
|
717 |
+
Size: wavInfo.Data.Size,
|
718 |
+
}
|
719 |
+
err = binary.Write(file, binary.LittleEndian, ch)
|
720 |
+
if err != nil {
|
721 |
+
return err
|
722 |
+
}
|
723 |
+
|
724 |
+
bytePerSample := wavInfo.Fmt.BitsPerSample / 8
|
725 |
+
//blockSize := channel*bytePerSample
|
726 |
+
|
727 |
+
for i := 0; i < len(wavInfo.Data.Sample); i++ {
|
728 |
+
sample := &wavInfo.Data.Sample[i]
|
729 |
+
switch bytePerSample {
|
730 |
+
case 1:
|
731 |
+
for c := 0; c < int(channel); c++ {
|
732 |
+
if voiceChannel == c {
|
733 |
+
err = binary.Write(file, binary.LittleEndian, sample.val8s[c])
|
734 |
+
if err != nil {
|
735 |
+
return
|
736 |
+
}
|
737 |
+
}
|
738 |
+
}
|
739 |
+
case 2:
|
740 |
+
for c := 0; c < int(channel); c++ {
|
741 |
+
if voiceChannel == c {
|
742 |
+
err = binary.Write(file, binary.LittleEndian, sample.val16s[c])
|
743 |
+
if err != nil {
|
744 |
+
return
|
745 |
+
}
|
746 |
+
}
|
747 |
+
}
|
748 |
+
case 4:
|
749 |
+
for c := 0; c < int(channel); c++ {
|
750 |
+
if voiceChannel == c {
|
751 |
+
err = binary.Write(file, binary.LittleEndian, sample.val32s[c])
|
752 |
+
if err != nil {
|
753 |
+
return
|
754 |
+
}
|
755 |
+
}
|
756 |
+
}
|
757 |
+
}
|
758 |
+
}
|
759 |
+
wavInfo.Fmt.Channels = 2
|
760 |
+
return
|
761 |
+
}
|
762 |
+
|
763 |
+
//AdjusterVolume 调整音量
|
764 |
+
func (wavInfo *WavInfoType) AdjusterVolume(rateAsRaw float32) {
|
765 |
+
channel := wavInfo.Fmt.Channels
|
766 |
+
bytePerSample := wavInfo.Fmt.BitsPerSample / 8
|
767 |
+
//blockSize := channel*bytePerSample
|
768 |
+
|
769 |
+
const MaxUint8 = math.MaxUint8
|
770 |
+
const MinUint8 = 0
|
771 |
+
const MaxInt16 = math.MaxInt16
|
772 |
+
const MinInt16 = math.MinInt16
|
773 |
+
|
774 |
+
for i := 0; i < len(wavInfo.Data.Sample); i++ {
|
775 |
+
switch bytePerSample {
|
776 |
+
case 1:
|
777 |
+
for c := 0; c < int(channel); c++ {
|
778 |
+
val := wavInfo.Data.Sample[i].val8s[c]
|
779 |
+
clip := float64(val) * float64(rateAsRaw)
|
780 |
+
if clip < MinUint8 {
|
781 |
+
clip = MinUint8
|
782 |
+
}
|
783 |
+
if clip > MaxUint8 {
|
784 |
+
clip = MaxUint8
|
785 |
+
}
|
786 |
+
//wavInfo.Data.Sample[i].val8s[c] = uint8((float32(val) * rateAsRaw))
|
787 |
+
wavInfo.Data.Sample[i].val8s[c] = uint8(clip)
|
788 |
+
}
|
789 |
+
case 2:
|
790 |
+
for c := 0; c < int(channel); c++ {
|
791 |
+
//val := wavInfo.Data.Sample[i].val16s[c]
|
792 |
+
//wavInfo.Data.Sample[i].val16s[c] = int16((float32(val) * rateAsRaw))
|
793 |
+
val := wavInfo.Data.Sample[i].val16s[c]
|
794 |
+
clip := float64(val) * float64(rateAsRaw)
|
795 |
+
if clip < MinInt16 {
|
796 |
+
clip = MinInt16
|
797 |
+
}
|
798 |
+
if clip > MaxInt16 {
|
799 |
+
clip = MaxInt16
|
800 |
+
}
|
801 |
+
wavInfo.Data.Sample[i].val16s[c] = int16(clip)
|
802 |
+
}
|
803 |
+
case 4:
|
804 |
+
for c := 0; c < int(channel); c++ {
|
805 |
+
val := wavInfo.Data.Sample[i].val32s[c]
|
806 |
+
wavInfo.Data.Sample[i].val32s[c] = val * rateAsRaw
|
807 |
+
}
|
808 |
+
}
|
809 |
+
}
|
810 |
+
}
|
811 |
+
|
812 |
+
//Resample 重置采样率
|
813 |
+
func (wavInfo *WavInfoType) Resample(resampleRate uint32) {
|
814 |
+
sampleRate := wavInfo.Fmt.SampleRate
|
815 |
+
rate := float64(sampleRate) / float64(resampleRate)
|
816 |
+
channel := wavInfo.Fmt.Channels
|
817 |
+
bytePerSample := wavInfo.Fmt.BitsPerSample / 8
|
818 |
+
rawLen := len(wavInfo.Data.Sample)
|
819 |
+
|
820 |
+
resampleData := make([]SampleType, 0)
|
821 |
+
|
822 |
+
resampleIdx := 0
|
823 |
+
for {
|
824 |
+
rawIdx := int(float64(resampleIdx) * rate)
|
825 |
+
if rawIdx < rawLen {
|
826 |
+
sample := SampleType{}
|
827 |
+
for c := 0; c < int(channel); c++ {
|
828 |
+
switch bytePerSample {
|
829 |
+
case 1:
|
830 |
+
val := wavInfo.Data.Sample[rawIdx].val8s[c]
|
831 |
+
//sample.val8s = append(sample.val8s,val)
|
832 |
+
sample.val8s[c] = val
|
833 |
+
case 2:
|
834 |
+
val := wavInfo.Data.Sample[rawIdx].val16s[c]
|
835 |
+
//sample.val16s = append(sample.val16s,val)
|
836 |
+
sample.val16s[c] = val
|
837 |
+
case 4:
|
838 |
+
val := wavInfo.Data.Sample[rawIdx].val32s[c]
|
839 |
+
//sample.val32s = append(sample.val32s,val)
|
840 |
+
sample.val32s[c] = val
|
841 |
+
}
|
842 |
+
}
|
843 |
+
resampleData = append(resampleData, sample)
|
844 |
+
} else {
|
845 |
+
break
|
846 |
+
}
|
847 |
+
resampleIdx++
|
848 |
+
}
|
849 |
+
|
850 |
+
wavInfo.Data.Sample = resampleData
|
851 |
+
wavInfo.Fmt.SampleRate = resampleRate
|
852 |
+
|
853 |
+
}
|
854 |
+
|
855 |
+
//ConvertToFloat32 将采样值转换到 0 到 1 之间
|
856 |
+
func (wavInfo *WavInfoType) GetFloat32Samples(channel int, bytePerSample int) []float32 {
|
857 |
+
//fmt.Println(wavInfo.cha)
|
858 |
+
|
859 |
+
var floatSamples []float32
|
860 |
+
|
861 |
+
var point float32
|
862 |
+
for i := 0; i < len(wavInfo.Data.Sample); i++ {
|
863 |
+
sample := &wavInfo.Data.Sample[i]
|
864 |
+
switch bytePerSample {
|
865 |
+
case 1:
|
866 |
+
point = float32(sample.val8s[channel]) / (1 << 8)
|
867 |
+
case 2:
|
868 |
+
point = float32(sample.val16s[channel]) / (1 << 15)
|
869 |
+
case 4:
|
870 |
+
point = sample.val32s[channel]
|
871 |
+
}
|
872 |
+
floatSamples = append(floatSamples, point)
|
873 |
+
}
|
874 |
+
return floatSamples
|
875 |
+
}
|
876 |
+
|
877 |
+
//Trim 切头切尾
|
878 |
+
func (wavInfo *WavInfoType) Trim(dbPercent float32) {
|
879 |
+
channel := wavInfo.Fmt.Channels
|
880 |
+
bytePerSample := wavInfo.Fmt.BitsPerSample / 8
|
881 |
+
//blockSize := channel*bytePerSample
|
882 |
+
|
883 |
+
const MaxUint8 = math.MaxUint8
|
884 |
+
const MinUint8 = 0
|
885 |
+
const MaxInt16 = math.MaxInt16
|
886 |
+
const MinInt16 = math.MinInt16
|
887 |
+
|
888 |
+
//trim head
|
889 |
+
silenceHeadIdx := 0 //头部静音截止位置
|
890 |
+
done := false
|
891 |
+
for i := 0; i < len(wavInfo.Data.Sample); i++ {
|
892 |
+
switch bytePerSample {
|
893 |
+
case 1:
|
894 |
+
for c := 0; c < int(channel); c++ {
|
895 |
+
val := wavInfo.Data.Sample[i].val8s[c]
|
896 |
+
if float32(val) > MaxUint8*dbPercent {
|
897 |
+
silenceHeadIdx = i
|
898 |
+
done = true
|
899 |
+
break
|
900 |
+
}
|
901 |
+
}
|
902 |
+
case 2:
|
903 |
+
for c := 0; c < int(channel); c++ {
|
904 |
+
val := wavInfo.Data.Sample[i].val16s[c]
|
905 |
+
if float32(val) > MaxInt16*dbPercent {
|
906 |
+
silenceHeadIdx = i
|
907 |
+
done = true
|
908 |
+
break
|
909 |
+
}
|
910 |
+
if float32(val) < MinInt16*dbPercent {
|
911 |
+
silenceHeadIdx = i
|
912 |
+
done = true
|
913 |
+
break
|
914 |
+
}
|
915 |
+
}
|
916 |
+
case 4:
|
917 |
+
for c := 0; c < int(channel); c++ {
|
918 |
+
|
919 |
+
val := wavInfo.Data.Sample[i].val32s[c]
|
920 |
+
if float32(val) > math.MaxFloat32*dbPercent {
|
921 |
+
silenceHeadIdx = i
|
922 |
+
done = true
|
923 |
+
break
|
924 |
+
}
|
925 |
+
if val < 0 && (-val > math.MaxFloat32*dbPercent) {
|
926 |
+
silenceHeadIdx = i
|
927 |
+
done = true
|
928 |
+
break
|
929 |
+
}
|
930 |
+
}
|
931 |
+
}
|
932 |
+
|
933 |
+
if done {
|
934 |
+
break
|
935 |
+
}
|
936 |
+
}
|
937 |
+
|
938 |
+
//trim tail,截断尾部
|
939 |
+
silenceTailIdx := len(wavInfo.Data.Sample) - 1 //尾部静音截止位置
|
940 |
+
done = false
|
941 |
+
for i := len(wavInfo.Data.Sample) - 1; i >= 0; i-- {
|
942 |
+
switch bytePerSample {
|
943 |
+
case 1:
|
944 |
+
for c := 0; c < int(channel); c++ {
|
945 |
+
val := wavInfo.Data.Sample[i].val8s[c]
|
946 |
+
if float32(val) > MaxUint8*dbPercent {
|
947 |
+
silenceTailIdx = i
|
948 |
+
done = true
|
949 |
+
break
|
950 |
+
}
|
951 |
+
}
|
952 |
+
case 2:
|
953 |
+
for c := 0; c < int(channel); c++ {
|
954 |
+
val := wavInfo.Data.Sample[i].val16s[c]
|
955 |
+
if val >= 0 && float32(val) > float32(MaxInt16*dbPercent) {
|
956 |
+
silenceTailIdx = i
|
957 |
+
done = true
|
958 |
+
break
|
959 |
+
}
|
960 |
+
if val < 0 && float32(val) < float32(MinInt16*dbPercent) {
|
961 |
+
silenceTailIdx = i
|
962 |
+
done = true
|
963 |
+
break
|
964 |
+
}
|
965 |
+
}
|
966 |
+
case 4:
|
967 |
+
for c := 0; c < int(channel); c++ {
|
968 |
+
|
969 |
+
val := wavInfo.Data.Sample[i].val32s[c]
|
970 |
+
if float32(val) > math.MaxFloat32*dbPercent {
|
971 |
+
silenceTailIdx = i
|
972 |
+
done = true
|
973 |
+
break
|
974 |
+
}
|
975 |
+
if val < 0 && (-val > math.MaxFloat32*dbPercent) {
|
976 |
+
silenceTailIdx = i
|
977 |
+
done = true
|
978 |
+
break
|
979 |
+
}
|
980 |
+
}
|
981 |
+
}
|
982 |
+
|
983 |
+
if done {
|
984 |
+
break
|
985 |
+
}
|
986 |
+
}
|
987 |
+
|
988 |
+
wavInfo.Data.Sample = wavInfo.Data.Sample[:silenceTailIdx]
|
989 |
+
wavInfo.Data.Sample = wavInfo.Data.Sample[silenceHeadIdx:]
|
990 |
+
}
|
991 |
+
|
992 |
+
func (wavInfo *WavInfoType) TrimFirstWithTime(milliseconds int64) error{
|
993 |
+
sampleRate := wavInfo.Fmt.SampleRate
|
994 |
+
sizeToTrim := int64(sampleRate) * milliseconds / 1000
|
995 |
+
if int(sizeToTrim) >= len(wavInfo.Data.Sample) {
|
996 |
+
return errors.New("check time err")
|
997 |
+
}
|
998 |
+
wavInfo.Data.Sample = wavInfo.Data.Sample[sizeToTrim:]
|
999 |
+
return nil
|
1000 |
+
}
|
1001 |
+
|
1002 |
+
func (wavInfo *WavInfoType) GetWavTime() int {
|
1003 |
+
return int(math.Ceil(float64(len(wavInfo.Data.Sample)) / float64(wavInfo.Fmt.SampleRate)))
|
1004 |
+
}
|
1005 |
+
|
1006 |
+
//NewWavInfo 新建一个wav操作实例
|
1007 |
+
func NewWavInfo() *WavInfoType {
|
1008 |
+
w := &WavInfoType{}
|
1009 |
+
return w
|
1010 |
+
}
|
dsp/streaming_vad/streaming_vad.go
ADDED
@@ -0,0 +1,656 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package streaming_vad
|
2 |
+
|
3 |
+
import (
|
4 |
+
"fmt"
|
5 |
+
"math"
|
6 |
+
)
|
7 |
+
|
8 |
+
// Frame-level flags stored in FrameDecisionType.thisFrameFlag and
// FrameDecisionType.lastFrameFlag. SpeechFrameProcess moves thisFrameFlag
// through Pre -> Start -> Present -> End -> Pre.
const (
	FrameFlagSpeechPre     = iota // No speech detected yet; initial state.
	FrameFlagSpeechStart          // Enough active frames seen to suspect speech has started.
	FrameFlagSpeechPresent        // Speech start confirmed; speech is ongoing.
	FrameFlagSpeechEnd            // Activity dropped below the level needed to maintain speech.
	FrameFlagSpeechPost           // Not referenced by the code visible in this file section.
)
|
15 |
+
|
16 |
+
// Utterance-level VAD states, used as StreamingVadType.thisDetectedState and
// as VadEventMarkerType.VadFlag.
const (
	VadFlagPrepare  = "VadFlagPrepare"  // Preparing (lead-in just before speech).
	VadFlagSpeaking = "VadFlagSpeaking" // Speaking.
	VadFlagPause    = "VadFlagPause"    // Short pause (comma-like).
	VadFlagNoSpeech = "VadFlagNoSpeech" // Long pause (full-stop-like); no speech.
	VadFlagUnknown  = "VadFlagUnknown"  // Unknown state.
)
|
23 |
+
|
24 |
+
|
25 |
+
// ParametersForFdType holds the tunable parameters that FrameDecisionType.Init
// copies into a FrameDecisionType. Init on this type fills in the defaults.
type ParametersForFdType struct {
	SampleRate   uint32  // Sample rate of the audio.
	Threshold    float32 // Initial RMS threshold above which a frame counts as active.
	MinThreshold float32 // Lower bound the adaptive threshold is clamped to.

	FrameLengthInSecond                     float32 // Duration of one frame; used to convert times into frame counts.
	StartRejectUpdateNoiseLevelTimeInSecond float32 // Initial period during which the noise level is bootstrapped from the frame RMS.
	StartRejectSpeechTimeInSecond           float32 // Initial period during which every frame is forced to "not speech".

	SpeechStartWindowLengthInSecond      float32 // Window length needed to detect the start of speech.
	SpeechPresentWindowLengthInSecond    float32 // Window length needed to detect ongoing speech activity.
	SpeechEndConfirmWindowLengthInSecond float32 // Window length needed to confirm the end of speech.

	// Amount of active time required inside the corresponding window for each
	// state transition made by SpeechFrameProcess.
	SpeechStartRequiredLengthInSecond           float32
	SpeechStartConfirmRequiredLengthInSecond    float32
	SpeechPresentMaintainRequiredLengthInSecond float32
	SpeechEndConfirmRequiredLengthInSecond      float32
}
|
43 |
+
|
44 |
+
func (pd *ParametersForFdType) Init () {
|
45 |
+
pd.SampleRate = 8000
|
46 |
+
pd.Threshold = 150.0
|
47 |
+
pd.MinThreshold = 50.0
|
48 |
+
|
49 |
+
pd.FrameLengthInSecond = 0.01
|
50 |
+
|
51 |
+
//Start Reject
|
52 |
+
pd.StartRejectUpdateNoiseLevelTimeInSecond = 0.2
|
53 |
+
pd.StartRejectSpeechTimeInSecond = 0.25
|
54 |
+
|
55 |
+
//Window Length
|
56 |
+
pd.SpeechStartWindowLengthInSecond = 0.15
|
57 |
+
pd.SpeechPresentWindowLengthInSecond = 0.4
|
58 |
+
pd.SpeechEndConfirmWindowLengthInSecond = 0.15
|
59 |
+
|
60 |
+
//Required Length
|
61 |
+
pd.SpeechStartRequiredLengthInSecond = 0.09
|
62 |
+
pd.SpeechStartConfirmRequiredLengthInSecond = 0.075
|
63 |
+
pd.SpeechPresentMaintainRequiredLengthInSecond = 0.1
|
64 |
+
pd.SpeechEndConfirmRequiredLengthInSecond = 0.12
|
65 |
+
}
|
66 |
+
|
67 |
+
// DecisionStateType is one entry of the per-frame decision history kept by
// FrameDecisionType.
type DecisionStateType struct {
	decisionFlag      bool   // Whether the frame was judged active (speech).
	timeInMilliSecond uint32 // Start time of the frame, in milliseconds.
}
|
71 |
+
|
72 |
+
// FrameDecisionType is the frame-level detector: it labels each audio frame
// with a FrameFlag* value based on the frame's RMS energy, an adaptive
// threshold, and a ring buffer of recent per-frame decisions.
type FrameDecisionType struct {
	params ParametersForFdType // Parameters passed to Init; used by ProcessStart to reset the threshold.

	sampleRate   uint32
	threshold    float32 // Current adaptive RMS threshold.
	minThreshold float32 // Floor applied to threshold.
	adaptFactor  float32 // Smoothed RMS level used when adapting threshold during speech.

	// Duration of one frame, in seconds.
	frameLengthInSecond float32

	noiseLevelValue                         float32 // Running estimate of the background noise RMS.
	startRejectUpdateNoiseLevelTimeInSecond float32
	startRejectUpdateNoiseLevelFrameNumber  uint32 // The time above expressed as a number of frames.
	startRejectSpeechTimeInSecond           float32
	startRejectSpeechTimeInMilliSecond      uint32 // The time above in milliseconds (rounded).

	speechStartWindowLengthInSecond      float32 // Window length needed to detect the start of speech.
	speechPresentWindowLengthInSecond    float32 // Window length needed to detect ongoing speech activity.
	speechEndConfirmWindowLengthInSecond float32 // Window length needed to confirm the end of speech.

	speechStartRequiredLengthInSecond           float32
	speechStartConfirmRequiredLengthInSecond    float32
	speechPresentMaintainRequiredLengthInSecond float32
	speechEndConfirmRequiredLengthInSecond      float32

	decisionStateDeque      []DecisionStateType // Ring buffer of recent frame decisions.
	decisionStateDequeSize  uint32              // Number of slots in decisionStateDeque.
	decisionStateDequeIndex uint32              // Slot the next decision will be written to.

	processedFramesNumber uint32 // Frames processed since Init or the last threshold reset.
	lastFrameFlag         int    // FrameFlag* value of the previous frame.
	thisFrameFlag         int    // FrameFlag* value of the current frame.
}
|
107 |
+
|
108 |
+
func (fd *FrameDecisionType) Init (params ParametersForFdType) {
|
109 |
+
fd.params = params
|
110 |
+
|
111 |
+
fd.sampleRate = params.SampleRate
|
112 |
+
fd.threshold = params.Threshold
|
113 |
+
fd.minThreshold = params.MinThreshold
|
114 |
+
fd.adaptFactor = fd.threshold
|
115 |
+
|
116 |
+
fd.frameLengthInSecond = params.FrameLengthInSecond
|
117 |
+
|
118 |
+
fd.noiseLevelValue = fd.threshold / 2.0
|
119 |
+
fd.startRejectUpdateNoiseLevelTimeInSecond = params.StartRejectUpdateNoiseLevelTimeInSecond
|
120 |
+
fd.startRejectUpdateNoiseLevelFrameNumber = uint32(fd.startRejectUpdateNoiseLevelTimeInSecond / fd.frameLengthInSecond)
|
121 |
+
|
122 |
+
fd.startRejectSpeechTimeInSecond = params.StartRejectSpeechTimeInSecond
|
123 |
+
fd.startRejectSpeechTimeInMilliSecond = uint32(fd.startRejectSpeechTimeInSecond * 1e3 + 0.5)
|
124 |
+
fd.speechStartWindowLengthInSecond = params.SpeechStartWindowLengthInSecond
|
125 |
+
fd.speechPresentWindowLengthInSecond = params.SpeechPresentWindowLengthInSecond
|
126 |
+
fd.speechEndConfirmWindowLengthInSecond = params.SpeechEndConfirmWindowLengthInSecond
|
127 |
+
|
128 |
+
fd.speechStartRequiredLengthInSecond = params.SpeechStartRequiredLengthInSecond
|
129 |
+
fd.speechStartConfirmRequiredLengthInSecond = params.SpeechStartConfirmRequiredLengthInSecond
|
130 |
+
fd.speechPresentMaintainRequiredLengthInSecond = params.SpeechPresentMaintainRequiredLengthInSecond
|
131 |
+
fd.speechEndConfirmRequiredLengthInSecond = params.SpeechEndConfirmRequiredLengthInSecond
|
132 |
+
|
133 |
+
//initialize: decisionStateDeque, decisionStateDequeSize, decisionStateDequeIndex
|
134 |
+
largestWindowLengthInSecond := fd.speechStartWindowLengthInSecond
|
135 |
+
if largestWindowLengthInSecond < fd.speechPresentWindowLengthInSecond {
|
136 |
+
largestWindowLengthInSecond = fd.speechPresentWindowLengthInSecond
|
137 |
+
}
|
138 |
+
if largestWindowLengthInSecond < fd.speechEndConfirmWindowLengthInSecond {
|
139 |
+
largestWindowLengthInSecond = fd.speechEndConfirmWindowLengthInSecond
|
140 |
+
}
|
141 |
+
decisionStateDequeSize := uint32(largestWindowLengthInSecond / fd.frameLengthInSecond + 0.5)
|
142 |
+
fd.RefreshDecisionStateDeque(decisionStateDequeSize)
|
143 |
+
|
144 |
+
fd.processedFramesNumber = 0
|
145 |
+
fd.lastFrameFlag = FrameFlagSpeechPre
|
146 |
+
fd.thisFrameFlag = FrameFlagSpeechPre
|
147 |
+
|
148 |
+
}
|
149 |
+
|
150 |
+
/*
|
151 |
+
ProcessStart 当连续语音太长被强制截断时, 就需要有一个方法来重置状态
|
152 |
+
*/
|
153 |
+
func (fd *FrameDecisionType) ProcessStart(resetThreshold bool) {
|
154 |
+
fd.RefreshDecisionStateDeque(fd.decisionStateDequeSize)
|
155 |
+
|
156 |
+
if resetThreshold {
|
157 |
+
fd.threshold = fd.params.Threshold
|
158 |
+
fd.adaptFactor = fd.threshold
|
159 |
+
fd.noiseLevelValue = fd.threshold / 2.0
|
160 |
+
fd.processedFramesNumber = 0
|
161 |
+
}
|
162 |
+
|
163 |
+
fd.lastFrameFlag = FrameFlagSpeechPre
|
164 |
+
fd.thisFrameFlag = FrameFlagSpeechPre
|
165 |
+
}
|
166 |
+
|
167 |
+
func (fd *FrameDecisionType) RefreshDecisionStateDeque(decisionStateDequeSize uint32) {
|
168 |
+
fd.decisionStateDeque = make([]DecisionStateType, decisionStateDequeSize)
|
169 |
+
fd.decisionStateDequeSize = decisionStateDequeSize
|
170 |
+
fd.decisionStateDequeIndex = 0
|
171 |
+
}
|
172 |
+
|
173 |
+
func (fd *FrameDecisionType) UpdateDecisionState (frameStartTimeInMilliSecond uint32, decisionFlag bool) {
|
174 |
+
fd.decisionStateDeque[fd.decisionStateDequeIndex].decisionFlag = decisionFlag
|
175 |
+
fd.decisionStateDeque[fd.decisionStateDequeIndex].timeInMilliSecond = frameStartTimeInMilliSecond
|
176 |
+
fd.decisionStateDequeIndex = (fd.decisionStateDequeIndex + 1) % fd.decisionStateDequeSize
|
177 |
+
}
|
178 |
+
|
179 |
+
func (fd *FrameDecisionType) SumDecisionTrue (durationInSecond float32) (activeDurationInSecond float32) {
|
180 |
+
if len(fd.decisionStateDeque) == 0 {
|
181 |
+
return 0.0
|
182 |
+
}
|
183 |
+
|
184 |
+
indexTemp := int64(fd.decisionStateDequeIndex) - 1
|
185 |
+
if indexTemp < 0 {
|
186 |
+
indexTemp = int64(fd.decisionStateDequeSize) - 1
|
187 |
+
}
|
188 |
+
|
189 |
+
decisionFlag := fd.decisionStateDeque[indexTemp].decisionFlag
|
190 |
+
endInMilliSecond := int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond)
|
191 |
+
beginInMilliSecond := endInMilliSecond - int64(durationInSecond * 1e3)
|
192 |
+
if beginInMilliSecond < 0 {
|
193 |
+
beginInMilliSecond = 0
|
194 |
+
}
|
195 |
+
|
196 |
+
var timeSum uint32 = 0
|
197 |
+
for i := uint32(1); i < fd.decisionStateDequeSize; i++ {
|
198 |
+
if fd.decisionStateDeque[indexTemp].timeInMilliSecond < uint32(beginInMilliSecond) {
|
199 |
+
break
|
200 |
+
}
|
201 |
+
indexTemp--
|
202 |
+
if indexTemp < 0 {
|
203 |
+
indexTemp = int64(fd.decisionStateDequeSize) - 1
|
204 |
+
}
|
205 |
+
if decisionFlag {
|
206 |
+
timeSum += uint32(endInMilliSecond) - fd.decisionStateDeque[indexTemp].timeInMilliSecond
|
207 |
+
}
|
208 |
+
decisionFlag = fd.decisionStateDeque[indexTemp].decisionFlag
|
209 |
+
endInMilliSecond = int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond)
|
210 |
+
}
|
211 |
+
|
212 |
+
activeDurationInSecond = float32(timeSum) * 1e-3
|
213 |
+
return activeDurationInSecond
|
214 |
+
}
|
215 |
+
|
216 |
+
/*
|
217 |
+
SpeechFrameProcess 处理一帧音频, 给该帧音频配一个标签.
|
218 |
+
*/
|
219 |
+
func (fd *FrameDecisionType) SpeechFrameProcess (frameStartTimeInMilliSecond uint32, buffer []int16) {
|
220 |
+
bufferSize := uint32(len(buffer))
|
221 |
+
|
222 |
+
/**************************Calculate the RMS***************************/
|
223 |
+
sumTemp := int64(0)
|
224 |
+
ssqTemp := int64(0)
|
225 |
+
for i := uint32(0); i < bufferSize; i++ {
|
226 |
+
sumTemp = sumTemp + int64(buffer[i])
|
227 |
+
ssqTemp = ssqTemp + int64(buffer[i]) * int64(buffer[i])
|
228 |
+
}
|
229 |
+
|
230 |
+
sum := float64(sumTemp)
|
231 |
+
sum /= float64(bufferSize)
|
232 |
+
|
233 |
+
ssq := float64(ssqTemp)
|
234 |
+
rms := float32(math.Sqrt((ssq / float64(bufferSize)) - (sum * sum)))
|
235 |
+
|
236 |
+
//fmt.Printf("rms %f\n", rms)
|
237 |
+
/**********************************************************************/
|
238 |
+
var decisionFlag bool
|
239 |
+
if frameStartTimeInMilliSecond < fd.startRejectSpeechTimeInMilliSecond {
|
240 |
+
decisionFlag = false
|
241 |
+
} else {
|
242 |
+
decisionFlag = rms > fd.threshold && rms > 400
|
243 |
+
}
|
244 |
+
//fmt.Printf("decisionFlag %t\n", decisionFlag)
|
245 |
+
|
246 |
+
fd.UpdateDecisionState(frameStartTimeInMilliSecond, decisionFlag)
|
247 |
+
|
248 |
+
if fd.thisFrameFlag == FrameFlagSpeechPre {
|
249 |
+
if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartRequiredLengthInSecond {
|
250 |
+
fd.thisFrameFlag = FrameFlagSpeechStart
|
251 |
+
}
|
252 |
+
} else if fd.thisFrameFlag == FrameFlagSpeechStart {
|
253 |
+
if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartConfirmRequiredLengthInSecond {
|
254 |
+
fd.thisFrameFlag = FrameFlagSpeechPresent
|
255 |
+
} else {
|
256 |
+
//TODO: 感觉这一部分是不会触发的吧.
|
257 |
+
if fd.speechStartConfirmRequiredLengthInSecond != 0 {
|
258 |
+
fd.thisFrameFlag = FrameFlagSpeechPre
|
259 |
+
}
|
260 |
+
}
|
261 |
+
} else if fd.thisFrameFlag == FrameFlagSpeechPresent {
|
262 |
+
if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) < fd.speechPresentMaintainRequiredLengthInSecond {
|
263 |
+
fd.thisFrameFlag = FrameFlagSpeechEnd
|
264 |
+
}
|
265 |
+
} else if fd.thisFrameFlag == FrameFlagSpeechEnd {
|
266 |
+
if fd.SumDecisionTrue(fd.speechEndConfirmWindowLengthInSecond) <= fd.speechEndConfirmRequiredLengthInSecond {
|
267 |
+
fd.thisFrameFlag = FrameFlagSpeechPre
|
268 |
+
} else if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) >= fd.speechPresentMaintainRequiredLengthInSecond {
|
269 |
+
//fd.thisFrameFlag = FrameFlagSpeechPre
|
270 |
+
//我感觉这里的条件判断应该是 < 而不是 >=.
|
271 |
+
//有可能他是想在这里添加一个短暂的停顿,用于添加逗号.
|
272 |
+
fd.thisFrameFlag = FrameFlagSpeechPre
|
273 |
+
}
|
274 |
+
}
|
275 |
+
|
276 |
+
//
|
277 |
+
if fd.thisFrameFlag == FrameFlagSpeechPre && !decisionFlag {
|
278 |
+
fd.threshold = (0.02 * rms * 2) + (0.98 * fd.threshold)
|
279 |
+
fd.adaptFactor = fd.threshold
|
280 |
+
} else if decisionFlag && fd.thisFrameFlag == FrameFlagSpeechPresent {
|
281 |
+
if rms < fd.adaptFactor {
|
282 |
+
fd.adaptFactor = 0.01 * rms + 0.99 * fd.adaptFactor
|
283 |
+
} else {
|
284 |
+
fd.adaptFactor = 0.05 * rms + 0.95 * fd.adaptFactor
|
285 |
+
}
|
286 |
+
|
287 |
+
thresholdTemp := fd.noiseLevelValue + 0.3 * fd.adaptFactor
|
288 |
+
fd.threshold = (0.1 * thresholdTemp) + 0.9 * fd.threshold
|
289 |
+
}
|
290 |
+
|
291 |
+
//
|
292 |
+
if fd.threshold < fd.minThreshold {
|
293 |
+
fd.threshold = fd.minThreshold
|
294 |
+
}
|
295 |
+
|
296 |
+
// Update the Threshold
|
297 |
+
if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber {
|
298 |
+
alphaAdapt := float32(fd.processedFramesNumber) / float32(fd.startRejectUpdateNoiseLevelFrameNumber)
|
299 |
+
fd.noiseLevelValue = (alphaAdapt * fd.noiseLevelValue) + ((1 - alphaAdapt) * rms)
|
300 |
+
} else {
|
301 |
+
if rms > fd.noiseLevelValue {
|
302 |
+
fd.noiseLevelValue = (0.001 * rms) + (0.999 * fd.noiseLevelValue)
|
303 |
+
} else {
|
304 |
+
fd.noiseLevelValue = (0.05 * rms) + (0.95 * fd.noiseLevelValue)
|
305 |
+
}
|
306 |
+
}
|
307 |
+
|
308 |
+
if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber {
|
309 |
+
if fd.noiseLevelValue > 400 {
|
310 |
+
fd.noiseLevelValue = fd.noiseLevelValue * 0.1
|
311 |
+
}
|
312 |
+
fd.threshold = fd.noiseLevelValue * 2
|
313 |
+
|
314 |
+
if fd.threshold < fd.minThreshold {
|
315 |
+
fd.threshold = fd.minThreshold
|
316 |
+
}
|
317 |
+
}
|
318 |
+
|
319 |
+
fd.processedFramesNumber++
|
320 |
+
}
|
321 |
+
|
322 |
+
// VadEventMarkerType is one VAD event appended to
// StreamingVadType.VadEventMarkerDeque: a state plus the time it applies to.
type VadEventMarkerType struct {
	VadFlag string // One of the VadFlag* constants.
	Time    uint32 // Event time, in milliseconds.
}
|
326 |
+
|
327 |
+
// StreamingVadType is a streaming voice-activity detector. Audio is fed in
// chunks, split into fixed-length frames, labelled by frameDecision, and the
// resulting events are appended to VadEventMarkerDeque.
type StreamingVadType struct {
	sampleRate           uint32  // Sample rate.
	silenceTime          float32 // Silence duration required to decide that speech has ended.
	timeout              float32 // Maximum length of a single utterance; longer speech activity is forcibly ended.
	timeoutInMilliSecond uint32

	// VAD runs frame by frame. Each time audio is received, the part that does
	// not fill a whole frame is saved and prepended to the next chunk.
	frameLength         uint32  // Length of each frame.
	unfinishedFrame     []int16 // Leftover samples of the incomplete frame.
	unfinishedFrameSize uint32  // Length of the leftover.

	frameDecision FrameDecisionType

	// Timing limits, in milliseconds.
	startRejectSpeechTimeInMilliSecond uint32

	allowedSilenceTimeInSpeechInMilliSecond           uint32
	allowedLongestSpeechDurationInMilliSecond         uint32
	minDurationOfLongSpeechInMilliSecond              uint32
	endOfLongSpeechRequiredSilenceTimeInMilliSecond   uint32
	endOfNormalSpeechRequiredSilenceTimeInMilliSecond uint32
	minDurationOfSpeechToAddCommaInMilliSecond        uint32 // where to add comma if speech

	// Speech is only known to have started once the VadFlagSpeaking label is
	// detected. At that point the position prepareDurationInMilliSecond
	// earlier is defined as the VadFlagPrepare position.
	prepareDurationInMilliSecond uint32

	// When the end of speech is detected it is not declared immediately;
	// it is confirmed only after SpeechEndConfirmWindowLengthInSecond of
	// silence. The end-of-speech event is placed nonSpeechPadInInMilliSecond
	// after the point where the speech ended.
	nonSpeechPadInInMilliSecond uint32

	speechFrameGlobalTimeInMilliSecond uint32

	speechDetectedStartTimeInMilliSecond uint32
	speechDetectedStartTimeIsValid       bool
	speechDetectedEndTimeInMilliSecond   uint32
	speechDetectedEndTimeIsValid         bool
	speechDetectedEndTimeIsValidPossible bool
	speechDetectedStartAndEnd            bool

	// Utterance-level state and emitted events.
	lastVadEndTimeInMilliSecond uint32
	thisDetectedState           string // VadFlag
	VadEventMarkerDeque         []VadEventMarkerType
}
|
373 |
+
|
374 |
+
/*
|
375 |
+
silenceTime: 0.4
|
376 |
+
timeout: 3.0
|
377 |
+
|
378 |
+
以下条件应满足:
|
379 |
+
minDurationOfSpeechToAddCommaInMilliSecond < endOfNormalSpeechRequiredSilenceTimeInMilliSecond
|
380 |
+
endOfNormalSpeechRequiredSilenceTimeInMilliSecond < endOfLongSpeechRequiredSilenceTimeInMilliSecond
|
381 |
+
|
382 |
+
*/
|
383 |
+
func (sv *StreamingVadType) Init (sampleRate uint32, silenceTime float32, timeout float32) {
|
384 |
+
sv.sampleRate = sampleRate
|
385 |
+
sv.timeout = timeout
|
386 |
+
sv.timeoutInMilliSecond = uint32(timeout * 1e3)
|
387 |
+
|
388 |
+
sv.frameLength = uint32(0.02 * float32(sampleRate))
|
389 |
+
sv.unfinishedFrameSize = 0
|
390 |
+
|
391 |
+
var params ParametersForFdType
|
392 |
+
params.Init()
|
393 |
+
params.SampleRate = sampleRate
|
394 |
+
sv.frameDecision.Init(params)
|
395 |
+
|
396 |
+
//
|
397 |
+
sv.startRejectSpeechTimeInMilliSecond = uint32(0.7 * 1e3)
|
398 |
+
sv.allowedSilenceTimeInSpeechInMilliSecond = uint32(0.2 * 1e3)
|
399 |
+
sv.minDurationOfLongSpeechInMilliSecond = 0
|
400 |
+
sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond = 0
|
401 |
+
sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond = uint32(silenceTime * 1e3)
|
402 |
+
sv.minDurationOfSpeechToAddCommaInMilliSecond = uint32(0.3 * 1e3)
|
403 |
+
sv.allowedLongestSpeechDurationInMilliSecond = 0
|
404 |
+
|
405 |
+
sv.prepareDurationInMilliSecond = uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3) * 2
|
406 |
+
sv.nonSpeechPadInInMilliSecond = uint32(silenceTime * 1e3 * 0.5)
|
407 |
+
sv.speechFrameGlobalTimeInMilliSecond = 0
|
408 |
+
|
409 |
+
//
|
410 |
+
sv.speechDetectedStartTimeInMilliSecond = 0
|
411 |
+
sv.speechDetectedStartTimeIsValid = false
|
412 |
+
sv.speechDetectedEndTimeInMilliSecond = 0
|
413 |
+
sv.speechDetectedEndTimeIsValid = false
|
414 |
+
sv.speechDetectedEndTimeIsValidPossible = false
|
415 |
+
sv.speechDetectedStartAndEnd = false
|
416 |
+
|
417 |
+
//
|
418 |
+
sv.lastVadEndTimeInMilliSecond = 0
|
419 |
+
sv.thisDetectedState = VadFlagNoSpeech
|
420 |
+
fmt.Println("do StreamingVad Init...")
|
421 |
+
}
|
422 |
+
|
423 |
+
|
424 |
+
func (sv *StreamingVadType) ProcessSpeechByChunk(buffer []int16) (err error) {
|
425 |
+
var validBuffer []int16
|
426 |
+
|
427 |
+
//unfinished frame
|
428 |
+
unfinishedFrameSize := uint32(len(sv.unfinishedFrame))
|
429 |
+
for i := uint32(0); i < unfinishedFrameSize; i++ {
|
430 |
+
validBuffer = append(validBuffer, sv.unfinishedFrame[i])
|
431 |
+
}
|
432 |
+
|
433 |
+
//buffer
|
434 |
+
bufferSize := uint32(len(buffer))
|
435 |
+
for i := uint32(0); i < bufferSize; i++ {
|
436 |
+
validBuffer = append(validBuffer, buffer[i])
|
437 |
+
}
|
438 |
+
|
439 |
+
//remainder
|
440 |
+
remainderSize := uint32(len(validBuffer)) % sv.frameLength
|
441 |
+
boundary := uint32(len(validBuffer)) - remainderSize
|
442 |
+
sv.unfinishedFrame = validBuffer[boundary:]
|
443 |
+
validBuffer = validBuffer[:boundary]
|
444 |
+
if uint32(len(validBuffer)) > sv.frameLength {
|
445 |
+
sv.ProcessSpeech(validBuffer)
|
446 |
+
}
|
447 |
+
return nil
|
448 |
+
}
|
449 |
+
|
450 |
+
|
451 |
+
//ProcessSpeechByChunk 需要将 buffer 更新成指定 frameLength 的倍数, 多余的部分保存起来以供下次使用.
|
452 |
+
func (sv *StreamingVadType) DeprecatedProcessSpeechByChunk(buffer []int16) (err error) {
|
453 |
+
bufferSize := uint32(len(buffer))
|
454 |
+
|
455 |
+
var validBuffer []int16
|
456 |
+
var unfinishedFrame []int16
|
457 |
+
var point int16
|
458 |
+
validSize := (uint32(len(buffer)) + sv.unfinishedFrameSize) / sv.frameLength * sv.frameLength
|
459 |
+
if validSize >= sv.frameLength {
|
460 |
+
if sv.unfinishedFrameSize != 0 {
|
461 |
+
for i := uint32(0); i < sv.unfinishedFrameSize; i++ {
|
462 |
+
point = sv.unfinishedFrame[i]
|
463 |
+
validBuffer = append(validBuffer, point)
|
464 |
+
}
|
465 |
+
for i := uint32(0); i < validSize - sv.unfinishedFrameSize; i++ {
|
466 |
+
point = buffer[i]
|
467 |
+
validBuffer = append(validBuffer, point)
|
468 |
+
}
|
469 |
+
} else {
|
470 |
+
for i := uint32(0); i < validSize; i++ {
|
471 |
+
point = buffer[i]
|
472 |
+
validBuffer = append(validBuffer, point)
|
473 |
+
}
|
474 |
+
}
|
475 |
+
sv.ProcessSpeech(validBuffer)
|
476 |
+
}
|
477 |
+
|
478 |
+
//fmt.Printf("validBuffer size: %d\n", len(validBuffer))
|
479 |
+
//fmt.Printf("validSize: %d\n", validSize)
|
480 |
+
//fmt.Printf("last unfinishedFrameSize: %d\n", sv.unfinishedFrameSize)
|
481 |
+
|
482 |
+
sv.unfinishedFrameSize = (bufferSize + sv.unfinishedFrameSize) - validSize
|
483 |
+
begin := bufferSize - sv.unfinishedFrameSize - 1
|
484 |
+
for i := uint32(0); i < sv.unfinishedFrameSize; i++ {
|
485 |
+
point = buffer[begin + i]
|
486 |
+
unfinishedFrame = append(unfinishedFrame, point)
|
487 |
+
}
|
488 |
+
sv.unfinishedFrame = unfinishedFrame
|
489 |
+
|
490 |
+
fmt.Println("do StreamingVad ProcessSpeechByChunk...")
|
491 |
+
return nil
|
492 |
+
}
|
493 |
+
|
494 |
+
//ProcessSpeech 根据上一帧的语音标签和当前帧的语音标签来判断VAD状态.
|
495 |
+
func (sv *StreamingVadType) ProcessSpeech(buffer []int16) {
|
496 |
+
bufferLength := uint32(len(buffer))
|
497 |
+
if bufferLength % sv.frameLength != 0 {
|
498 |
+
panic(fmt.Sprintf("bufferLength (%d) should be a multiple of B frameLength (%d)", bufferLength, sv.frameLength))
|
499 |
+
}
|
500 |
+
|
501 |
+
var frameBuffer []int16
|
502 |
+
for begin := uint32(0); begin + sv.frameLength <= bufferLength; {
|
503 |
+
frameBuffer = buffer[begin: begin + sv.frameLength]
|
504 |
+
sv.frameDecision.SpeechFrameProcess(sv.speechFrameGlobalTimeInMilliSecond, frameBuffer)
|
505 |
+
begin += sv.frameLength
|
506 |
+
|
507 |
+
if sv.frameDecision.lastFrameFlag == FrameFlagSpeechStart && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPresent {
|
508 |
+
if sv.thisDetectedState == VadFlagNoSpeech {
|
509 |
+
//start
|
510 |
+
var prepareTime uint32 = 0
|
511 |
+
if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
|
512 |
+
prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
|
513 |
+
}
|
514 |
+
if prepareTime < sv.lastVadEndTimeInMilliSecond {
|
515 |
+
prepareTime = sv.lastVadEndTimeInMilliSecond
|
516 |
+
}
|
517 |
+
vadEventMarker := VadEventMarkerType{
|
518 |
+
VadFlag: VadFlagPrepare,
|
519 |
+
Time: prepareTime,
|
520 |
+
}
|
521 |
+
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
|
522 |
+
|
523 |
+
sv.thisDetectedState = VadFlagSpeaking
|
524 |
+
|
525 |
+
vadEventMarker = VadEventMarkerType{
|
526 |
+
VadFlag: VadFlagSpeaking,
|
527 |
+
Time: sv.speechFrameGlobalTimeInMilliSecond,
|
528 |
+
}
|
529 |
+
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
|
530 |
+
|
531 |
+
//
|
532 |
+
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
|
533 |
+
sv.speechDetectedEndTimeIsValid = false
|
534 |
+
//sv.speechDetectedEndTimeIsValidPossible = false
|
535 |
+
sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3)
|
536 |
+
sv.speechDetectedStartTimeIsValid = true
|
537 |
+
} else if sv.thisDetectedState == VadFlagSpeaking && sv.speechDetectedEndTimeIsValid &&
|
538 |
+
sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond > sv.minDurationOfSpeechToAddCommaInMilliSecond &&
|
539 |
+
sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond < sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond {
|
540 |
+
|
541 |
+
//pause
|
542 |
+
vadEventMarker := VadEventMarkerType{
|
543 |
+
VadFlag: VadFlagPause,
|
544 |
+
Time: sv.speechDetectedEndTimeInMilliSecond,
|
545 |
+
}
|
546 |
+
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
|
547 |
+
|
548 |
+
sv.thisDetectedState = VadFlagSpeaking
|
549 |
+
|
550 |
+
vadEventMarker = VadEventMarkerType{
|
551 |
+
VadFlag: VadFlagSpeaking,
|
552 |
+
Time: sv.speechFrameGlobalTimeInMilliSecond,
|
553 |
+
}
|
554 |
+
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
|
555 |
+
|
556 |
+
//
|
557 |
+
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
|
558 |
+
sv.speechDetectedEndTimeIsValid = false
|
559 |
+
//sv.speechDetectedEndTimeIsValidPossible = false
|
560 |
+
} else if sv.thisDetectedState == VadFlagSpeaking &&
|
561 |
+
sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond <= sv.minDurationOfSpeechToAddCommaInMilliSecond {
|
562 |
+
//
|
563 |
+
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
|
564 |
+
sv.speechDetectedEndTimeIsValid = false
|
565 |
+
//sv.speechDetectedEndTimeIsValidPossible = false
|
566 |
+
} else {}
|
567 |
+
}
|
568 |
+
|
569 |
+
//end
|
570 |
+
if sv.frameDecision.lastFrameFlag == FrameFlagSpeechEnd && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPre {
|
571 |
+
sv.speechDetectedEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
|
572 |
+
sv.speechDetectedEndTimeIsValid = true
|
573 |
+
//sv.speechDetectedEndTimeIsValidPossible = true
|
574 |
+
}
|
575 |
+
|
576 |
+
//只在开始一定时间后, 才能检测到 Vad 结束.
|
577 |
+
if sv.speechFrameGlobalTimeInMilliSecond > sv.startRejectSpeechTimeInMilliSecond {
|
578 |
+
if sv.speechDetectedEndTimeIsValid {
|
579 |
+
var endOfSpeechRequiredSilenceTime uint32
|
580 |
+
if sv.minDurationOfLongSpeechInMilliSecond > 0 &&
|
581 |
+
sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond > 0 &&
|
582 |
+
(sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond) > sv.minDurationOfLongSpeechInMilliSecond {
|
583 |
+
endOfSpeechRequiredSilenceTime = sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond
|
584 |
+
} else {
|
585 |
+
endOfSpeechRequiredSilenceTime = sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond
|
586 |
+
}
|
587 |
+
|
588 |
+
if (sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond) >= endOfSpeechRequiredSilenceTime {
|
589 |
+
endTime := sv.speechDetectedEndTimeInMilliSecond + sv.nonSpeechPadInInMilliSecond
|
590 |
+
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
|
591 |
+
sv.speechDetectedEndTimeIsValid = false
|
592 |
+
sv.speechDetectedStartTimeInMilliSecond = uint32(0)
|
593 |
+
sv.speechDetectedStartTimeIsValid = false
|
594 |
+
sv.thisDetectedState = VadFlagNoSpeech
|
595 |
+
|
596 |
+
sv.lastVadEndTimeInMilliSecond = endTime
|
597 |
+
vadEventMarker := VadEventMarkerType{
|
598 |
+
VadFlag: VadFlagNoSpeech,
|
599 |
+
Time: endTime,
|
600 |
+
}
|
601 |
+
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
|
602 |
+
}
|
603 |
+
}
|
604 |
+
}
|
605 |
+
|
606 |
+
//当语音时长超过时, 强制切断
|
607 |
+
if sv.speechDetectedStartTimeIsValid && sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond > sv.timeoutInMilliSecond {
|
608 |
+
//end
|
609 |
+
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
|
610 |
+
sv.speechDetectedEndTimeIsValid = false
|
611 |
+
sv.speechDetectedStartTimeInMilliSecond = uint32(0)
|
612 |
+
sv.speechDetectedStartTimeIsValid = false
|
613 |
+
sv.thisDetectedState = VadFlagNoSpeech
|
614 |
+
|
615 |
+
sv.lastVadEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
|
616 |
+
vadEventMarker := VadEventMarkerType{
|
617 |
+
VadFlag: VadFlagNoSpeech,
|
618 |
+
Time: sv.speechFrameGlobalTimeInMilliSecond,
|
619 |
+
}
|
620 |
+
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
|
621 |
+
|
622 |
+
//start
|
623 |
+
var prepareTime uint32 = 0
|
624 |
+
if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
|
625 |
+
prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
|
626 |
+
}
|
627 |
+
if prepareTime < sv.lastVadEndTimeInMilliSecond {
|
628 |
+
prepareTime = sv.lastVadEndTimeInMilliSecond
|
629 |
+
}
|
630 |
+
vadEventMarker = VadEventMarkerType{
|
631 |
+
VadFlag: VadFlagPrepare,
|
632 |
+
Time: prepareTime,
|
633 |
+
}
|
634 |
+
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
|
635 |
+
|
636 |
+
sv.thisDetectedState = VadFlagSpeaking
|
637 |
+
|
638 |
+
vadEventMarker = VadEventMarkerType{
|
639 |
+
VadFlag: VadFlagSpeaking,
|
640 |
+
Time: sv.speechFrameGlobalTimeInMilliSecond,
|
641 |
+
}
|
642 |
+
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
|
643 |
+
|
644 |
+
//
|
645 |
+
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
|
646 |
+
sv.speechDetectedEndTimeIsValid = false
|
647 |
+
//sv.speechDetectedEndTimeIsValidPossible = false
|
648 |
+
sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3)
|
649 |
+
sv.speechDetectedStartTimeIsValid = true
|
650 |
+
}
|
651 |
+
|
652 |
+
//loop
|
653 |
+
sv.frameDecision.lastFrameFlag = sv.frameDecision.thisFrameFlag
|
654 |
+
sv.speechFrameGlobalTimeInMilliSecond += uint32(float32(sv.frameLength) / float32(sv.sampleRate) * 1e3)
|
655 |
+
}
|
656 |
+
}
|
go.mod
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
module vad_go
|
log.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import logging
|
4 |
+
from logging.handlers import TimedRotatingFileHandler
|
5 |
+
import os
|
6 |
+
|
7 |
+
|
8 |
+
def setup(log_directory: str):
|
9 |
+
fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"
|
10 |
+
|
11 |
+
stream_handler = logging.StreamHandler()
|
12 |
+
stream_handler.setLevel(logging.INFO)
|
13 |
+
stream_handler.setFormatter(logging.Formatter(fmt))
|
14 |
+
|
15 |
+
# main
|
16 |
+
main_logger = logging.getLogger("main")
|
17 |
+
main_logger.addHandler(stream_handler)
|
18 |
+
main_info_file_handler = TimedRotatingFileHandler(
|
19 |
+
filename=os.path.join(log_directory, "main.log"),
|
20 |
+
encoding="utf-8",
|
21 |
+
when="midnight",
|
22 |
+
interval=1,
|
23 |
+
backupCount=30
|
24 |
+
)
|
25 |
+
main_info_file_handler.setLevel(logging.INFO)
|
26 |
+
main_info_file_handler.setFormatter(logging.Formatter(fmt))
|
27 |
+
main_logger.addHandler(main_info_file_handler)
|
28 |
+
|
29 |
+
# http
|
30 |
+
http_logger = logging.getLogger("http")
|
31 |
+
http_file_handler = TimedRotatingFileHandler(
|
32 |
+
filename=os.path.join(log_directory, "http.log"),
|
33 |
+
encoding='utf-8',
|
34 |
+
when="midnight",
|
35 |
+
interval=1,
|
36 |
+
backupCount=30
|
37 |
+
)
|
38 |
+
http_file_handler.setLevel(logging.DEBUG)
|
39 |
+
http_file_handler.setFormatter(logging.Formatter(fmt))
|
40 |
+
http_logger.addHandler(http_file_handler)
|
41 |
+
|
42 |
+
# api
|
43 |
+
api_logger = logging.getLogger("api")
|
44 |
+
api_file_handler = TimedRotatingFileHandler(
|
45 |
+
filename=os.path.join(log_directory, "api.log"),
|
46 |
+
encoding='utf-8',
|
47 |
+
when="midnight",
|
48 |
+
interval=1,
|
49 |
+
backupCount=30
|
50 |
+
)
|
51 |
+
api_file_handler.setLevel(logging.DEBUG)
|
52 |
+
api_file_handler.setFormatter(logging.Formatter(fmt))
|
53 |
+
api_logger.addHandler(api_file_handler)
|
54 |
+
|
55 |
+
# alarm
|
56 |
+
alarm_logger = logging.getLogger("alarm")
|
57 |
+
alarm_file_handler = TimedRotatingFileHandler(
|
58 |
+
filename=os.path.join(log_directory, "alarm.log"),
|
59 |
+
encoding="utf-8",
|
60 |
+
when="midnight",
|
61 |
+
interval=1,
|
62 |
+
backupCount=30
|
63 |
+
)
|
64 |
+
alarm_file_handler.setLevel(logging.DEBUG)
|
65 |
+
alarm_file_handler.setFormatter(logging.Formatter(fmt))
|
66 |
+
alarm_logger.addHandler(alarm_file_handler)
|
67 |
+
|
68 |
+
debug_file_handler = TimedRotatingFileHandler(
|
69 |
+
filename=os.path.join(log_directory, "debug.log"),
|
70 |
+
encoding="utf-8",
|
71 |
+
when="D",
|
72 |
+
interval=1,
|
73 |
+
backupCount=7
|
74 |
+
)
|
75 |
+
debug_file_handler.setLevel(logging.DEBUG)
|
76 |
+
debug_file_handler.setFormatter(logging.Formatter(fmt))
|
77 |
+
|
78 |
+
info_file_handler = TimedRotatingFileHandler(
|
79 |
+
filename=os.path.join(log_directory, "info.log"),
|
80 |
+
encoding="utf-8",
|
81 |
+
when="D",
|
82 |
+
interval=1,
|
83 |
+
backupCount=7
|
84 |
+
)
|
85 |
+
info_file_handler.setLevel(logging.INFO)
|
86 |
+
info_file_handler.setFormatter(logging.Formatter(fmt))
|
87 |
+
|
88 |
+
error_file_handler = TimedRotatingFileHandler(
|
89 |
+
filename=os.path.join(log_directory, "error.log"),
|
90 |
+
encoding="utf-8",
|
91 |
+
when="D",
|
92 |
+
interval=1,
|
93 |
+
backupCount=7
|
94 |
+
)
|
95 |
+
error_file_handler.setLevel(logging.ERROR)
|
96 |
+
error_file_handler.setFormatter(logging.Formatter(fmt))
|
97 |
+
|
98 |
+
logging.basicConfig(
|
99 |
+
level=logging.DEBUG,
|
100 |
+
datefmt="%a, %d %b %Y %H:%M:%S",
|
101 |
+
handlers=[
|
102 |
+
debug_file_handler,
|
103 |
+
info_file_handler,
|
104 |
+
error_file_handler,
|
105 |
+
]
|
106 |
+
)
|
107 |
+
|
108 |
+
|
109 |
+
if __name__ == "__main__":
|
110 |
+
pass
|
main.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
import logging
|
5 |
+
from pathlib import Path
|
6 |
+
import platform
|
7 |
+
import re
|
8 |
+
|
9 |
+
from project_settings import project_path, log_directory
|
10 |
+
import log
|
11 |
+
|
12 |
+
log.setup(log_directory=log_directory)
|
13 |
+
|
14 |
+
import gradio as gr
|
15 |
+
|
16 |
+
from toolbox.os.command import Command
|
17 |
+
|
18 |
+
main_logger = logging.getLogger("main")
|
19 |
+
|
20 |
+
|
21 |
+
def get_args():
    """Parse the command-line options of the demo.

    `--example_wav_dir` is the directory scanned for example wav files; it
    defaults to `data/examples` under the project path.

    :return: the parsed argparse namespace.
    """
    default_example_dir = (project_path / "data/examples").as_posix()

    parser = argparse.ArgumentParser()
    parser.add_argument("--example_wav_dir", default=default_example_dir, type=str)
    return parser.parse_args()
|
31 |
+
|
32 |
+
|
33 |
+
def process_uploaded_file(filename: str) -> str:
    """Run the recognizer binary on an audio file and return the recognized text.

    The binary's output is expected to contain
    `text: ... textSize: ... wordSize: ... timeCost: ...`; the `text` part is returned.

    NOTE(review): the command is `build/asr_id` and the log/error messages speak
    of ASR, while the rest of this repository builds a VAD tool - confirm this
    is the intended binary.

    :param filename: path of the audio file to process.
    :return: the text captured from the binary's output.
    :raises AssertionError: when the output does not match the expected pattern.
    """
    import shlex

    filename = Path(filename).as_posix()

    main_logger.info("asr recognize: {}".format(filename))

    # The command line is executed by a shell, so the path must be quoted:
    # otherwise a path containing spaces is split into several arguments and
    # shell metacharacters in an uploaded file name would be interpreted.
    cmd = "build/asr_id --filename {}".format(shlex.quote(filename))
    asr_result = Command.popen(cmd)

    pattern = "text: (.*)textSize: (.*)wordSize: (.*)timeCost: (.+)"
    match = re.search(pattern, asr_result, flags=re.IGNORECASE | re.DOTALL)
    if match is None:
        raise AssertionError("run asr recognize failed: \n{}".format(asr_result))

    return match.group(1)
|
52 |
+
|
53 |
+
|
54 |
+
def shell(cmd: str):
    """Execute `cmd` through Command.popen and return what it returns.

    NOTE(review): main() wires this function to a text box of the web UI, so
    anyone who can reach the page can run arbitrary commands on the host.
    Confirm this is intended before exposing the app publicly.
    """
    output = Command.popen(cmd)
    return output
|
56 |
+
|
57 |
+
|
58 |
+
def main():
    """Build the Gradio UI (file upload tab and shell tab) and start the server."""
    args = get_args()

    title = "## 针对电话场景的印尼语ASR."

    # examples: one single-column row per wav file found in the example directory
    examples = [
        [wav_path.as_posix()]
        for wav_path in Path(args.example_wav_dir).glob("*.wav")
    ]

    # Bind to loopback on Windows and to all interfaces everywhere else.
    on_windows = platform.system() == "Windows"
    server_name = "127.0.0.1" if on_windows else "0.0.0.0"

    # blocks
    with gr.Blocks() as blocks:
        gr.Markdown(value=title)

        with gr.Tabs():
            with gr.TabItem("Upload from disk"):
                uploaded_file = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload from disk",
                )
                upload_button = gr.Button("Submit for recognition")
                uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")

                gr.Examples(
                    examples=examples,
                    inputs=[uploaded_file],
                    outputs=[uploaded_output],
                    fn=process_uploaded_file
                )

                upload_button.click(
                    process_uploaded_file,
                    inputs=[uploaded_file],
                    outputs=[uploaded_output],
                )
            with gr.TabItem("shell"):
                shell_text = gr.Textbox(label="cmd")
                shell_button = gr.Button("run")
                shell_output = gr.Textbox(label="output")

                shell_button.click(
                    shell,
                    inputs=[shell_text],
                    outputs=[shell_output],
                )

    # Public sharing is disabled on every platform.
    blocks.queue().launch(
        share=False,
        server_name=server_name,
        server_port=7860
    )
|
130 |
+
|
131 |
+
|
132 |
+
if __name__ == "__main__":
|
133 |
+
main()
|
project_settings.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import os
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
from toolbox.os.environment import EnvironmentManager
|
7 |
+
|
8 |
+
|
9 |
+
# Absolute directory that contains this settings module, as a Path.
project_path = Path(os.path.abspath(os.path.dirname(__file__)))

# Working directories; both are created at import time if they do not exist.
log_directory = project_path / "logs"
temp_directory = project_path / "temp"
log_directory.mkdir(parents=True, exist_ok=True)
temp_directory.mkdir(parents=True, exist_ok=True)

# Loads `<project_path>/dotenv/<env>.env`, where <env> is taken from the
# `environment` environment variable and defaults to "dev".
environment = EnvironmentManager(
    path=os.path.join(project_path, "dotenv"),
    env=os.environ.get("environment", "dev"),
)
|
22 |
+
|
23 |
+
|
24 |
+
if __name__ == '__main__':
|
25 |
+
pass
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
gradio==4.36.1
|
toolbox/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
if __name__ == '__main__':
|
5 |
+
pass
|
toolbox/json/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == '__main__':
|
6 |
+
pass
|
toolbox/json/misc.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from typing import Callable
|
4 |
+
|
5 |
+
|
6 |
+
def traverse(js, callback: Callable, *args, **kwargs):
    """Rebuild a nested structure, applying `callback` to its int and str leaves.

    Lists, tuples and dicts are walked recursively and returned as new list,
    tuple and dict objects; dict keys are traversed as well as values. Leaves
    that are `int` (which includes `bool`) or `str` are replaced by
    `callback(leaf, *args, **kwargs)`. Every other value is returned unchanged.

    :param js: the structure to walk.
    :param callback: function applied to int/str leaves.
    :param args: extra positional arguments forwarded to `callback`.
    :param kwargs: extra keyword arguments forwarded to `callback`.
    :return: the rebuilt structure.
    """
    if isinstance(js, (list, tuple)):
        items = [traverse(item, callback, *args, **kwargs) for item in js]
        return items if isinstance(js, list) else tuple(items)

    if isinstance(js, dict):
        rebuilt = dict()
        for key, value in js.items():
            # The key is processed before its value.
            new_key = traverse(key, callback, *args, **kwargs)
            rebuilt[new_key] = traverse(value, callback, *args, **kwargs)
        return rebuilt

    if isinstance(js, (int, str)):
        return callback(js, *args, **kwargs)

    return js
|
32 |
+
|
33 |
+
|
34 |
+
def demo1():
    """Demonstrate `traverse`: strip the leading `$` from string values of a sample config.

    The sample data carries placeholder secrets only; real credentials must
    never be committed in example code.
    """
    d = {
        "env": "ppe",
        "mysql_connect": {
            "host": "$mysql_connect_host",
            "port": 3306,
            "user": "callbot",
            "password": "<mysql-password>",
            "database": "callbot_ppe",
            "charset": "utf8"
        },
        "es_connect": {
            "hosts": ["10.20.251.8"],
            "http_auth": ["elastic", "<es-password>"],
            "port": 9200
        }
    }

    def callback(s):
        # Only strings starting with '$' are rewritten; ints pass through.
        if isinstance(s, str) and s.startswith('$'):
            return s[1:]
        return s

    result = traverse(d, callback=callback)
    print(result)
    return
|
60 |
+
|
61 |
+
|
62 |
+
if __name__ == '__main__':
|
63 |
+
demo1()
|
toolbox/os/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == '__main__':
|
6 |
+
pass
|
toolbox/os/command.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import os
|
4 |
+
|
5 |
+
|
6 |
+
class Command(object):
    """Small helper for running shell commands.

    Commands listed in `custom_command` are handled by the method of the same
    name inside this process (a `cd` run in a child shell would not change this
    process's working directory); everything else is run through `os.popen`.
    """
    custom_command = [
        "cd"
    ]

    @staticmethod
    def _get_cmd(command):
        """Split a command line into `(cmd, args)` at the first space.

        :return: `None` for a blank command line, otherwise the tuple
            `(first word, remainder of the line)`.
        """
        command = str(command).strip()
        if command == "":
            return None
        cmd, _, args = command.partition(" ")
        return cmd, args

    @classmethod
    def popen(cls, command):
        """Run `command` and return its output.

        :param command: the command line to run.
        :return: the text read from the command's standard output; the return
            value of the handler for custom commands; `""` for a blank command
            line (which previously crashed while unpacking `None`).
        """
        parsed = cls._get_cmd(command)
        if parsed is None:
            return ""

        cmd, args = parsed
        if cmd in cls.custom_command:
            method = getattr(cls, cmd)
            return method(args)

        # The context manager closes the pipe even when read() raises.
        with os.popen(command) as resp:
            return resp.read()

    @classmethod
    def cd(cls, args):
        """Change this process's working directory to `args`.

        Paths starting with "/" are used as they are; anything else is joined
        onto the current working directory.
        """
        if args.startswith("/"):
            os.chdir(args)
        else:
            pwd = os.getcwd()
            path = os.path.join(pwd, args)
            os.chdir(path)

    @classmethod
    def system(cls, command):
        """Run `command` with `os.system` and return its result."""
        return os.system(command)

    def __init__(self):
        pass
|
48 |
+
|
49 |
+
|
50 |
+
def ps_ef_grep(keyword: str):
    """Return the `ps -ef` output rows that contain `keyword`.

    Rows containing "grep" are dropped so the grep process itself is not
    reported.

    :param keyword: literal text to look for in the process list.
    :return: list of matching output lines.
    """
    import shlex

    # Quote the keyword so that spaces or shell metacharacters in it reach grep
    # as one pattern (unquoted, a second word would be taken as a file name).
    # -F matches it literally, consistent with the substring filter below, and
    # `--` keeps a keyword starting with "-" from being parsed as an option.
    cmd = "ps -ef | grep -F -- {}".format(shlex.quote(keyword))
    rows = Command.popen(cmd)
    rows = str(rows).split("\n")
    rows = [row for row in rows if keyword in row and "grep" not in row]
    return rows
|
56 |
+
|
57 |
+
|
58 |
+
if __name__ == "__main__":
|
59 |
+
pass
|
toolbox/os/environment.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from dotenv.main import DotEnv
|
8 |
+
|
9 |
+
from toolbox.json.misc import traverse
|
10 |
+
|
11 |
+
|
12 |
+
class EnvironmentManager(object):
    """Loads `<path>/<env>.env` into the process environment and reads values from it."""

    def __init__(self, path, env, override=False):
        """
        :param path: directory holding the dotenv files.
        :param env: environment name; the file loaded is `<env>.env`.
        :param override: passed through to `load_dotenv`.
        """
        self.filename = os.path.join(path, '{}.env'.format(env))

        load_dotenv(dotenv_path=self.filename, override=override)

        # Every value handed out by get(), keyed by variable name.
        self._environ = dict()

    def open_dotenv(self, filename: str = None):
        """Parse a dotenv file and return its content as returned by `DotEnv.dict()`.

        :param filename: file to parse; defaults to the file given at construction.
        """
        dotenv = DotEnv(
            dotenv_path=filename or self.filename,
            stream=None,
            verbose=False,
            interpolate=False,
            override=False,
            encoding="utf-8",
        )
        return dotenv.dict()

    def get(self, key, default=None, dtype=str):
        """Read `key` from `os.environ`.

        :param key: variable name.
        :param default: returned as is (not converted) when the variable is unset.
        :param dtype: callable applied to the raw string when the variable is set.
        :return: the converted value, or `default`.
        """
        raw = os.environ.get(key)
        value = default if raw is None else dtype(raw)
        self._environ[key] = value
        return value
|
48 |
+
|
49 |
+
|
50 |
+
# Converter names accepted in `$<dtype>:<key>` placeholders, mapped to the
# callable applied to the environment value.
_DEFAULT_DTYPE_MAP = {
    'int': int,
    'float': float,
    'str': str,
    'json.loads': json.loads
}


class JsonConfig(object):
    """
    Resolves JSON values of the form `$float:threshold`: `threshold` is looked
    up in the environment and the value found is converted to float.
    """
    def __init__(self, dtype_map: dict = None, environment: EnvironmentManager = None):
        """
        :param dtype_map: converter name -> callable; defaults to `_DEFAULT_DTYPE_MAP`.
        :param environment: object whose `get(key)` returns the value; defaults to `os.environ`.
        """
        self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP
        self.environment = environment or os.environ

    def sanitize_by_filename(self, filename: str):
        """Load the JSON file `filename` and resolve its placeholders."""
        with open(filename, 'r', encoding='utf-8') as f:
            loaded = json.load(f)

        return self.sanitize_by_json(loaded)

    def sanitize_by_json(self, js):
        """Resolve the placeholders of an already parsed JSON structure."""
        return traverse(
            js,
            callback=self.sanitize,
            environment=self.environment
        )

    def sanitize(self, string, environment):
        """Resolve a single value; supports environment-variable placeholders starting with `$`.

        Values that are not strings starting with `$` are returned unchanged.

        :raises AssertionError: when the referenced key is not in `environment`.
        """
        if not (isinstance(string, str) and string.startswith('$')):
            return string

        dtype_name, key = string[1:].split(':')
        convert = self.dtype_map[dtype_name]

        value = environment.get(key)
        if value is None:
            raise AssertionError('environment not exist. key: {}'.format(key))

        return convert(value)
|
96 |
+
|
97 |
+
|
98 |
+
def demo1():
    """Demo: load the `dev` dotenv file and print `init_scenes` parsed as JSON.

    NOTE(review): the dotenv path `server/callbot_server/dotenv` does not match
    the `dotenv` directory used by project_settings - confirm it exists here.
    """
    import json

    from project_settings import project_path

    dotenv_dir = os.path.join(project_path, 'server/callbot_server/dotenv')
    environment = EnvironmentManager(path=dotenv_dir, env='dev')

    init_scenes = environment.get(key='init_scenes', dtype=json.loads)
    print(init_scenes)
    print(environment._environ)
|
111 |
+
|
112 |
+
|
113 |
+
if __name__ == '__main__':
|
114 |
+
demo1()
|
toolbox/os/other.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import inspect
|
3 |
+
|
4 |
+
|
5 |
+
def pwd():
    """Return the directory of the file from which this function is called.

    The caller's module is resolved through `inspect.getmodule`. When no module
    can be resolved for the calling frame (for example code run through
    `exec`), or the module has no `__file__`, the file name recorded on the
    calling frame is used instead of failing with an AttributeError.

    :return: absolute directory path as a string.
    """
    # context=0: only the frame records are needed, not source-code context.
    caller = inspect.stack(0)[1]
    module = inspect.getmodule(caller[0])
    source_file = getattr(module, "__file__", None) or caller.filename
    return os.path.dirname(os.path.abspath(source_file))
|
vad_go.go
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package main
|
2 |
+
|
3 |
+
import (
|
4 |
+
"flag"
|
5 |
+
"vad_go/dsp/audio"
|
6 |
+
"vad_go/dsp/streaming_vad"
|
7 |
+
"fmt"
|
8 |
+
"os"
|
9 |
+
"path"
|
10 |
+
)
|
11 |
+
|
12 |
+
func main () {
|
13 |
+
args := flag.String("filename", "", "input wav audio file")
|
14 |
+
flag.Parse()
|
15 |
+
|
16 |
+
var filename string = *args
|
17 |
+
|
18 |
+
pwd, _ := os.Getwd()
|
19 |
+
filename = path.Join(pwd, filename)
|
20 |
+
fmt.Println(filename)
|
21 |
+
|
22 |
+
var wavInfo audio.WavInfoType
|
23 |
+
err := wavInfo.ParseFromFile(filename)
|
24 |
+
if err != nil {
|
25 |
+
fmt.Println("wavInfo.ParseFromFile failed.")
|
26 |
+
}
|
27 |
+
fmt.Printf("sample rate: %d\n", wavInfo.Fmt.SampleRate)
|
28 |
+
|
29 |
+
float32Samples := wavInfo.GetFloat32Samples(0, 2)
|
30 |
+
|
31 |
+
var int16Samples []int16
|
32 |
+
var size uint32
|
33 |
+
var point int16
|
34 |
+
for i := 0; i < len(wavInfo.Data.Sample); i++ {
|
35 |
+
point = int16(float32Samples[i] * (1 << 15)) + 1
|
36 |
+
int16Samples = append(int16Samples, point)
|
37 |
+
}
|
38 |
+
size = uint32(len(int16Samples))
|
39 |
+
|
40 |
+
fmt.Printf("sample number: %d\n", size)
|
41 |
+
|
42 |
+
winSize := uint32(7000)
|
43 |
+
winStep := uint32(7000)
|
44 |
+
count := uint32(0)
|
45 |
+
|
46 |
+
sv := streaming_vad.StreamingVadType{}
|
47 |
+
sv.Init(8000, 0.4, 3.0)
|
48 |
+
|
49 |
+
var begin uint32
|
50 |
+
var end uint32
|
51 |
+
//var bufferSize uint32
|
52 |
+
|
53 |
+
for true {
|
54 |
+
begin = count * winStep
|
55 |
+
end = begin + winSize
|
56 |
+
//fmt.Println(end)
|
57 |
+
|
58 |
+
if begin >= size {
|
59 |
+
break
|
60 |
+
}
|
61 |
+
|
62 |
+
if end >= size {
|
63 |
+
end = size
|
64 |
+
}
|
65 |
+
buffer := int16Samples[begin:end]
|
66 |
+
|
67 |
+
//fmt.Printf("bufferSize: %d\n", bufferSize)
|
68 |
+
//fmt.Printf("buffer: %d\n", buffer)
|
69 |
+
|
70 |
+
count++
|
71 |
+
|
72 |
+
err = sv.ProcessSpeechByChunk(buffer)
|
73 |
+
if err != nil {
|
74 |
+
fmt.Println(err)
|
75 |
+
break
|
76 |
+
}
|
77 |
+
//if count > 3 {
|
78 |
+
// break
|
79 |
+
//}
|
80 |
+
}
|
81 |
+
|
82 |
+
fmt.Println(len(sv.VadEventMarkerDeque))
|
83 |
+
var marker streaming_vad.VadEventMarkerType
|
84 |
+
for i := 0; i < len(sv.VadEventMarkerDeque); i++ {
|
85 |
+
marker = sv.VadEventMarkerDeque[i]
|
86 |
+
|
87 |
+
fmt.Println(marker.Time)
|
88 |
+
fmt.Println(marker.VadFlag)
|
89 |
+
|
90 |
+
}
|
91 |
+
}
|