Spaces:

intelli-zen
/

vad_go

Sleeping

App Files Files Community

HoneyTian committed on Jun 11, 2024

Commit

382cf0c

1 Parent(s): ccd188f

update

Browse files

Files changed (20) hide show

.gitattributes +1 -0
.gitignore +10 -0
Dockerfile +19 -0
build_vad_go.sh +5 -0
data/examples/b07ae20f-247d-4e96-9c32-4ea27addcd79.wav +3 -0
dsp/audio/wav.go +1010 -0
dsp/streaming_vad/streaming_vad.go +656 -0
go.mod +1 -0
log.py +110 -0
main.py +133 -0
project_settings.py +25 -0
requirements.txt +1 -0
toolbox/__init__.py +5 -0
toolbox/json/__init__.py +6 -0
toolbox/json/misc.py +63 -0
toolbox/os/__init__.py +6 -0
toolbox/os/command.py +59 -0
toolbox/os/environment.py +114 -0
toolbox/os/other.py +9 -0
vad_go.go +91 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+.git/
+.idea/
+data/
+dotenv/
+logs/
+**/__pycache__/
+**/*.wav

Dockerfile ADDED Viewed

	@@ -0,0 +1,19 @@

+FROM golang:1.18
+WORKDIR /data/GolandProjects/vad_go
+COPY . /data/GolandProjects/vad_go
+# Run update and install in the same layer so a cached "apt-get update" layer
+# is never combined with a later install, and drop the package lists afterwards.
+RUN apt-get update \
+    && apt-get install -y python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+RUN pip install --upgrade pip
+RUN pip install --no-cache-dir --upgrade -r /data/GolandProjects/vad_go/requirements.txt
+RUN bash build_vad_go.sh
+USER root
+# NOTE(review): world-writable permissions on the whole tree; presumably needed
+# because the runtime user differs from root - confirm and narrow if possible.
+RUN chmod -R 777 .
+CMD ["python3", "main.py"]

build_vad_go.sh ADDED Viewed

	@@ -0,0 +1,5 @@

+#!/usr/bin/env bash
+mkdir build
+go build -o build vad_go

data/examples/b07ae20f-247d-4e96-9c32-4ea27addcd79.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b59f9910c50776eb704ead2360fdf3a0330da3cf693073575a12c800f6316a9a
+size 78284

dsp/audio/wav.go ADDED Viewed

	@@ -0,0 +1,1010 @@

+package audio
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"math"
+	"os"
+	"path/filepath"
+)
+/*
+-----RIFF-----
+RIFF			4	Chunk tag, the ASCII letters "RIFF"
+CHUNK_SIZE		4	Length of the whole RIFF file (excluding the RIFF and CHUNK_SIZE fields themselves)
+FORMAT			4	Format; "WAVE" marks a wav file, which must contain sub-chunks tagged "fmt " and "data"
+-----FMT -----
+SUB_CHUNK_ID	4	Sub-chunk tag, here "fmt " (note the trailing space); it holds the wav properties
+SUB_CHUNK_SIZE	4	Length of this sub-chunk (excluding the SUB_CHUNK_ID and SUB_CHUNK_SIZE fields)
+AUDIO_FORMAT	2	Audio format; 1 for PCM
+NUM_CHANNELS	2	Number of channels; any n is possible in theory, usually mono or stereo (two channels)
+SAMPLE_RATE		4	Sample rate, i.e. samples per second, normally one of the standard rates (8000, 11025,12000,16000,22050,24000,32000,44100,48000)
+BYTE_RATE		4	Byte rate, i.e. bytes played per second = SAMPLE_RATE*NUM_CHANNELS*BITS_PER_SAMPLE/8 (it is unclear why this field is needed)
+BLOCK_ALIGN		2	Block alignment = BITS_PER_SAMPLE*NUM_CHANNELS/8
+BITS_PER_SAMPLE 2	Bits per sample, usually 8, 16 or 32 (8 means uint8, 16 means int16, 32 means float32)
+-----DATA-----
+SUB_CHUNK_ID	4	Sub-chunk tag, here "data"
+SUB_CHUNK_SIZE	4	Length of this sub-chunk (excluding the SUB_CHUNK_ID and SUB_CHUNK_SIZE fields)
+DATA				The PCM data
+-------------
+*/
+// MaxChannelNum is the maximum number of channels supported (only 2 are allowed here).
+const MaxChannelNum = 2
+const (
+	LeftChannel = 0
+	RightChannel =1
+)
+// tag is a four-byte chunk identifier.
+type tag [4]byte
+// Chunk identifiers recognised by this package.
+var (
+	tagRIFF = tag{'R', 'I', 'F', 'F'} // "RIFF"
+	tagWAVE = tag{'W', 'A', 'V', 'E'} // "WAVE"
+	tagFmt  = tag{'f', 'm', 't', ' '} // "fmt "
+	tagData = tag{'d', 'a', 't', 'a'} // "data"
+)
+// WavHeaderType is the header that starts every chunk: a four-byte tag
+// followed by the size of the chunk payload.
+type WavHeaderType struct {
+	ID   tag
+	Size uint32
+}
+// String renders the chunk tag and payload size for logging.
+func (wavHeader *WavHeaderType) String() string {
+	id := string(wavHeader.ID[:])
+	return fmt.Sprintf("ID=%s,Size=%d", id, wavHeader.Size)
+}
+// sizeHeader is the encoded size in bytes of a WavHeaderType, as reported by binary.Size.
+var (
+	sizeHeader = binary.Size(WavHeaderType{})
+)
+// chunkLoc records where a chunk payload starts in the file (pos) and how long it is (size).
+type chunkLoc struct {
+	pos  int64
+	size int64
+}
+// RiffType is the leading RIFF descriptor: a chunk header followed by the
+// four-byte format tag ("WAVE" for wav files).
+type RiffType struct {
+	WavHeaderType
+	Fmt tag
+}
+// String renders the RIFF tag, size and format tag for logging.
+func (riff *RiffType) String() string {
+	id, format := string(riff.ID[:]), string(riff.Fmt[:])
+	return fmt.Sprintf("ID=%s,Size=%d,Fmt=%s", id, riff.Size, format)
+}
+// WavFmtType is the "fmt " chunk: the chunk header plus the audio properties
+// in the order in which they are stored in the file.
+type WavFmtType struct {
+	WavHeaderType
+	AudioFormat   uint16
+	Channels      uint16
+	SampleRate    uint32
+	BytesPerSec   uint32
+	BytesPerBlock uint16
+	BitsPerSample uint16
+}
+// String renders every field of the "fmt " chunk for logging.
+func (wavFmt *WavFmtType) String() string {
+	const layout = "ID=%s,Size=%d,AudioFormat=%d,Channels=%d,SampleRate=%d," +
+		"BytesPerSec=%d,BytesPerBlock=%d,BitsPerSample=%d"
+	id := string(wavFmt.ID[:])
+	return fmt.Sprintf(layout,
+		id, wavFmt.Size, wavFmt.AudioFormat, wavFmt.Channels, wavFmt.SampleRate,
+		wavFmt.BytesPerSec, wavFmt.BytesPerBlock, wavFmt.BitsPerSample)
+}
+// SampleType holds one sample block, with one slot per channel. Only the
+// array matching the file's sample width is filled: val8s for 8-bit,
+// val16s for 16-bit and val32s for 32-bit (float) samples.
+type SampleType struct {
+	val8s  [MaxChannelNum]uint8
+	val16s [MaxChannelNum]int16
+	val32s [MaxChannelNum]float32
+}
+// WavDataType is the "data" chunk: the chunk header plus the decoded samples.
+type WavDataType struct {
+	WavHeaderType
+	Sample []SampleType
+}
+// String renders the chunk tag, its size and the number of sample blocks.
+func (wavData *WavDataType) String() string {
+	id := string(wavData.ID[:])
+	return fmt.Sprintf("ID=%s,Size=%d,BlockNum=%d", id, wavData.Size, len(wavData.Sample))
+}
+// WavInfoType is an in-memory wav file: the RIFF descriptor, the "fmt " chunk
+// and the "data" chunk, plus a creation timestamp.
+type WavInfoType struct {
+	Riff RiffType
+	Fmt  WavFmtType
+	Data WavDataType
+	// creation timestamp in milliseconds, set through SetCreateTs
+	createMs int64
+}
+// String summarises the sample rate, sample width, channel count and number
+// of sample blocks.
+func (wavInfo *WavInfoType) String() string {
+	return fmt.Sprintf("SampleRate=%d,BitsPerSample=%d,Channels=%d,BlockNum=%d",
+		wavInfo.Fmt.SampleRate,
+		wavInfo.Fmt.BitsPerSample,
+		wavInfo.Fmt.Channels,
+		len(wavInfo.Data.Sample))
+}
+// SetCreateTs stores the creation timestamp, given in milliseconds.
+func (wavInfo *WavInfoType) SetCreateTs(timestampMs int64) {
+	wavInfo.createMs = timestampMs
+}
+// CopyFormat copies the RIFF descriptor, the "fmt " chunk and the header of
+// the "data" chunk from w into wavInfo. Samples and the creation timestamp
+// are left untouched. The returned error is always nil.
+func (wavInfo *WavInfoType) CopyFormat(w *WavInfoType) (err error) {
+	// RiffType and WavFmtType contain header fields only, so assigning the
+	// whole struct copies exactly ID, Size and the format fields.
+	wavInfo.Riff = w.Riff
+	wavInfo.Fmt = w.Fmt
+	// Take over only the data chunk header; Sample must not be shared.
+	wavInfo.Data.WavHeaderType = w.Data.WavHeaderType
+	return nil
+}
+// ParseFromFile loads a RIFF/WAVE file into wavInfo.
+//
+// It reads the RIFF descriptor, indexes every chunk by tag, checks that the
+// chunk sizes add up to the size announced in the RIFF header, and then
+// decodes the "fmt " and "data" chunks. Chunks with any other tag are
+// reported on stdout and otherwise ignored.
+//
+// An error is returned when the file cannot be opened or read, is not
+// RIFF/WAVE, lacks a "fmt " or "data" chunk, is shorter than its data chunk
+// claims, or uses a channel count or sample width that SampleType cannot
+// hold. wavInfo.Data is only assigned once decoding has succeeded; bytes after
+// the last complete sample block of the data chunk are ignored.
+func (wavInfo *WavInfoType) ParseFromFile(absFile string) (err error) {
+	absFile, err = filepath.Abs(absFile) //#nosec
+	if err != nil {
+		return err
+	}
+	fileHandler, err := os.Open(absFile) //#nosec
+	if err != nil {
+		return err
+	}
+	defer fileHandler.Close()
+	//-----------------------------------------------------
+	// RIFF header
+	if err = binary.Read(fileHandler, binary.LittleEndian, &wavInfo.Riff); err != nil {
+		return err
+	}
+	if wavInfo.Riff.ID != tagRIFF {
+		return errors.New("File Format Not Riff")
+	}
+	if wavInfo.Riff.Fmt != tagWAVE {
+		return errors.New("File Format Not Wave")
+	}
+	// pos is the offset just past everything accounted for so far.
+	pos := int64(sizeHeader) + int64(len(tagWAVE))
+	fileSize := int64(sizeHeader) + int64(wavInfo.Riff.Size)
+	//-----------------------------------------------------
+	// index all chunks: read each header, then skip over its payload
+	chunks := make(map[tag]*chunkLoc)
+	var ch WavHeaderType
+	for {
+		err = binary.Read(fileHandler, binary.LittleEndian, &ch)
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return err
+		}
+		pos += int64(sizeHeader)
+		loc := &chunkLoc{pos: pos, size: int64(ch.Size)}
+		if _, err = fileHandler.Seek(loc.size, io.SeekCurrent); err != nil {
+			return err
+		}
+		pos += loc.size // chunk data
+		chunks[ch.ID] = loc
+	}
+	// the chunk sizes must add up to the size announced in the RIFF header
+	if pos != fileSize {
+		return errors.New("pos != fileSize")
+	}
+	//-----------------------------------------------------
+	// chunk fmt_
+	loc, ok := chunks[tagFmt]
+	if !ok {
+		return errors.New("wav: has not chunk \"fmt \"")
+	}
+	if _, err = fileHandler.Seek(loc.pos-int64(sizeHeader), io.SeekStart); err != nil {
+		return err
+	}
+	if err = binary.Read(fileHandler, binary.LittleEndian, &wavInfo.Fmt); err != nil {
+		return err
+	}
+	// Reject layouts SampleType cannot hold. Without these checks a zero block
+	// size divides by zero and a third channel indexes past the fixed arrays.
+	channel := int(wavInfo.Fmt.Channels)
+	if channel < 1 || channel > MaxChannelNum {
+		return fmt.Errorf("wav: unsupported channel count %d", channel)
+	}
+	bytePerSample := 0
+	switch wavInfo.Fmt.BitsPerSample {
+	case 8, 16, 32:
+		bytePerSample = int(wavInfo.Fmt.BitsPerSample / 8)
+	default:
+		return fmt.Errorf("wav: unsupported sample width %d bits", wavInfo.Fmt.BitsPerSample)
+	}
+	//-----------------------------------------------------
+	// chunk data
+	loc, ok = chunks[tagData]
+	if !ok {
+		return errors.New("wav: has not chunk \"data\"")
+	}
+	blockSize := int64(channel * bytePerSample)
+	blockNum := loc.size / blockSize // a trailing partial block is ignored
+	// Compare against the real file size before allocating, so a bogus chunk
+	// size cannot trigger a huge allocation.
+	info, err := fileHandler.Stat()
+	if err != nil {
+		return err
+	}
+	if loc.pos+blockNum*blockSize > info.Size() {
+		return io.ErrUnexpectedEOF
+	}
+	if _, err = fileHandler.Seek(loc.pos, io.SeekStart); err != nil {
+		return err
+	}
+	// One bulk read instead of one reflective binary.Read per sample.
+	raw := make([]byte, blockNum*blockSize)
+	if _, err = io.ReadFull(fileHandler, raw); err != nil {
+		return err
+	}
+	samples := make([]SampleType, blockNum)
+	off := 0
+	for i := range samples {
+		for c := 0; c < channel; c++ {
+			switch bytePerSample {
+			case 1:
+				samples[i].val8s[c] = raw[off]
+			case 2:
+				samples[i].val16s[c] = int16(binary.LittleEndian.Uint16(raw[off:]))
+			case 4:
+				samples[i].val32s[c] = math.Float32frombits(binary.LittleEndian.Uint32(raw[off:]))
+			}
+			off += bytePerSample
+		}
+	}
+	wavInfo.Data.ID = tagData
+	wavInfo.Data.Size = uint32(loc.size)
+	wavInfo.Data.Sample = samples
+	// report chunks this package does not interpret
+	for id, chunk := range chunks {
+		if id != tagData && id != tagFmt {
+			fmt.Printf("unkonw chunk=%s,size=%d\n", string(id[:]), chunk.size)
+		}
+	}
+	return nil
+}
+// ParseFromBuffer loads a RIFF/WAVE image held in memory into wavInfo.
+//
+// Chunks are processed in the order in which they appear. A "fmt " chunk
+// fills wavInfo.Fmt (payload bytes beyond the 16 known ones are skipped), a
+// "data" chunk is decoded with the format currently held in wavInfo.Fmt, and
+// every other chunk is skipped. The size announced in the RIFF header is not
+// checked, and reaching the end of the buffer at a chunk boundary ends parsing
+// successfully even if no "fmt " or "data" chunk was seen.
+//
+// An error is returned when the buffer is not RIFF/WAVE, when a "fmt " or
+// "data" chunk is truncated or too short, or when the format uses a channel
+// count or sample width that SampleType cannot hold.
+func (wavInfo *WavInfoType) ParseFromBuffer(buffer []byte) (err error) {
+	bufferReader := bytes.NewBuffer(buffer)
+	//-----------------------------------------------------
+	// RIFF header
+	if err = binary.Read(bufferReader, binary.LittleEndian, &wavInfo.Riff); err != nil {
+		return err
+	}
+	if wavInfo.Riff.ID != tagRIFF {
+		return errors.New("File Format Not Riff")
+	}
+	if wavInfo.Riff.Fmt != tagWAVE {
+		return errors.New("File Format Not Wave")
+	}
+	// fmtPayloadSize is the number of "fmt " payload bytes decoded below.
+	const fmtPayloadSize = 16
+	// take consumes up to n bytes and reports whether all n were available.
+	take := func(n uint32) ([]byte, bool) {
+		if int64(n) > int64(bufferReader.Len()) {
+			return bufferReader.Next(bufferReader.Len()), false
+		}
+		return bufferReader.Next(int(n)), true
+	}
+	le := binary.LittleEndian
+	//-----------------------------------------------------
+	// read all chunks
+	var ch WavHeaderType
+	for {
+		err = binary.Read(bufferReader, binary.LittleEndian, &ch)
+		if err == io.EOF {
+			// end of buffer at a chunk boundary
+			return nil
+		}
+		if err != nil {
+			return err
+		}
+		switch ch.ID {
+		case tagFmt:
+			payload, ok := take(ch.Size)
+			if !ok {
+				return io.ErrUnexpectedEOF
+			}
+			if len(payload) < fmtPayloadSize {
+				return errors.New("wav: chunk \"fmt \" too short")
+			}
+			wavInfo.Fmt.WavHeaderType = ch
+			wavInfo.Fmt.AudioFormat = le.Uint16(payload[0:])
+			wavInfo.Fmt.Channels = le.Uint16(payload[2:])
+			wavInfo.Fmt.SampleRate = le.Uint32(payload[4:])
+			wavInfo.Fmt.BytesPerSec = le.Uint32(payload[8:])
+			wavInfo.Fmt.BytesPerBlock = le.Uint16(payload[12:])
+			wavInfo.Fmt.BitsPerSample = le.Uint16(payload[14:])
+			// Any extension bytes after the 16 known ones were consumed by
+			// take, so the next chunk header is read from the right offset.
+		case tagData:
+			// Validate first: a zero block size would divide by zero and a
+			// third channel would index past the fixed arrays of SampleType.
+			channel := int(wavInfo.Fmt.Channels)
+			if channel < 1 || channel > MaxChannelNum {
+				return fmt.Errorf("wav: unsupported channel count %d", channel)
+			}
+			bytePerSample := 0
+			switch wavInfo.Fmt.BitsPerSample {
+			case 8, 16, 32:
+				bytePerSample = int(wavInfo.Fmt.BitsPerSample / 8)
+			default:
+				return fmt.Errorf("wav: unsupported sample width %d bits", wavInfo.Fmt.BitsPerSample)
+			}
+			payload, ok := take(ch.Size)
+			if !ok {
+				return io.ErrUnexpectedEOF
+			}
+			// A trailing partial block is ignored.
+			samples := make([]SampleType, len(payload)/(channel*bytePerSample))
+			off := 0
+			for i := range samples {
+				for c := 0; c < channel; c++ {
+					switch bytePerSample {
+					case 1:
+						samples[i].val8s[c] = payload[off]
+					case 2:
+						samples[i].val16s[c] = int16(le.Uint16(payload[off:]))
+					case 4:
+						samples[i].val32s[c] = math.Float32frombits(le.Uint32(payload[off:]))
+					}
+					off += bytePerSample
+				}
+			}
+			wavInfo.Data.ID = tagData
+			wavInfo.Data.Size = ch.Size
+			wavInfo.Data.Sample = samples
+		default:
+			// Chunk of no interest: skip it. A truncated one is tolerated;
+			// the next header read then ends the loop with io.EOF.
+			take(ch.Size)
+		}
+	}
+}
+// SaveToFile writes wavInfo to absFile as a RIFF/WAVE file, creating the file
+// or truncating an existing one.
+//
+// The file image is produced by SaveToBuffer, so both methods emit identical
+// bytes and update the size fields of wavInfo in the same way. The image is
+// then written in a single call, which avoids one small write per sample and
+// reports write and close failures to the caller.
+func (wavInfo *WavInfoType) SaveToFile(absFile string) (err error) {
+	buffer, err := wavInfo.SaveToBuffer()
+	if err != nil {
+		return err
+	}
+	// 0666 (before umask) is the mode os.Create would have used.
+	return os.WriteFile(absFile, buffer, 0666)
+}
+//SaveToBuffer 保存到buffer中
+func (wavInfo *WavInfoType) SaveToBuffer() (buffer []byte, err error) {
+	//var bufferWriter   bytes.Buffer
+	buffer = make([]byte, 0)
+	bufferWriter := bytes.NewBuffer(buffer)
+	//calc size
+	wavInfo.Data.Size = (uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample) / 8) *
+		uint32(len(wavInfo.Data.Sample))
+	wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
+	wavInfo.Riff.Size = uint32(len(wavInfo.Riff.Fmt)) +
+		uint32(sizeHeader) +
+		wavInfo.Fmt.Size +
+		uint32(sizeHeader) +
+		wavInfo.Data.Size
+	//----------------------------------------
+	// RIFF header
+	err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Riff)
+	if err != nil {
+		return
+	}
+	//----------------------------------------
+	// chunk fmt_
+	wavInfo.Fmt.Size = uint32(binary.Size(wavInfo.Fmt)) - uint32(sizeHeader)
+	err = binary.Write(bufferWriter, binary.LittleEndian, wavInfo.Fmt)
+	if err != nil {
+		return
+	}
+	//----------------------------------------
+	// chunk data
+	ch := WavHeaderType{
+		ID:   tagData,
+		Size: wavInfo.Data.Size,
+	}
+	err = binary.Write(bufferWriter, binary.LittleEndian, ch)
+	if err != nil {
+		return
+	}
+	channel := wavInfo.Fmt.Channels
+	bytePerSample := wavInfo.Fmt.BitsPerSample / 8
+	//blockSize := channel*bytePerSample
+	for i := 0; i < len(wavInfo.Data.Sample); i++ {
+		sample := &wavInfo.Data.Sample[i]
+		switch bytePerSample {
+		case 1:
+			for c := 0; c < int(channel); c++ {
+				err = binary.Write(bufferWriter, binary.LittleEndian, sample.val8s[c])
+				if err != nil {
+					return
+				}
+			}
+		case 2:
+			for c := 0; c < int(channel); c++ {
+				err = binary.Write(bufferWriter, binary.LittleEndian, sample.val16s[c])
+				if err != nil {
+					return
+				}
+			}
+		case 4:
+			for c := 0; c < int(channel); c++ {
+				err = binary.Write(bufferWriter, binary.LittleEndian, sample.val32s[c])
+				if err != nil {
+					return
+				}
+			}
+		}
+	}
+	buffer = bufferWriter.Bytes()
+	return
+}
+// SaveToBufferWithChannel encodes a single channel of wavInfo as a mono
+// RIFF/WAVE image.
+//
+// voiceChannel selects the channel to keep (LeftChannel or RightChannel). The
+// image is built from a temporary mono copy, so wavInfo itself is not
+// modified: its channel count stays as it was instead of being forced to 2.
+// In the mono header, BytesPerBlock and BytesPerSec are recomputed for one
+// channel so that they agree with Channels.
+//
+// An error is returned when wavInfo has more than MaxChannelNum channels or
+// when voiceChannel does not name one of its channels; in both cases a header
+// could not be matched by any sample data.
+func (wavInfo *WavInfoType) SaveToBufferWithChannel(voiceChannel int) (buffer []byte, err error) {
+	channel := int(wavInfo.Fmt.Channels)
+	if channel > MaxChannelNum {
+		return nil, fmt.Errorf("wav: unsupported channel count %d", channel)
+	}
+	if voiceChannel < 0 || voiceChannel >= channel {
+		return nil, fmt.Errorf("wav: channel %d out of range, file has %d channel(s)", voiceChannel, channel)
+	}
+	// Mono view sharing the RIFF and fmt settings of the receiver.
+	mono := WavInfoType{Riff: wavInfo.Riff, Fmt: wavInfo.Fmt}
+	mono.Fmt.Channels = 1
+	mono.Fmt.BytesPerBlock = wavInfo.Fmt.BitsPerSample / 8
+	mono.Fmt.BytesPerSec = wavInfo.Fmt.SampleRate * uint32(mono.Fmt.BytesPerBlock)
+	// Move the selected channel into slot 0, the only slot a mono file uses.
+	mono.Data.Sample = make([]SampleType, len(wavInfo.Data.Sample))
+	for i := range wavInfo.Data.Sample {
+		src := &wavInfo.Data.Sample[i]
+		dst := &mono.Data.Sample[i]
+		dst.val8s[0] = src.val8s[voiceChannel]
+		dst.val16s[0] = src.val16s[voiceChannel]
+		dst.val32s[0] = src.val32s[voiceChannel]
+	}
+	return mono.SaveToBuffer()
+}
+// SaveToFileWithChannel writes a single channel of wavInfo to absFile as a
+// mono RIFF/WAVE file, creating the file or truncating an existing one.
+//
+// voiceChannel selects the channel to keep (LeftChannel or RightChannel). The
+// file image comes from SaveToBufferWithChannel and is written in a single
+// call, so write and close failures are reported to the caller. The channel
+// count of wavInfo is the same after the call as before it, on success and on
+// failure alike.
+func (wavInfo *WavInfoType) SaveToFileWithChannel(absFile string, voiceChannel int) (err error) {
+	// Put back the real channel count whatever happens while encoding,
+	// rather than assuming the source was stereo.
+	channel := wavInfo.Fmt.Channels
+	defer func() { wavInfo.Fmt.Channels = channel }()
+	buffer, err := wavInfo.SaveToBufferWithChannel(voiceChannel)
+	if err != nil {
+		return err
+	}
+	// 0666 (before umask) is the mode os.Create would have used.
+	return os.WriteFile(absFile, buffer, 0666)
+}
+//AdjusterVolume 调整音量
+func (wavInfo *WavInfoType) AdjusterVolume(rateAsRaw float32) {
+	channel := wavInfo.Fmt.Channels
+	bytePerSample := wavInfo.Fmt.BitsPerSample / 8
+	//blockSize := channel*bytePerSample
+	const MaxUint8 = math.MaxUint8
+	const MinUint8 = 0
+	const MaxInt16 = math.MaxInt16
+	const MinInt16 = math.MinInt16
+	for i := 0; i < len(wavInfo.Data.Sample); i++ {
+		switch bytePerSample {
+		case 1:
+			for c := 0; c < int(channel); c++ {
+				val := wavInfo.Data.Sample[i].val8s[c]
+				clip := float64(val) * float64(rateAsRaw)
+				if clip < MinUint8 {
+					clip = MinUint8
+				}
+				if clip > MaxUint8 {
+					clip = MaxUint8
+				}
+				//wavInfo.Data.Sample[i].val8s[c] = uint8((float32(val) * rateAsRaw))
+				wavInfo.Data.Sample[i].val8s[c] = uint8(clip)
+			}
+		case 2:
+			for c := 0; c < int(channel); c++ {
+				//val := wavInfo.Data.Sample[i].val16s[c]
+				//wavInfo.Data.Sample[i].val16s[c] = int16((float32(val) * rateAsRaw))
+				val := wavInfo.Data.Sample[i].val16s[c]
+				clip := float64(val) * float64(rateAsRaw)
+				if clip < MinInt16 {
+					clip = MinInt16
+				}
+				if clip > MaxInt16 {
+					clip = MaxInt16
+				}
+				wavInfo.Data.Sample[i].val16s[c] = int16(clip)
+			}
+		case 4:
+			for c := 0; c < int(channel); c++ {
+				val := wavInfo.Data.Sample[i].val32s[c]
+				wavInfo.Data.Sample[i].val32s[c] = val * rateAsRaw
+			}
+		}
+	}
+}
+// Resample converts the samples to resampleRate by picking, for every output
+// index i, the input sample at index int(i*SampleRate/resampleRate)
+// (nearest-lower sample, no interpolation or filtering).
+//
+// Fmt.SampleRate is set to resampleRate and Fmt.BytesPerSec is recomputed to
+// match. The call does nothing when resampleRate or the current sample rate
+// is 0, because the rate ratio is then undefined and the selection loop could
+// not terminate safely.
+func (wavInfo *WavInfoType) Resample(resampleRate uint32) {
+	sampleRate := wavInfo.Fmt.SampleRate
+	if resampleRate == 0 || sampleRate == 0 {
+		return
+	}
+	rate := float64(sampleRate) / float64(resampleRate)
+	rawLen := len(wavInfo.Data.Sample)
+	// About rawLen/rate samples come out; size the slice once.
+	resampleData := make([]SampleType, 0, int(float64(rawLen)/rate)+1)
+	for resampleIdx := 0; ; resampleIdx++ {
+		rawIdx := int(float64(resampleIdx) * rate)
+		if rawIdx >= rawLen {
+			break
+		}
+		// A sample block is copied whole, covering all channels and widths.
+		resampleData = append(resampleData, wavInfo.Data.Sample[rawIdx])
+	}
+	wavInfo.Data.Sample = resampleData
+	wavInfo.Fmt.SampleRate = resampleRate
+	// Keep the byte rate in the header in step with the new sample rate.
+	wavInfo.Fmt.BytesPerSec = resampleRate *
+		uint32(wavInfo.Fmt.Channels) * uint32(wavInfo.Fmt.BitsPerSample/8)
+}
+//ConvertToFloat32 将采样值转换到 0 到 1 之间
+func (wavInfo *WavInfoType) GetFloat32Samples(channel int, bytePerSample int) []float32 {
+	//fmt.Println(wavInfo.cha)
+	var floatSamples []float32
+	var point float32
+	for i := 0; i < len(wavInfo.Data.Sample); i++ {
+		sample := &wavInfo.Data.Sample[i]
+		switch bytePerSample {
+		case 1:
+			point = float32(sample.val8s[channel]) / (1 << 8)
+		case 2:
+			point = float32(sample.val16s[channel]) / (1 << 15)
+		case 4:
+			point = sample.val32s[channel]
+		}
+		floatSamples = append(floatSamples, point)
+	}
+	return floatSamples
+}
+//Trim 切头切尾
+func (wavInfo *WavInfoType) Trim(dbPercent float32) {
+	channel := wavInfo.Fmt.Channels
+	bytePerSample := wavInfo.Fmt.BitsPerSample / 8
+	//blockSize := channel*bytePerSample
+	const MaxUint8 = math.MaxUint8
+	const MinUint8 = 0
+	const MaxInt16 = math.MaxInt16
+	const MinInt16 = math.MinInt16
+	//trim head
+	silenceHeadIdx := 0 //头部静音截止位置
+	done := false
+	for i := 0; i < len(wavInfo.Data.Sample); i++ {
+		switch bytePerSample {
+		case 1:
+			for c := 0; c < int(channel); c++ {
+				val := wavInfo.Data.Sample[i].val8s[c]
+				if float32(val) > MaxUint8*dbPercent {
+					silenceHeadIdx = i
+					done = true
+					break
+				}
+			}
+		case 2:
+			for c := 0; c < int(channel); c++ {
+				val := wavInfo.Data.Sample[i].val16s[c]
+				if float32(val) > MaxInt16*dbPercent {
+					silenceHeadIdx = i
+					done = true
+					break
+				}
+				if float32(val) < MinInt16*dbPercent {
+					silenceHeadIdx = i
+					done = true
+					break
+				}
+			}
+		case 4:
+			for c := 0; c < int(channel); c++ {
+				val := wavInfo.Data.Sample[i].val32s[c]
+				if float32(val) > math.MaxFloat32*dbPercent {
+					silenceHeadIdx = i
+					done = true
+					break
+				}
+				if val < 0 && (-val > math.MaxFloat32*dbPercent) {
+					silenceHeadIdx = i
+					done = true
+					break
+				}
+			}
+		}
+		if done {
+			break
+		}
+	}
+	//trim tail,截断尾部
+	silenceTailIdx := len(wavInfo.Data.Sample) - 1 //尾部静音截止位置
+	done = false
+	for i := len(wavInfo.Data.Sample) - 1; i >= 0; i-- {
+		switch bytePerSample {
+		case 1:
+			for c := 0; c < int(channel); c++ {
+				val := wavInfo.Data.Sample[i].val8s[c]
+				if float32(val) > MaxUint8*dbPercent {
+					silenceTailIdx = i
+					done = true
+					break
+				}
+			}
+		case 2:
+			for c := 0; c < int(channel); c++ {
+				val := wavInfo.Data.Sample[i].val16s[c]
+				if val >= 0 && float32(val) > float32(MaxInt16*dbPercent) {
+					silenceTailIdx = i
+					done = true
+					break
+				}
+				if val < 0 && float32(val) < float32(MinInt16*dbPercent) {
+					silenceTailIdx = i
+					done = true
+					break
+				}
+			}
+		case 4:
+			for c := 0; c < int(channel); c++ {
+				val := wavInfo.Data.Sample[i].val32s[c]
+				if float32(val) > math.MaxFloat32*dbPercent {
+					silenceTailIdx = i
+					done = true
+					break
+				}
+				if val < 0 && (-val > math.MaxFloat32*dbPercent) {
+					silenceTailIdx = i
+					done = true
+					break
+				}
+			}
+		}
+		if done {
+			break
+		}
+	}
+	wavInfo.Data.Sample = wavInfo.Data.Sample[:silenceTailIdx]
+	wavInfo.Data.Sample = wavInfo.Data.Sample[silenceHeadIdx:]
+}
+// TrimFirstWithTime drops the first milliseconds of audio, that is
+// SampleRate*milliseconds/1000 sample blocks from the front.
+//
+// It returns an error and leaves the samples untouched when milliseconds is
+// negative or when the span to drop covers the whole recording or more.
+func (wavInfo *WavInfoType) TrimFirstWithTime(milliseconds int64) error {
+	if milliseconds < 0 {
+		return errors.New("check time err")
+	}
+	sizeToTrim := int64(wavInfo.Fmt.SampleRate) * milliseconds / 1000
+	// sizeToTrim < 0 catches an overflow of the multiplication above.
+	if sizeToTrim < 0 || sizeToTrim >= int64(len(wavInfo.Data.Sample)) {
+		return errors.New("check time err")
+	}
+	wavInfo.Data.Sample = wavInfo.Data.Sample[sizeToTrim:]
+	return nil
+}
+// GetWavTime returns the duration of the audio in whole seconds, rounded up.
+// It returns 0 when the sample rate is 0, since the duration is then undefined.
+func (wavInfo *WavInfoType) GetWavTime() int {
+	if wavInfo.Fmt.SampleRate == 0 {
+		return 0
+	}
+	return int(math.Ceil(float64(len(wavInfo.Data.Sample)) / float64(wavInfo.Fmt.SampleRate)))
+}
+// NewWavInfo returns a pointer to a new, zero-valued WavInfoType.
+func NewWavInfo() *WavInfoType {
+	return new(WavInfoType)
+}

dsp/streaming_vad/streaming_vad.go ADDED Viewed

	@@ -0,0 +1,656 @@

+package streaming_vad
+import (
+	"fmt"
+	"math"
+)
+const (
+	FrameFlagSpeechPre = iota
+	FrameFlagSpeechStart
+	FrameFlagSpeechPresent
+	FrameFlagSpeechEnd
+	FrameFlagSpeechPost
+)
+const (
+	VadFlagPrepare     = "VadFlagPrepare"	// preparing
+	VadFlagSpeaking    = "VadFlagSpeaking"	// speaking
+	VadFlagPause       = "VadFlagPause"		// comma-like pause
+	VadFlagNoSpeech    = "VadFlagNoSpeech"	// full-stop-like pause
+	VadFlagUnknown     = "VadFlagUnknown"	// unknown state
+)
+type ParametersForFdType struct {
+	SampleRate                            uint32
+	Threshold                             float32
+	MinThreshold						  float32
+	FrameLengthInSecond                           float32
+	StartRejectUpdateNoiseLevelTimeInSecond       float32
+	StartRejectSpeechTimeInSecond                 float32
+	SpeechStartWindowLengthInSecond               float32            // window length needed to detect the start of speech
+	SpeechPresentWindowLengthInSecond             float32            // window length needed to detect speech activity
+	SpeechEndConfirmWindowLengthInSecond          float32            // window length needed to confirm the end of speech
+	SpeechStartRequiredLengthInSecond             float32
+	SpeechStartConfirmRequiredLengthInSecond      float32
+	SpeechPresentMaintainRequiredLengthInSecond   float32
+	SpeechEndConfirmRequiredLengthInSecond        float32
+}
+// Init overwrites every field of pd with the default settings: 8000 Hz
+// audio cut into 10 ms frames, plus the default thresholds, start-reject
+// times, window lengths and required lengths listed below.
+func (pd *ParametersForFdType) Init() {
+	*pd = ParametersForFdType{
+		SampleRate:          8000,
+		Threshold:           150.0,
+		MinThreshold:        50.0,
+		FrameLengthInSecond: 0.01,
+		// Start Reject
+		StartRejectUpdateNoiseLevelTimeInSecond: 0.2,
+		StartRejectSpeechTimeInSecond:           0.25,
+		// Window Length
+		SpeechStartWindowLengthInSecond:      0.15,
+		SpeechPresentWindowLengthInSecond:    0.4,
+		SpeechEndConfirmWindowLengthInSecond: 0.15,
+		// Required Length
+		SpeechStartRequiredLengthInSecond:           0.09,
+		SpeechStartConfirmRequiredLengthInSecond:    0.075,
+		SpeechPresentMaintainRequiredLengthInSecond: 0.1,
+		SpeechEndConfirmRequiredLengthInSecond:      0.12,
+	}
+}
+type DecisionStateType struct {
+	decisionFlag           bool // decision recorded for a frame by UpdateDecisionState
+	timeInMilliSecond      uint32 // start time of that frame, in milliseconds
+}
+type FrameDecisionType struct {
+	params                                ParametersForFdType
+	sampleRate                            uint32
+	threshold                             float32
+	minThreshold						  float32
+	adaptFactor                           float32
+	//
+	frameLengthInSecond                           float32
+	noiseLevelValue                               float32
+	startRejectUpdateNoiseLevelTimeInSecond       float32
+	startRejectUpdateNoiseLevelFrameNumber        uint32
+	startRejectSpeechTimeInSecond                 float32
+	startRejectSpeechTimeInMilliSecond            uint32
+	speechStartWindowLengthInSecond               float32            // window length needed to detect the start of speech
+	speechPresentWindowLengthInSecond             float32            // window length needed to detect speech activity
+	speechEndConfirmWindowLengthInSecond          float32            // window length needed to confirm the end of speech
+	speechStartRequiredLengthInSecond             float32
+	speechStartConfirmRequiredLengthInSecond      float32
+	speechPresentMaintainRequiredLengthInSecond   float32
+	speechEndConfirmRequiredLengthInSecond        float32
+	decisionStateDeque                    []DecisionStateType
+	decisionStateDequeSize                uint32
+	decisionStateDequeIndex               uint32
+	processedFramesNumber			 uint32
+	lastFrameFlag                    int
+	thisFrameFlag                    int
+}
+// Init configures fd from params and resets its runtime state.
+//
+// The threshold seeds the adapt factor and, halved, the noise level. The
+// decision deque is sized to hold the longest of the three detection windows,
+// measured in frames and rounded to the nearest frame. The processed-frame
+// counter is cleared and both frame flags start at FrameFlagSpeechPre.
+func (fd *FrameDecisionType) Init(params ParametersForFdType) {
+	fd.params = params
+	// thresholds and noise estimate
+	fd.sampleRate = params.SampleRate
+	fd.threshold = params.Threshold
+	fd.minThreshold = params.MinThreshold
+	fd.adaptFactor = params.Threshold
+	fd.noiseLevelValue = params.Threshold / 2.0
+	// frame length and start-reject periods
+	fd.frameLengthInSecond = params.FrameLengthInSecond
+	fd.startRejectUpdateNoiseLevelTimeInSecond = params.StartRejectUpdateNoiseLevelTimeInSecond
+	fd.startRejectUpdateNoiseLevelFrameNumber = uint32(params.StartRejectUpdateNoiseLevelTimeInSecond / params.FrameLengthInSecond)
+	fd.startRejectSpeechTimeInSecond = params.StartRejectSpeechTimeInSecond
+	fd.startRejectSpeechTimeInMilliSecond = uint32(params.StartRejectSpeechTimeInSecond*1e3 + 0.5)
+	// window lengths
+	fd.speechStartWindowLengthInSecond = params.SpeechStartWindowLengthInSecond
+	fd.speechPresentWindowLengthInSecond = params.SpeechPresentWindowLengthInSecond
+	fd.speechEndConfirmWindowLengthInSecond = params.SpeechEndConfirmWindowLengthInSecond
+	// required lengths
+	fd.speechStartRequiredLengthInSecond = params.SpeechStartRequiredLengthInSecond
+	fd.speechStartConfirmRequiredLengthInSecond = params.SpeechStartConfirmRequiredLengthInSecond
+	fd.speechPresentMaintainRequiredLengthInSecond = params.SpeechPresentMaintainRequiredLengthInSecond
+	fd.speechEndConfirmRequiredLengthInSecond = params.SpeechEndConfirmRequiredLengthInSecond
+	// initialize: decisionStateDeque, decisionStateDequeSize, decisionStateDequeIndex
+	largestWindowLengthInSecond := params.SpeechStartWindowLengthInSecond
+	for _, window := range []float32{
+		params.SpeechPresentWindowLengthInSecond,
+		params.SpeechEndConfirmWindowLengthInSecond,
+	} {
+		if largestWindowLengthInSecond < window {
+			largestWindowLengthInSecond = window
+		}
+	}
+	fd.RefreshDecisionStateDeque(uint32(largestWindowLengthInSecond/params.FrameLengthInSecond + 0.5))
+	fd.processedFramesNumber = 0
+	fd.lastFrameFlag, fd.thisFrameFlag = FrameFlagSpeechPre, FrameFlagSpeechPre
+}
+// ProcessStart resets the detector state. It exists because continuous speech
+// that runs too long is forcibly cut, and the state then has to be reset.
+// The decision history is always cleared and both frame flags go back to
+// FrameFlagSpeechPre. When resetThreshold is true the adaptive threshold,
+// adapt factor, noise level and processed-frame counter are also restored to
+// their initial values.
+func (fd *FrameDecisionType) ProcessStart(resetThreshold bool) {
+	fd.lastFrameFlag, fd.thisFrameFlag = FrameFlagSpeechPre, FrameFlagSpeechPre
+	if resetThreshold {
+		fd.processedFramesNumber = 0
+		fd.threshold = fd.params.Threshold
+		fd.noiseLevelValue = fd.threshold / 2.0
+		fd.adaptFactor = fd.threshold
+	}
+	fd.RefreshDecisionStateDeque(fd.decisionStateDequeSize)
+}
+// RefreshDecisionStateDeque replaces the decision history with a freshly
+// zeroed buffer of decisionStateDequeSize entries and rewinds the write index.
+func (fd *FrameDecisionType) RefreshDecisionStateDeque(decisionStateDequeSize uint32) {
+	fd.decisionStateDequeIndex = 0
+	fd.decisionStateDequeSize = decisionStateDequeSize
+	fd.decisionStateDeque = make([]DecisionStateType, decisionStateDequeSize)
+}
+// UpdateDecisionState records the decision for one frame in the circular
+// decision history and advances the write index, wrapping at
+// decisionStateDequeSize.
+//
+// frameStartTimeInMilliSecond is the frame's start time and decisionFlag is
+// whether the frame was judged active. With an empty history the call is a
+// no-op, matching SumDecisionTrue which reports 0 in that case.
+func (fd *FrameDecisionType) UpdateDecisionState (frameStartTimeInMilliSecond uint32, decisionFlag bool) {
+	// Without this guard an empty history panics twice over: the index is out
+	// of range and the modulo below divides by zero.
+	if len(fd.decisionStateDeque) == 0 || fd.decisionStateDequeSize == 0 {
+		return
+	}
+	slot := &fd.decisionStateDeque[fd.decisionStateDequeIndex]
+	slot.decisionFlag = decisionFlag
+	slot.timeInMilliSecond = frameStartTimeInMilliSecond
+	fd.decisionStateDequeIndex = (fd.decisionStateDequeIndex + 1) % fd.decisionStateDequeSize
+}
+// SumDecisionTrue walks the circular decision history backwards from the most
+// recently written entry and returns, in seconds, the accumulated time of the
+// entries whose decisionFlag is true.
+//
+// The walk stops at the first entry whose timestamp is older than
+// durationInSecond before the newest entry, or after decisionStateDequeSize-1
+// steps. An empty history yields 0.
+//
+// NOTE(review): slots that have not been written yet hold timestamp 0. If the
+// history is refreshed while frame timestamps keep growing, an active entry
+// next to such a slot appears to contribute its whole timestamp - confirm.
+func (fd *FrameDecisionType) SumDecisionTrue (durationInSecond float32) (activeDurationInSecond float32) {
+	if len(fd.decisionStateDeque) == 0 {
+		return 0.0
+	}
+	// Index of the most recently written entry (the write index points one past it).
+	indexTemp := int64(fd.decisionStateDequeIndex) - 1
+	if indexTemp < 0 {
+		indexTemp = int64(fd.decisionStateDequeSize) - 1
+	}
+	decisionFlag := fd.decisionStateDeque[indexTemp].decisionFlag
+	endInMilliSecond := int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond)
+	// Oldest timestamp still inside the requested window, clamped at 0.
+	beginInMilliSecond := endInMilliSecond - int64(durationInSecond * 1e3)
+	if beginInMilliSecond < 0 {
+		beginInMilliSecond = 0
+	}
+	var timeSum uint32 = 0
+	for i := uint32(1); i < fd.decisionStateDequeSize; i++ {
+		// Stop once the entry under the cursor lies before the window.
+		if fd.decisionStateDeque[indexTemp].timeInMilliSecond < uint32(beginInMilliSecond) {
+			break
+		}
+		// Step to the previous entry, wrapping around the circular buffer.
+		indexTemp--
+		if indexTemp < 0 {
+			indexTemp = int64(fd.decisionStateDequeSize) - 1
+		}
+		// decisionFlag and endInMilliSecond still describe the entry just left:
+		// if it was active, add the gap between it and its predecessor.
+		if decisionFlag {
+			timeSum += uint32(endInMilliSecond) - fd.decisionStateDeque[indexTemp].timeInMilliSecond
+		}
+		decisionFlag = fd.decisionStateDeque[indexTemp].decisionFlag
+		endInMilliSecond = int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond)
+	}
+	// Milliseconds to seconds.
+	activeDurationInSecond = float32(timeSum) * 1e-3
+	return activeDurationInSecond
+}
+/*
+SpeechFrameProcess processes one audio frame and assigns it a frame flag.
+
+It measures the frame energy, records an active/inactive decision in the
+decision history, advances the Pre -> Start -> Present -> End frame-flag state
+machine stored in thisFrameFlag, and adapts threshold and noiseLevelValue.
+
+frameStartTimeInMilliSecond is the start time of the frame; buffer holds its
+samples. An empty buffer is ignored and leaves the detector untouched.
+*/
+func (fd *FrameDecisionType) SpeechFrameProcess (frameStartTimeInMilliSecond uint32, buffer []int16) {
+	bufferSize := uint32(len(buffer))
+	// An empty frame would give 0/0 below. The resulting NaN would be folded
+	// into threshold and noiseLevelValue and never leave them again.
+	if bufferSize == 0 {
+		return
+	}
+	/**************************Calculate the RMS***************************/
+	sumTemp := int64(0)
+	ssqTemp := int64(0)
+	for i := uint32(0); i < bufferSize; i++ {
+		sumTemp = sumTemp + int64(buffer[i])
+		ssqTemp = ssqTemp + int64(buffer[i]) * int64(buffer[i])
+	}
+	sum := float64(sumTemp)
+	sum /= float64(bufferSize)
+	ssq := float64(ssqTemp)
+	// Mean of squares minus squared mean, i.e. the energy with the DC offset removed.
+	variance := (ssq / float64(bufferSize)) - (sum * sum)
+	if variance < 0 {
+		// Rounding must not push the value below zero: Sqrt would return NaN.
+		variance = 0
+	}
+	rms := float32(math.Sqrt(variance))
+	/**********************************************************************/
+	// Frames inside the initial reject period are never counted as active.
+	var decisionFlag bool
+	if frameStartTimeInMilliSecond < fd.startRejectSpeechTimeInMilliSecond {
+		decisionFlag = false
+	} else {
+		decisionFlag = rms > fd.threshold && rms > 400
+	}
+	fd.UpdateDecisionState(frameStartTimeInMilliSecond, decisionFlag)
+	// Frame-flag state machine, driven by the active time inside each window.
+	if fd.thisFrameFlag == FrameFlagSpeechPre {
+		if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartRequiredLengthInSecond {
+			fd.thisFrameFlag = FrameFlagSpeechStart
+		}
+	} else if fd.thisFrameFlag == FrameFlagSpeechStart {
+		if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartConfirmRequiredLengthInSecond {
+			fd.thisFrameFlag = FrameFlagSpeechPresent
+		} else {
+			//TODO: this branch probably never triggers.
+			if fd.speechStartConfirmRequiredLengthInSecond != 0 {
+				fd.thisFrameFlag = FrameFlagSpeechPre
+			}
+		}
+	} else if fd.thisFrameFlag == FrameFlagSpeechPresent {
+		if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) < fd.speechPresentMaintainRequiredLengthInSecond {
+			fd.thisFrameFlag = FrameFlagSpeechEnd
+		}
+	} else if fd.thisFrameFlag == FrameFlagSpeechEnd {
+		if fd.SumDecisionTrue(fd.speechEndConfirmWindowLengthInSecond) <= fd.speechEndConfirmRequiredLengthInSecond {
+			fd.thisFrameFlag = FrameFlagSpeechPre
+		} else if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) >= fd.speechPresentMaintainRequiredLengthInSecond {
+			// Original author's note: this condition should perhaps be < rather
+			// than >=. The intent may have been a brief pause here, used for
+			// inserting a comma.
+			fd.thisFrameFlag = FrameFlagSpeechPre
+		}
+	}
+	// Adapt the decision threshold: follow the background while idle, follow
+	// the speech level while speech is present.
+	if fd.thisFrameFlag == FrameFlagSpeechPre && !decisionFlag {
+		fd.threshold = (0.02 * rms * 2) + (0.98 * fd.threshold)
+		fd.adaptFactor = fd.threshold
+	} else if decisionFlag && fd.thisFrameFlag == FrameFlagSpeechPresent {
+		if rms < fd.adaptFactor {
+			fd.adaptFactor = 0.01 * rms + 0.99 * fd.adaptFactor
+		} else {
+			fd.adaptFactor = 0.05 * rms + 0.95 * fd.adaptFactor
+		}
+		thresholdTemp := fd.noiseLevelValue + 0.3 * fd.adaptFactor
+		fd.threshold = (0.1 * thresholdTemp) + 0.9 * fd.threshold
+	}
+	// Never let the threshold fall below the configured floor.
+	if fd.threshold < fd.minThreshold {
+		fd.threshold = fd.minThreshold
+	}
+	// Update the noise level: during warm-up blend from the measured energy
+	// towards the running estimate, afterwards rise slowly and fall quickly.
+	if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber {
+		alphaAdapt := float32(fd.processedFramesNumber) / float32(fd.startRejectUpdateNoiseLevelFrameNumber)
+		fd.noiseLevelValue = (alphaAdapt * fd.noiseLevelValue) + ((1 - alphaAdapt) * rms)
+	} else {
+		if rms > fd.noiseLevelValue {
+			fd.noiseLevelValue = (0.001 * rms) + (0.999 * fd.noiseLevelValue)
+		} else {
+			fd.noiseLevelValue = (0.05 * rms) + (0.95 * fd.noiseLevelValue)
+		}
+	}
+	// During warm-up the threshold is derived directly from the noise level.
+	if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber {
+		if fd.noiseLevelValue > 400 {
+			fd.noiseLevelValue = fd.noiseLevelValue * 0.1
+		}
+		fd.threshold = fd.noiseLevelValue * 2
+		if fd.threshold < fd.minThreshold {
+			fd.threshold = fd.minThreshold
+		}
+	}
+	fd.processedFramesNumber++
+}
+// VadEventMarkerType is one entry of StreamingVadType.VadEventMarkerDeque: a
+// VAD state change together with the stream time at which it applies.
+type VadEventMarkerType struct {
+	// VadFlag is the state being entered (ProcessSpeech stores VadFlag* values here).
+	VadFlag                string
+	// Time is the event time in milliseconds of processed audio.
+	Time                   uint32
+}
+// StreamingVadType is a streaming voice-activity detector: audio is fed in
+// chunks, split into fixed-size frames, and state changes are appended to
+// VadEventMarkerDeque.
+type StreamingVadType struct{
+	sampleRate                                     uint32		    	// sample rate
+	silenceTime                                    float32			// silence duration required to decide that speech has ended
+	timeout                                        float32			// maximum length of one utterance; longer speech activity is forcibly ended
+	timeoutInMilliSecond                           uint32
+	// VAD works frame by frame. Each time audio arrives, the part left over
+	// after dividing by frameLength is saved and prepended to the next chunk.
+	frameLength                                    uint32             // length of each frame, in samples
+	unfinishedFrame                                []int16            // leftover samples of an incomplete frame
+	unfinishedFrameSize                            uint32             // number of leftover samples
+	frameDecision                                  FrameDecisionType
+	//
+	startRejectSpeechTimeInMilliSecond	        		  uint32
+	allowedSilenceTimeInSpeechInMilliSecond               uint32
+	allowedLongestSpeechDurationInMilliSecond             uint32
+	minDurationOfLongSpeechInMilliSecond                  uint32
+	endOfLongSpeechRequiredSilenceTimeInMilliSecond       uint32
+	endOfNormalSpeechRequiredSilenceTimeInMilliSecond     uint32
+	minDurationOfSpeechToAddCommaInMilliSecond            uint32        //where to add comma if speech
+	// Speech is only known to have started once the VadFlagSpeaking label is
+	// detected. The VadFlagPrepare position is defined as that moment moved
+	// back by prepareDurationInMilliSecond.
+	prepareDurationInMilliSecond                          uint32
+	// When the end of speech is detected it is not accepted immediately; it is
+	// confirmed only after a period of silence (SpeechEndConfirmWindowLengthInSecond).
+	// The end-of-speech event is placed nonSpeechPadInInMilliSecond after the
+	// detected end.
+	nonSpeechPadInInMilliSecond                           uint32
+    speechFrameGlobalTimeInMilliSecond             uint32
+	speechDetectedStartTimeInMilliSecond           uint32
+	speechDetectedStartTimeIsValid                 bool
+	speechDetectedEndTimeInMilliSecond		       uint32
+	speechDetectedEndTimeIsValid                   bool
+	speechDetectedEndTimeIsValidPossible           bool
+	speechDetectedStartAndEnd                      bool
+	//
+	lastVadEndTimeInMilliSecond      uint32
+	thisDetectedState                string                 //VadFlag
+	// VadEventMarkerDeque collects the markers appended by ProcessSpeech.
+	VadEventMarkerDeque	             []VadEventMarkerType
+}
+/*
+Init configures the detector for a stream and clears all detection state.
+
+sampleRate is the sample rate of the incoming audio; the frame length is 20 ms
+worth of samples. silenceTime is the silence, in seconds, needed to end an
+utterance (the original author notes 0.4 as an example). timeout is the longest
+utterance, in seconds, before a forced cut (example: 3.0).
+
+The following conditions should hold:
+minDurationOfSpeechToAddCommaInMilliSecond < endOfNormalSpeechRequiredSilenceTimeInMilliSecond
+endOfNormalSpeechRequiredSilenceTimeInMilliSecond < endOfLongSpeechRequiredSilenceTimeInMilliSecond
+*/
+func (sv *StreamingVadType) Init (sampleRate uint32, silenceTime float32, timeout float32) {
+	sv.sampleRate = sampleRate
+	// The field was declared but never filled in; keep it in step with the
+	// millisecond values derived from silenceTime below.
+	sv.silenceTime = silenceTime
+	sv.timeout = timeout
+	sv.timeoutInMilliSecond = uint32(timeout * 1e3)
+	sv.frameLength = uint32(0.02 * float32(sampleRate))
+	// Drop leftover samples together with their counter, so a re-initialised
+	// detector does not prepend audio from the previous stream.
+	sv.unfinishedFrame = nil
+	sv.unfinishedFrameSize = 0
+	var params ParametersForFdType
+	params.Init()
+	params.SampleRate = sampleRate
+	sv.frameDecision.Init(params)
+	//
+	sv.startRejectSpeechTimeInMilliSecond = uint32(0.7 * 1e3)
+	sv.allowedSilenceTimeInSpeechInMilliSecond = uint32(0.2 * 1e3)
+	sv.minDurationOfLongSpeechInMilliSecond = 0
+	sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond = 0
+	sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond = uint32(silenceTime * 1e3)
+	sv.minDurationOfSpeechToAddCommaInMilliSecond = uint32(0.3 * 1e3)
+	sv.allowedLongestSpeechDurationInMilliSecond = 0
+	// The prepare marker is placed two start-windows before the speaking marker.
+	sv.prepareDurationInMilliSecond = uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3) * 2
+	// The end marker is padded by half of the required silence.
+	sv.nonSpeechPadInInMilliSecond = uint32(silenceTime * 1e3 * 0.5)
+	sv.speechFrameGlobalTimeInMilliSecond = 0
+	//
+	sv.speechDetectedStartTimeInMilliSecond = 0
+	sv.speechDetectedStartTimeIsValid = false
+	sv.speechDetectedEndTimeInMilliSecond = 0
+	sv.speechDetectedEndTimeIsValid = false
+	sv.speechDetectedEndTimeIsValidPossible = false
+	sv.speechDetectedStartAndEnd = false
+	//
+	sv.lastVadEndTimeInMilliSecond = 0
+	sv.thisDetectedState = VadFlagNoSpeech
+	fmt.Println("do StreamingVad Init...")
+}
+func (sv *StreamingVadType) ProcessSpeechByChunk(buffer []int16) (err error) {
+	var validBuffer []int16
+	//unfinished frame
+	unfinishedFrameSize := uint32(len(sv.unfinishedFrame))
+	for i := uint32(0); i < unfinishedFrameSize; i++ {
+		validBuffer = append(validBuffer, sv.unfinishedFrame[i])
+	}
+	//buffer
+	bufferSize := uint32(len(buffer))
+	for i := uint32(0); i < bufferSize; i++ {
+		validBuffer = append(validBuffer, buffer[i])
+	}
+	//remainder
+	remainderSize := uint32(len(validBuffer)) % sv.frameLength
+	boundary := uint32(len(validBuffer)) - remainderSize
+	sv.unfinishedFrame = validBuffer[boundary:]
+	validBuffer = validBuffer[:boundary]
+	if uint32(len(validBuffer)) > sv.frameLength {
+		sv.ProcessSpeech(validBuffer)
+	}
+	return nil
+}
+//ProcessSpeechByChunk 需要将 buffer 更新成指定 frameLength 的倍数, 多余的部分保存起来以供下次使用.
+func (sv *StreamingVadType) DeprecatedProcessSpeechByChunk(buffer []int16) (err error) {
+	bufferSize := uint32(len(buffer))
+	var validBuffer []int16
+	var unfinishedFrame []int16
+	var point int16
+	validSize := (uint32(len(buffer)) + sv.unfinishedFrameSize) / sv.frameLength * sv.frameLength
+	if validSize >= sv.frameLength {
+		if sv.unfinishedFrameSize != 0 {
+			for i := uint32(0); i < sv.unfinishedFrameSize; i++ {
+				point = sv.unfinishedFrame[i]
+				validBuffer = append(validBuffer, point)
+			}
+			for i := uint32(0); i < validSize - sv.unfinishedFrameSize; i++ {
+				point = buffer[i]
+				validBuffer = append(validBuffer, point)
+			}
+		} else {
+			for i := uint32(0); i < validSize; i++ {
+				point = buffer[i]
+				validBuffer = append(validBuffer, point)
+			}
+		}
+		sv.ProcessSpeech(validBuffer)
+	}
+	//fmt.Printf("validBuffer size: %d\n", len(validBuffer))
+	//fmt.Printf("validSize: %d\n", validSize)
+	//fmt.Printf("last unfinishedFrameSize: %d\n", sv.unfinishedFrameSize)
+	sv.unfinishedFrameSize = (bufferSize + sv.unfinishedFrameSize) - validSize
+	begin := bufferSize - sv.unfinishedFrameSize - 1
+	for i := uint32(0); i < sv.unfinishedFrameSize; i++ {
+		point = buffer[begin + i]
+		unfinishedFrame = append(unfinishedFrame, point)
+	}
+	sv.unfinishedFrame = unfinishedFrame
+	fmt.Println("do StreamingVad ProcessSpeechByChunk...")
+	return nil
+}
+// ProcessSpeech decides the VAD state from the previous frame's flag and the
+// current frame's flag.
+//
+// buffer must hold a whole number of frames, otherwise the function panics.
+// Each frame is classified by frameDecision.SpeechFrameProcess; the flag
+// transitions are then turned into VadFlagPrepare / VadFlagSpeaking /
+// VadFlagPause / VadFlagNoSpeech markers appended to VadEventMarkerDeque.
+// The global frame clock advances by one frame duration per frame.
+func (sv *StreamingVadType) ProcessSpeech(buffer []int16) {
+	bufferLength := uint32(len(buffer))
+	if bufferLength % sv.frameLength != 0 {
+		panic(fmt.Sprintf("bufferLength (%d) should be a multiple of B frameLength (%d)", bufferLength, sv.frameLength))
+	}
+	var frameBuffer []int16
+	for begin := uint32(0); begin + sv.frameLength <= bufferLength;  {
+		frameBuffer = buffer[begin: begin + sv.frameLength]
+		sv.frameDecision.SpeechFrameProcess(sv.speechFrameGlobalTimeInMilliSecond, frameBuffer)
+		begin += sv.frameLength
+		// Transition Start -> Present: speech activity has just been confirmed.
+		if sv.frameDecision.lastFrameFlag == FrameFlagSpeechStart && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPresent {
+			if sv.thisDetectedState == VadFlagNoSpeech {
+				//start
+				// The prepare marker sits prepareDurationInMilliSecond before
+				// now, clamped at 0 and never earlier than the previous VAD end.
+				var prepareTime uint32 = 0
+				if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
+					prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
+				}
+				if prepareTime < sv.lastVadEndTimeInMilliSecond {
+					prepareTime = sv.lastVadEndTimeInMilliSecond
+				}
+				vadEventMarker := VadEventMarkerType{
+					VadFlag: VadFlagPrepare,
+					Time: prepareTime,
+				}
+				sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
+				sv.thisDetectedState = VadFlagSpeaking
+				vadEventMarker = VadEventMarkerType{
+					VadFlag: VadFlagSpeaking,
+					Time: sv.speechFrameGlobalTimeInMilliSecond,
+				}
+				sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
+				//
+				sv.speechDetectedEndTimeInMilliSecond = uint32(0)
+				sv.speechDetectedEndTimeIsValid = false
+				//sv.speechDetectedEndTimeIsValidPossible = false
+				// NOTE(review): unsigned subtraction; it would wrap if the clock
+				// were smaller than the start window - confirm this cannot happen.
+				sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond  - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3)
+				sv.speechDetectedStartTimeIsValid = true
+			} else if sv.thisDetectedState == VadFlagSpeaking && sv.speechDetectedEndTimeIsValid &&
+				sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond > sv.minDurationOfSpeechToAddCommaInMilliSecond &&
+				sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond < sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond {
+				//pause
+				// Speech resumed after a gap longer than the comma threshold but
+				// shorter than the end-of-speech silence: mark a pause at the
+				// pending end time, then speaking again from now.
+				vadEventMarker := VadEventMarkerType{
+					VadFlag: VadFlagPause,
+					Time: sv.speechDetectedEndTimeInMilliSecond,
+				}
+				sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
+				sv.thisDetectedState = VadFlagSpeaking
+				vadEventMarker = VadEventMarkerType{
+					VadFlag: VadFlagSpeaking,
+					Time: sv.speechFrameGlobalTimeInMilliSecond,
+				}
+				sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
+				//
+				sv.speechDetectedEndTimeInMilliSecond = uint32(0)
+				sv.speechDetectedEndTimeIsValid = false
+				//sv.speechDetectedEndTimeIsValidPossible = false
+			} else if sv.thisDetectedState == VadFlagSpeaking &&
+				sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond <= sv.minDurationOfSpeechToAddCommaInMilliSecond {
+				// The gap was too short to count as a pause: drop the pending end.
+				//
+				sv.speechDetectedEndTimeInMilliSecond = uint32(0)
+				sv.speechDetectedEndTimeIsValid = false
+				//sv.speechDetectedEndTimeIsValidPossible = false
+			} else {}
+		}
+		//end
+		// Transition End -> Pre: remember a candidate end time; it is confirmed below.
+		if sv.frameDecision.lastFrameFlag == FrameFlagSpeechEnd && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPre {
+			sv.speechDetectedEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
+			sv.speechDetectedEndTimeIsValid = true
+			//sv.speechDetectedEndTimeIsValidPossible = true
+		}
+		// A VAD end can only be detected once a certain time has passed since the start.
+		if sv.speechFrameGlobalTimeInMilliSecond > sv.startRejectSpeechTimeInMilliSecond {
+			if sv.speechDetectedEndTimeIsValid {
+				// Long utterances may use their own required silence, but only
+				// when both long-speech settings are non-zero.
+				var endOfSpeechRequiredSilenceTime uint32
+				if sv.minDurationOfLongSpeechInMilliSecond > 0 &&
+					sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond > 0 &&
+					(sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond) > sv.minDurationOfLongSpeechInMilliSecond {
+					endOfSpeechRequiredSilenceTime = sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond
+				} else {
+					endOfSpeechRequiredSilenceTime = sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond
+				}
+				// Enough silence since the candidate end: close the utterance at
+				// the candidate end plus the non-speech padding.
+				if (sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond) >= endOfSpeechRequiredSilenceTime {
+					endTime := sv.speechDetectedEndTimeInMilliSecond + sv.nonSpeechPadInInMilliSecond
+					sv.speechDetectedEndTimeInMilliSecond = uint32(0)
+					sv.speechDetectedEndTimeIsValid = false
+					sv.speechDetectedStartTimeInMilliSecond = uint32(0)
+					sv.speechDetectedStartTimeIsValid = false
+					sv.thisDetectedState = VadFlagNoSpeech
+					sv.lastVadEndTimeInMilliSecond = endTime
+					vadEventMarker := VadEventMarkerType{
+						VadFlag: VadFlagNoSpeech,
+						Time: endTime,
+					}
+					sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
+				}
+			}
+		}
+		// When the speech duration exceeds the timeout, force a cut: close the
+		// current utterance now and immediately open a new one.
+		if sv.speechDetectedStartTimeIsValid && sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond > sv.timeoutInMilliSecond {
+			//end
+			sv.speechDetectedEndTimeInMilliSecond = uint32(0)
+			sv.speechDetectedEndTimeIsValid = false
+			sv.speechDetectedStartTimeInMilliSecond = uint32(0)
+			sv.speechDetectedStartTimeIsValid = false
+			sv.thisDetectedState = VadFlagNoSpeech
+			sv.lastVadEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
+			vadEventMarker := VadEventMarkerType{
+				VadFlag: VadFlagNoSpeech,
+				Time: sv.speechFrameGlobalTimeInMilliSecond,
+			}
+			sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
+			//start
+			// Same prepare-time rule as above; lastVadEndTimeInMilliSecond was
+			// just set to the current time, so the clamp yields the current time.
+			var prepareTime uint32 = 0
+			if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
+				prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
+			}
+			if prepareTime < sv.lastVadEndTimeInMilliSecond {
+				prepareTime = sv.lastVadEndTimeInMilliSecond
+			}
+			vadEventMarker = VadEventMarkerType{
+				VadFlag: VadFlagPrepare,
+				Time: prepareTime,
+			}
+			sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
+			sv.thisDetectedState = VadFlagSpeaking
+			vadEventMarker = VadEventMarkerType{
+				VadFlag: VadFlagSpeaking,
+				Time: sv.speechFrameGlobalTimeInMilliSecond,
+			}
+			sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
+			//
+			sv.speechDetectedEndTimeInMilliSecond = uint32(0)
+			sv.speechDetectedEndTimeIsValid = false
+			//sv.speechDetectedEndTimeIsValidPossible = false
+			sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond  - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3)
+			sv.speechDetectedStartTimeIsValid = true
+		}
+		//loop
+		// Remember this frame's flag and advance the clock by one frame
+		// duration, truncated to whole milliseconds.
+		sv.frameDecision.lastFrameFlag = sv.frameDecision.thisFrameFlag
+		sv.speechFrameGlobalTimeInMilliSecond += uint32(float32(sv.frameLength) / float32(sv.sampleRate) * 1e3)
+	}
+}

go.mod ADDED Viewed

	@@ -0,0 +1 @@


1	+ module vad_go

log.py ADDED Viewed

	@@ -0,0 +1,110 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import logging
+from logging.handlers import TimedRotatingFileHandler
+import os
+def setup(log_directory: str):
+    fmt = "%(asctime)s - %(name)s - %(levelname)s  %(filename)s:%(lineno)d >  %(message)s"
+    stream_handler = logging.StreamHandler()
+    stream_handler.setLevel(logging.INFO)
+    stream_handler.setFormatter(logging.Formatter(fmt))
+    # main
+    main_logger = logging.getLogger("main")
+    main_logger.addHandler(stream_handler)
+    main_info_file_handler = TimedRotatingFileHandler(
+        filename=os.path.join(log_directory, "main.log"),
+        encoding="utf-8",
+        when="midnight",
+        interval=1,
+        backupCount=30
+    )
+    main_info_file_handler.setLevel(logging.INFO)
+    main_info_file_handler.setFormatter(logging.Formatter(fmt))
+    main_logger.addHandler(main_info_file_handler)
+    # http
+    http_logger = logging.getLogger("http")
+    http_file_handler = TimedRotatingFileHandler(
+        filename=os.path.join(log_directory, "http.log"),
+        encoding='utf-8',
+        when="midnight",
+        interval=1,
+        backupCount=30
+    )
+    http_file_handler.setLevel(logging.DEBUG)
+    http_file_handler.setFormatter(logging.Formatter(fmt))
+    http_logger.addHandler(http_file_handler)
+    # api
+    api_logger = logging.getLogger("api")
+    api_file_handler = TimedRotatingFileHandler(
+        filename=os.path.join(log_directory, "api.log"),
+        encoding='utf-8',
+        when="midnight",
+        interval=1,
+        backupCount=30
+    )
+    api_file_handler.setLevel(logging.DEBUG)
+    api_file_handler.setFormatter(logging.Formatter(fmt))
+    api_logger.addHandler(api_file_handler)
+    # alarm
+    alarm_logger = logging.getLogger("alarm")
+    alarm_file_handler = TimedRotatingFileHandler(
+        filename=os.path.join(log_directory, "alarm.log"),
+        encoding="utf-8",
+        when="midnight",
+        interval=1,
+        backupCount=30
+    )
+    alarm_file_handler.setLevel(logging.DEBUG)
+    alarm_file_handler.setFormatter(logging.Formatter(fmt))
+    alarm_logger.addHandler(alarm_file_handler)
+    debug_file_handler = TimedRotatingFileHandler(
+        filename=os.path.join(log_directory, "debug.log"),
+        encoding="utf-8",
+        when="D",
+        interval=1,
+        backupCount=7
+    )
+    debug_file_handler.setLevel(logging.DEBUG)
+    debug_file_handler.setFormatter(logging.Formatter(fmt))
+    info_file_handler = TimedRotatingFileHandler(
+        filename=os.path.join(log_directory, "info.log"),
+        encoding="utf-8",
+        when="D",
+        interval=1,
+        backupCount=7
+    )
+    info_file_handler.setLevel(logging.INFO)
+    info_file_handler.setFormatter(logging.Formatter(fmt))
+    error_file_handler = TimedRotatingFileHandler(
+        filename=os.path.join(log_directory, "error.log"),
+        encoding="utf-8",
+        when="D",
+        interval=1,
+        backupCount=7
+    )
+    error_file_handler.setLevel(logging.ERROR)
+    error_file_handler.setFormatter(logging.Formatter(fmt))
+    logging.basicConfig(
+        level=logging.DEBUG,
+        datefmt="%a, %d %b %Y %H:%M:%S",
+        handlers=[
+            debug_file_handler,
+            info_file_handler,
+            error_file_handler,
+        ]
+    )
+# Nothing to run directly; this module only provides setup().
+if __name__ == "__main__":
+    pass

main.py ADDED Viewed

	@@ -0,0 +1,133 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import logging
+from pathlib import Path
+import platform
+import re
+from project_settings import project_path, log_directory
+import log
+log.setup(log_directory=log_directory)
+import gradio as gr
+from toolbox.os.command import Command
+main_logger = logging.getLogger("main")
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--example_wav_dir",
+        default=(project_path / "data/examples").as_posix(),
+        type=str
+    )
+    args = parser.parse_args()
+    return args
+def process_uploaded_file(filename: str) -> str:
+    filename = Path(filename).as_posix()
+    main_logger.info("asr recognize: {}".format(filename))
+    cmd = "build/asr_id --filename {}".format(
+        filename
+    )
+    asr_result = Command.popen(cmd)
+    pattern = "text: (.*)textSize: (.*)wordSize: (.*)timeCost: (.+)"
+    match = re.search(pattern, asr_result, flags=re.IGNORECASE | re.DOTALL)
+    if match is None:
+        raise AssertionError("run asr recognize failed: \n{}".format(asr_result))
+    text = match.group(1)
+    return text
+def shell(cmd: str):
+    return Command.popen(cmd)
+def main():
+    """
+    Build the Gradio interface and start the web server.
+    The UI has an upload tab that sends an audio file to
+    process_uploaded_file and a "shell" tab that sends a command line to shell.
+    """
+    args = get_args()
+    title = "## 针对电话场景的印尼语ASR."
+    # examples: every *.wav file in the example directory, one row per file
+    example_wav_dir = Path(args.example_wav_dir)
+    examples = list()
+    for filename in example_wav_dir.glob("*.wav"):
+        examples.append(
+            [
+                filename.as_posix()
+            ]
+        )
+    # blocks
+    with gr.Blocks() as blocks:
+        gr.Markdown(value=title)
+        with gr.Tabs():
+            with gr.TabItem("Upload from disk"):
+                uploaded_file = gr.Audio(
+                    sources=["upload"],
+                    type="filepath",
+                    label="Upload from disk",
+                )
+                upload_button = gr.Button("Submit for recognition")
+                uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
+            # Clickable examples feed the same input and output components.
+            gr.Examples(
+                examples=examples,
+                inputs=[
+                    uploaded_file,
+                ],
+                outputs=[
+                    uploaded_output
+                ],
+                fn=process_uploaded_file
+            )
+            upload_button.click(
+                process_uploaded_file,
+                inputs=[
+                    uploaded_file,
+                ],
+                outputs=[
+                    uploaded_output
+                ],
+            )
+            with gr.TabItem("shell"):
+                shell_text = gr.Textbox(label="cmd")
+                shell_button = gr.Button("run")
+                shell_output = gr.Textbox(label="output")
+            shell_button.click(
+                shell,
+                inputs=[
+                    shell_text,
+                ],
+                outputs=[
+                    shell_output
+                ],
+            )
+    # Both branches of the `share` expression are False, so sharing is always
+    # off. On Windows the server binds to localhost only, elsewhere to all
+    # interfaces.
+    blocks.queue().launch(
+        share=False if platform.system() == "Windows" else False,
+        server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
+        server_port=7860
+    )
+    return
+# Script entry point: build the UI and start the server.
+if __name__ == "__main__":
+    main()

project_settings.py ADDED Viewed

	@@ -0,0 +1,25 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import os
+from pathlib import Path
+from toolbox.os.environment import EnvironmentManager
+project_path = os.path.abspath(os.path.dirname(__file__))
+project_path = Path(project_path)
+log_directory = project_path / "logs"
+log_directory.mkdir(parents=True, exist_ok=True)
+temp_directory = project_path / "temp"
+temp_directory.mkdir(parents=True, exist_ok=True)
+environment = EnvironmentManager(
+    path=os.path.join(project_path, "dotenv"),
+    env=os.environ.get("environment", "dev"),
+)
+if __name__ == '__main__':
+    pass

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ gradio==4.36.1

toolbox/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Package marker module; running it directly does nothing.
+if __name__ == '__main__':
+    pass

toolbox/json/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Package marker module; running it directly does nothing.
+if __name__ == '__main__':
+    pass

toolbox/json/misc.py ADDED Viewed

	@@ -0,0 +1,63 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Callable
+def traverse(js, callback: Callable, *args, **kwargs):
+    if isinstance(js, list):
+        result = list()
+        for l in js:
+            l = traverse(l, callback, *args, **kwargs)
+            result.append(l)
+        return result
+    elif isinstance(js, tuple):
+        result = list()
+        for l in js:
+            l = traverse(l, callback, *args, **kwargs)
+            result.append(l)
+        return tuple(result)
+    elif isinstance(js, dict):
+        result = dict()
+        for k, v in js.items():
+            k = traverse(k, callback, *args, **kwargs)
+            v = traverse(v, callback, *args, **kwargs)
+            result[k] = v
+        return result
+    elif isinstance(js, int):
+        return callback(js, *args, **kwargs)
+    elif isinstance(js, str):
+        return callback(js, *args, **kwargs)
+    else:
+        return js
+def demo1():
+    d = {
+        "env": "ppe",
+        "mysql_connect": {
+            "host": "$mysql_connect_host",
+            "port": 3306,
+            "user": "callbot",
+            "password": "NxcloudAI2021!",
+            "database": "callbot_ppe",
+            "charset": "utf8"
+        },
+        "es_connect": {
+            "hosts": ["10.20.251.8"],
+            "http_auth": ["elastic", "ElasticAI2021!"],
+            "port": 9200
+        }
+    }
+    def callback(s):
+        if isinstance(s, str) and s.startswith('$'):
+            return s[1:]
+        return s
+    result = traverse(d, callback=callback)
+    print(result)
+    return
+# Running this module directly executes the demo.
+if __name__ == '__main__':
+    demo1()

toolbox/os/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Package marker module; running it directly does nothing.
+if __name__ == '__main__':
+    pass

toolbox/os/command.py ADDED Viewed

	@@ -0,0 +1,59 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import os
+class Command(object):
+    custom_command = [
+        "cd"
+    ]
+    @staticmethod
+    def _get_cmd(command):
+        command = str(command).strip()
+        if command == "":
+            return None
+        cmd_and_args = command.split(sep=" ")
+        cmd = cmd_and_args[0]
+        args = " ".join(cmd_and_args[1:])
+        return cmd, args
+    @classmethod
+    def popen(cls, command):
+        cmd, args = cls._get_cmd(command)
+        if cmd in cls.custom_command:
+            method = getattr(cls, cmd)
+            return method(args)
+        else:
+            resp = os.popen(command)
+            result = resp.read()
+            resp.close()
+            return result
+    @classmethod
+    def cd(cls, args):
+        if args.startswith("/"):
+            os.chdir(args)
+        else:
+            pwd = os.getcwd()
+            path = os.path.join(pwd, args)
+            os.chdir(path)
+    @classmethod
+    def system(cls, command):
+        return os.system(command)
+    def __init__(self):
+        pass
+def ps_ef_grep(keyword: str):
+    cmd = "ps -ef | grep {}".format(keyword)
+    rows = Command.popen(cmd)
+    rows = str(rows).split("\n")
+    rows = [row for row in rows if row.__contains__(keyword) and not row.__contains__("grep")]
+    return rows
+# Nothing to run directly; this module only provides helpers.
+if __name__ == "__main__":
+    pass

toolbox/os/environment.py ADDED Viewed

	@@ -0,0 +1,114 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import json
+import os
+from dotenv import load_dotenv
+from dotenv.main import DotEnv
+from toolbox.json.misc import traverse
+class EnvironmentManager(object):
+    def __init__(self, path, env, override=False):
+        filename = os.path.join(path, '{}.env'.format(env))
+        self.filename = filename
+        load_dotenv(
+            dotenv_path=filename,
+            override=override
+        )
+        self._environ = dict()
+    def open_dotenv(self, filename: str = None):
+        filename = filename or self.filename
+        dotenv = DotEnv(
+            dotenv_path=filename,
+            stream=None,
+            verbose=False,
+            interpolate=False,
+            override=False,
+            encoding="utf-8",
+        )
+        result = dotenv.dict()
+        return result
+    def get(self, key, default=None, dtype=str):
+        result = os.environ.get(key)
+        if result is None:
+            if default is None:
+                result = None
+            else:
+                result = default
+        else:
+            result = dtype(result)
+        self._environ[key] = result
+        return result
+# Default converters used by JsonConfig: the dtype name written in a
+# `$dtype:key` placeholder mapped to the callable applied to the looked-up value.
+_DEFAULT_DTYPE_MAP = {
+    'int': int,
+    'float': float,
+    'str': str,
+    'json.loads': json.loads
+}
+class JsonConfig(object):
+    """
+    将 json 中, 形如 `$float:threshold` 的值, 处理为:
+    从环境变量中查到 threshold, 再将其转换为 float 类型.
+    """
+    def __init__(self, dtype_map: dict = None, environment: EnvironmentManager = None):
+        self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP
+        self.environment = environment or os.environ
+    def sanitize_by_filename(self, filename: str):
+        with open(filename, 'r', encoding='utf-8') as f:
+            js = json.load(f)
+        return self.sanitize_by_json(js)
+    def sanitize_by_json(self, js):
+        js = traverse(
+            js,
+            callback=self.sanitize,
+            environment=self.environment
+        )
+        return js
+    def sanitize(self, string, environment):
+        """支持 $ 符开始的, 环境变量配置"""
+        if isinstance(string, str) and string.startswith('$'):
+            dtype, key = string[1:].split(':')
+            dtype = self.dtype_map[dtype]
+            value = environment.get(key)
+            if value is None:
+                raise AssertionError('environment not exist. key: {}'.format(key))
+            value = dtype(value)
+            result = value
+        else:
+            result = string
+        return result
+def demo1():
+    import json
+    from project_settings import project_path
+    environment = EnvironmentManager(
+        path=os.path.join(project_path, 'server/callbot_server/dotenv'),
+        env='dev',
+    )
+    init_scenes = environment.get(key='init_scenes', dtype=json.loads)
+    print(init_scenes)
+    print(environment._environ)
+    return
+# Run the demo when this module is executed directly.
+if __name__ == '__main__':
+    demo1()

toolbox/os/other.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import os
+import inspect
+def pwd():
+    """你在哪个文件调用此函数, 它就会返回那个文件所在的 dir 目标"""
+    frame = inspect.stack()[1]
+    module = inspect.getmodule(frame[0])
+    return os.path.dirname(os.path.abspath(module.__file__))

vad_go.go ADDED Viewed

	@@ -0,0 +1,91 @@

+package main
+import (
+	"flag"
+	"vad_go/dsp/audio"
+	"vad_go/dsp/streaming_vad"
+	"fmt"
+	"os"
+	"path"
+)
+// main runs the streaming VAD over the wav file named by the -filename flag
+// and prints the results to stdout.
+//
+// The flag value is joined onto the current working directory. The samples
+// are converted to int16 and fed to the detector in windows of 7000 samples;
+// afterwards the number of recorded markers and each marker's Time and
+// VadFlag are printed.
+func main() {
+	args := flag.String("filename", "", "input wav audio file")
+	flag.Parse()
+	pwd, err := os.Getwd()
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+	filename := path.Join(pwd, *args)
+	fmt.Println(filename)
+	var wavInfo audio.WavInfoType
+	if parseErr := wavInfo.ParseFromFile(filename); parseErr != nil {
+		// Stop here: everything below would otherwise run on an unparsed file.
+		fmt.Println("wavInfo.ParseFromFile failed.")
+		fmt.Fprintln(os.Stderr, parseErr)
+		os.Exit(1)
+	}
+	fmt.Printf("sample rate: %d\n", wavInfo.Fmt.SampleRate)
+	float32Samples := wavInfo.GetFloat32Samples(0, 2)
+	// Never index past the converted samples, even if the two lengths disagree.
+	sampleCount := len(wavInfo.Data.Sample)
+	if len(float32Samples) < sampleCount {
+		sampleCount = len(float32Samples)
+	}
+	int16Samples := make([]int16, 0, sampleCount)
+	for _, sample := range float32Samples[:sampleCount] {
+		// Same scaling and +1 offset as before, but computed in int32 and
+		// saturated so a full-scale sample cannot wrap to a negative value.
+		scaled := int32(sample*(1<<15)) + 1
+		switch {
+		case scaled > 32767:
+			scaled = 32767
+		case scaled < -32768:
+			scaled = -32768
+		}
+		int16Samples = append(int16Samples, int16(scaled))
+	}
+	size := uint32(len(int16Samples))
+	fmt.Printf("sample number: %d\n", size)
+	const (
+		winSize = uint32(7000)
+		winStep = uint32(7000)
+	)
+	sv := streaming_vad.StreamingVadType{}
+	sv.Init(8000, 0.4, 3.0)
+	for begin := uint32(0); begin < size; begin += winStep {
+		end := begin + winSize
+		if end > size {
+			// The last window is shorter than winSize.
+			end = size
+		}
+		if chunkErr := sv.ProcessSpeechByChunk(int16Samples[begin:end]); chunkErr != nil {
+			fmt.Println(chunkErr)
+			break
+		}
+	}
+	fmt.Println(len(sv.VadEventMarkerDeque))
+	for _, marker := range sv.VadEventMarkerDeque {
+		fmt.Println(marker.Time)
+		fmt.Println(marker.VadFlag)
+	}
+}