// Package vox wraps a Vosk recognizer, keeps the audio it was fed, and turns
// Vosk's JSON output into Result values whose phrases carry byte offsets into
// that audio.
package vox

import (
	"encoding/json"
	"errors"
	"math"
	"strings"

	"git.sr.ht/~geb/numen/vox/phrasesplit"
	vosk "github.com/alphacep/vosk-api/go"
	"github.com/m7shapan/njson"
)

// init sets the Vosk log level to -1 when the package is loaded.
// NOTE(review): presumably this silences Vosk's own logging; confirm against
// the Vosk documentation.
func init() {
	vosk.SetLogLevel(-1)
}

// NewModel loads the Vosk model found at filepath.
func NewModel(filepath string) (*vosk.VoskModel, error) {
	return vosk.NewModel(filepath)
}

// PhraseResult is a single recognized word, or several words merged into one
// phrase when keyphrases are enabled.
type PhraseResult struct {
	// Text is the recognized word or phrase.
	Text string
	// Confidence is the value reported by Vosk, or -1 when none is given
	// (as is the case for alternatives).
	Confidence float64
	// Start and End are byte offsets into Recognizer.Audio.
	Start, End int
}

// Result is one transcription produced by a Recognizer.
type Result struct {
	// Text is the full transcription.
	Text string
	// Phrases holds the individual words or phrases of Text.
	Phrases []PhraseResult
	// Confidence is only reported for alternatives; otherwise it is -1.
	Confidence float64
	// Valid is only set when keyphrases are enabled, and holds what
	// phrasesplit.Split reported for Text. Partial marks a result that was
	// built from Vosk's partial output.
	Valid, Partial bool
}

// Recognizer couples a Vosk recognizer with the audio it has been fed.
type Recognizer struct {
	// VoskRecognizer is the underlying Vosk recognizer.
	VoskRecognizer *vosk.VoskRecognizer
	// phraseMap is the result of phrasesplit.Parse on the current phrases.
	phraseMap map[string][]string
	// sampleRate is the rate given to NewRecognizer and byteDepth is the bit
	// depth given to NewRecognizer divided by eight.
	sampleRate, byteDepth int
	// bytesRead counts every byte handed to Vosk, prepended silence included.
	bytesRead int
	// Audio is the audio accepted since it was last cleared, including the
	// silence Accept prepends.
	Audio []byte
	// finalized records that the last accepted chunk completed an utterance
	// or that FinalResults was called.
	finalized bool
	// keyphrases enables the phrase splitting done by parseResults.
	keyphrases bool
}

// NewRecognizer creates a Recognizer for the given model and audio format.
// When phrases is nil the recognizer is created without a grammar; otherwise
// the phrases are passed to Vosk as a JSON grammar.
//
// It panics if bitDepth is not a multiple of eight.
func NewRecognizer(model *vosk.VoskModel, sampleRate, bitDepth int, phrases []string) (*Recognizer, error) {
	if bitDepth%8 != 0 {
		panic("bitDepth must be a multiple of eight")
	}

	var (
		rec *vosk.VoskRecognizer
		err error
	)
	if phrases != nil {
		grammar, jerr := json.Marshal(phrases)
		if jerr != nil {
			panic(jerr.Error())
		}
		rec, err = vosk.NewRecognizerGrm(model, float64(sampleRate), string(grammar))
	} else {
		rec, err = vosk.NewRecognizer(model, float64(sampleRate))
	}
	if err != nil {
		return nil, err
	}

	return &Recognizer{
		VoskRecognizer: rec,
		phraseMap:      phrasesplit.Parse(phrases),
		sampleRate:     sampleRate,
		byteDepth:      bitDepth / 8,
	}, nil
}

// Free releases the underlying Vosk recognizer.
func (r *Recognizer) Free() {
	r.VoskRecognizer.Free()
}

// SetGrm replaces the grammar with the given phrases. The recognizer is reset
// in the process, but the audio buffered so far is kept.
func (r *Recognizer) SetGrm(phrases []string) {
	grammar, err := json.Marshal(phrases)
	if err != nil {
		panic(err.Error())
	}

	// Reset clears r.Audio, so hold on to it and put it back afterwards.
	kept := r.Audio
	r.Reset()
	r.VoskRecognizer.SetGrm(string(grammar))
	r.phraseMap = phrasesplit.Parse(phrases)
	r.Audio = kept
}

// SetKeyphrases sets whether results are split into the configured phrases.
func (r *Recognizer) SetKeyphrases(b bool) {
	r.keyphrases = b
}

// SetMaxAlternatives forwards n to the Vosk recognizer.
func (r *Recognizer) SetMaxAlternatives(n int) {
	r.VoskRecognizer.SetMaxAlternatives(n)
}

// SetPartialWords turns Vosk's partial-words option on or off.
func (r *Recognizer) SetPartialWords(b bool) {
	if b {
		r.VoskRecognizer.SetPartialWords(1)
		return
	}
	r.VoskRecognizer.SetPartialWords(0)
}

// SetWords turns Vosk's words option on or off.
func (r *Recognizer) SetWords(b bool) {
	if b {
		r.VoskRecognizer.SetWords(1)
		return
	}
	r.VoskRecognizer.SetWords(0)
}

// index converts a timestamp reported by Vosk into a byte offset into
// r.Audio, rounded to a whole sample and clamped to the bounds of r.Audio.
// NOTE(review): the arithmetic assumes time is in seconds and sampleRate is
// in samples per second.
func (r *Recognizer) index(time float64) int {
	depth := float64(r.byteDepth)
	bytesPerUnit := float64(r.sampleRate * r.byteDepth)
	// Number of bytes that were fed before the first byte of r.Audio.
	audioStart := float64(r.bytesRead - len(r.Audio))

	pos := time*bytesPerUnit - audioStart
	// round to byteDepth multiple
	pos = math.Round(pos/depth) * depth

	switch {
	case pos < 0:
		return 0
	case int(pos) > len(r.Audio):
		return len(r.Audio)
	}
	return int(pos)
}

// parseVoskResults decodes a JSON document produced by Vosk.
//
// It returns one Result per alternative when alternatives are present, a
// single Result when "text" is non-empty, and otherwise a single Result
// marked Partial that is built from the "partial" fields.
//
// It panics if the JSON cannot be decoded.
func (r *Recognizer) parseVoskResults(json string) []Result {
	type ResultJson struct {
		Text       string    `njson:"text"`
		Words      []string  `njson:"result.#.word"`
		Confs      []float64 `njson:"result.#.conf"`
		Starts     []float64 `njson:"result.#.start"`
		Ends       []float64 `njson:"result.#.end"`
		Confidence float64   `njson:"confidence"` // only with alternatives
	}
	var s struct {
		Alternatives []ResultJson `njson:"alternatives"`

		// copy paste of ResultJson
		Text       string    `njson:"text"`
		Words      []string  `njson:"result.#.word"`
		Confs      []float64 `njson:"result.#.conf"`
		Starts     []float64 `njson:"result.#.start"`
		Ends       []float64 `njson:"result.#.end"`
		Confidence float64   `njson:"confidence"` // only with alternatives

		ParText   string    `njson:"partial"`
		ParWords  []string  `njson:"partial_result.#.word"`
		ParConfs  []float64 `njson:"partial_result.#.conf"`
		ParStarts []float64 `njson:"partial_result.#.start"`
		ParEnds   []float64 `njson:"partial_result.#.end"`
	}
	if err := njson.Unmarshal([]byte(json), &s); err != nil {
		panic(err)
	}

	if len(s.Alternatives) > 0 {
		results := make([]Result, len(s.Alternatives))
		for i := range s.Alternatives {
			alt := &s.Alternatives[i]
			phrases := make([]PhraseResult, len(alt.Words))
			for w, word := range alt.Words {
				phrases[w] = PhraseResult{
					Text:       word,
					Confidence: -1, // conf isn't given
					Start:      r.index(alt.Starts[w]),
					End:        r.index(alt.Ends[w]),
				}
			}
			results[i] = Result{
				Text:       alt.Text,
				Phrases:    phrases,
				Confidence: alt.Confidence,
			}
		}
		return results
	}

	// timed builds one PhraseResult per word from parallel per-word slices.
	timed := func(words []string, confs, starts, ends []float64) []PhraseResult {
		out := make([]PhraseResult, len(words))
		for i, word := range words {
			out[i] = PhraseResult{
				Text:       word,
				Confidence: confs[i],
				Start:      r.index(starts[i]),
				End:        r.index(ends[i]),
			}
		}
		return out
	}

	if len(s.Text) > 0 {
		return []Result{{
			Text:       s.Text,
			Phrases:    timed(s.Words, s.Confs, s.Starts, s.Ends),
			Confidence: -1, // confidence isn't given
		}}
	}

	return []Result{{
		Text:       s.ParText,
		Phrases:    timed(s.ParWords, s.ParConfs, s.ParStarts, s.ParEnds),
		Confidence: -1, // confidence isn't given
		Partial:    true,
	}}
}

// parseResults decodes Vosk's JSON output. When keyphrases are enabled, each
// result's text is additionally split with phrasesplit.Split: Valid is set to
// what Split reports, words not covered by the returned phrases are dropped,
// and the words of each multi-word phrase are merged into one PhraseResult.
func (r *Recognizer) parseResults(json string) []Result {
	results := r.parseVoskResults(json)
	if !r.keyphrases {
		return results
	}

	for ri := range results {
		res := &results[ri]
		phrases, ok := phrasesplit.Split(res.Text, r.phraseMap)
		res.Valid = ok

		// Keep only as many words as the phrases account for.
		keep := 0
		if len(res.Phrases) > 0 && len(phrases) > 0 {
			keep = strings.Count(strings.Join(phrases, " "), " ") + 1
		}
		res.Phrases = res.Phrases[:keep]

		for pi := 0; pi < len(res.Phrases) && len(phrases) > 0; pi++ {
			words := strings.Count(phrases[0], " ") + 1
			phrases = phrases[1:]
			if words <= 1 {
				continue
			}

			// Fold the phrase's words into the entry at pi: the texts are
			// joined with spaces, the confidences averaged, and the span
			// runs from the first word's start to the last word's end.
			merged := res.Phrases[pi]
			for _, word := range res.Phrases[pi+1 : pi+words] {
				merged.Text += " " + word.Text
				merged.Confidence += word.Confidence
			}
			merged.Confidence /= float64(words)
			merged.End = res.Phrases[pi+words-1].End

			res.Phrases = append(res.Phrases[:pi+1], res.Phrases[pi+words:]...)
			res.Phrases[pi] = merged
		}
	}
	return results
}

// Accept feeds a chunk of audio to the recognizer and appends it to r.Audio.
// It reports whether Vosk signalled a finalized result for the audio so far.
// If the previous chunk was finalized, r.Audio is cleared first.
func (r *Recognizer) Accept(audio []byte) (bool, error) {
	if r.finalized {
		r.Audio = nil
		r.finalized = false
	}
	if r.Audio == nil {
		// Prepending silence seems to help, especially when no required pause.
		audio = append(make([]byte, 4096), audio...)
	}

	r.bytesRead += len(audio)
	r.Audio = append(r.Audio, audio...)

	switch r.VoskRecognizer.AcceptWaveform(audio) {
	case -1:
		return false, errors.New("an exception occurred")
	case 1:
		r.finalized = true
	default:
		r.finalized = false
	}
	return r.finalized, nil
}

// Results returns the results for the audio accepted so far, parsed from
// Vosk's Result output if the last chunk was finalized and from its
// PartialResult output otherwise.
func (r *Recognizer) Results() []Result {
	if !r.finalized {
		return r.parseResults(r.VoskRecognizer.PartialResult())
	}
	return r.parseResults(r.VoskRecognizer.Result())
}

// FinalResults marks the recognizer as finalized and returns the results
// parsed from Vosk's FinalResult output.
func (r *Recognizer) FinalResults() []Result {
	r.finalized = true
	return r.parseResults(r.VoskRecognizer.FinalResult())
}

// Reset resets the Vosk recognizer and discards the buffered audio.
func (r *Recognizer) Reset() {
	r.VoskRecognizer.Reset()
	r.Audio = nil
}