Page MenuHomePhabricator

DictationConfidence.cpp

Authored By
Jsalsman
Jul 8 2019, 6:25 PM
Size
6 KB
Referenced Files
None
Subscribers
None

DictationConfidence.cpp

// DictationConfidence.cpp
// adapted from https://msdn.microsoft.com/en-us/library/ms717071(v=vs.85).aspx
// and https://stackoverflow.com/a/40002268
// This file alone works as a "C++ Console Application" in
// Visual Studio Community 2017 with Visual C++ 2017 and
// Microsoft Speech SDK v11.0 (SAPI v5.3), which are free
#include "stdafx.h"
#include <sphelper.h>
#include <string>
#include <math.h>
// qsort-style comparator for two floats, ascending order.
// Returns 0 when the values compare equal, -1 when *a is smaller, 1 otherwise
// (so an unordered pair, e.g. involving NaN, also yields 1).
int compare(const void * a, const void * b)
{
    const float lhs = *static_cast<const float *>(a);
    const float rhs = *static_cast<const float *>(b);
    if (lhs == rhs) {
        return 0;
    }
    return (lhs < rhs) ? -1 : 1;
}
int wmain(int argc, wchar_t **argv)
{
CComPtr<ISpStream> cpInputStream;
CComPtr<ISpRecognizer> cpRecognizer;
CComPtr<ISpRecoContext> cpRecoContext;
CComPtr<ISpRecoGrammar> cpRecoGrammar;
HRESULT hr;
#define CHECK_HR if (hr != S_OK) { fprintf(stderr, \
"DictationConfidence: failed at line %d (error %d)\n", \
__LINE__, hr); return -1; }
int n = 0, i;
#define MAX_SCORES 50000
float scores[MAX_SCORES];
double asum, hsum, gprod, qsum, csum, med;
if (argc != 2) {
fprintf(stderr, "usage: DictationConfidence [filename.wav]\n");
return -1;
}
hr = CoInitialize(NULL);
CHECK_HR;
// Create basic SAPI stream object
// NOTE: The helper SpBindToFile can be used to perform the following operations
hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
CHECK_HR;
CSpStreamFormat sInputFormat;
// generate WaveFormatEx structure, assuming the wav format is 22kHz, 16-bit, Stereo
hr = sInputFormat.AssignFormat(SPSF_32kHz16BitStereo);
CHECK_HR;
// setup stream object with wav file MY_WAVE_AUDIO_FILENAME
// for read-only access, since it will only be access by the SR engine
hr = cpInputStream->BindToFile(
(LPCWSTR) argv[1], // first command linke argument is filename
SPFM_OPEN_READONLY,
&sInputFormat.FormatId(), // had to add the '&'
sInputFormat.WaveFormatExPtr(),
SPFEI_ALL_EVENTS);
if (hr != S_OK) {
fprintf(stderr, "DictationConfidence: can't open %ws\n", argv[1]);
return -1;
}
// Create in-process speech recognition engine
hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
CHECK_HR;
// Use all available CPU
hr = cpRecognizer->SetPropertyNum(L"ResourceUsage", 50);
// CHECK_HR; // don't care if unsupported
// Don't adapt to the speakers' voices
hr = cpRecognizer->SetPropertyNum(L"PersistedBackgroundAdaptation", 0);
CHECK_HR;
// connect wav input to recognizer
// SAPI will negotiate mismatched engine/input audio formats using system audio codecs, so second parameter is not important - use default of TRUE
hr = cpRecognizer->SetInput(cpInputStream, TRUE);
CHECK_HR;
// Create recognition context to receive events
hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
CHECK_HR;
// Create grammar, and load dictation
// ignore grammar ID for simplicity's sake
// NOTE: Voice command apps would load CFG here
hr = cpRecoContext->CreateGrammar(NULL, &cpRecoGrammar);
CHECK_HR;
hr = cpRecoGrammar->LoadDictation(NULL, SPLO_STATIC);
CHECK_HR;
// check for recognitions and end of stream event
hr = cpRecoContext->SetInterest(SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM), SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM));
CHECK_HR;
// use Win32 events for command-line style application
hr = cpRecoContext->SetNotifyWin32Event();
CHECK_HR;
// activate dictation, and begin recognition
hr = cpRecoGrammar->SetDictationState(SPRS_ACTIVE);
CHECK_HR;
// while events occur, continue processing
// timeout should be greater than the audio stream length, or a reasonable amount of time expected to pass before no more recognitions are expected in an audio stream
BOOL fEndStreamReached = FALSE;
while (!fEndStreamReached && S_OK == cpRecoContext->WaitForNotifyEvent(20000)) // 20 seconds
{
CSpEvent spEvent;
// pull all queued events from the reco context's event queue
while (!fEndStreamReached && S_OK == spEvent.GetFrom(cpRecoContext))
{
// Check event type
switch (spEvent.eEventId)
{
ISpRecoResult *pPhrase;
SPRECORESULTTIMES pTimes;
SPPHRASE *phrase;
// speech recognition engine recognized some audio
case SPEI_RECOGNITION:
pPhrase = spEvent.RecoResult();
pPhrase->GetPhrase(&phrase);
pPhrase->GetResultTimes(&pTimes);
if (phrase != NULL && phrase->pElements != NULL) {
for (ULONG i = 0; i < (ULONG)phrase->Rule.ulCountOfElements; ++i) {
if (phrase->pElements[i].pszDisplayText != NULL) {
std::wstring outString = phrase->pElements[i].pszDisplayText;
std::string soutString = std::string(outString.begin(), outString.end());
printf("%s -- confidence: %.4f at %.2fs for %.2fs\n",
soutString.c_str(), phrase->pElements[i].SREngineConfidence,
(pTimes.ullStart + phrase->pElements[i].ulAudioTimeOffset)
/ 10000000.0, phrase->pElements[i].ulAudioSizeTime / 10000000.0);
scores[n++] = phrase->pElements[i].SREngineConfidence;
if (n == MAX_SCORES) {
fprintf(stderr, "DictationConfidence: too many scores");
return -1;
}
}
}
}
break;
// end of the wav file was reached by the speech recognition engine
case SPEI_END_SR_STREAM:
fEndStreamReached = TRUE;
break;
}
// clear any event data/object references
spEvent.Clear();
}// END event pulling loop - break on empty event queue OR end stream
}// END event polling loop - break on event timeout OR end stream
// deactivate dictation
hr = cpRecoGrammar->SetDictationState(SPRS_INACTIVE);
CHECK_HR;
// unload dictation topic
hr = cpRecoGrammar->UnloadDictation();
CHECK_HR;
// close the input stream, since we're done with it
// NOTE: smart pointer will call SpStream's destructor, and consequently ::Close, but code may want to check for errors on ::Close operation
hr = cpInputStream->Close();
CHECK_HR;
if (n < 1) return 0;
asum = 0.0; hsum = 0.0; gprod = 1.0; qsum = 0.0; csum = 0.0;
for (i = 0; i < n; i++) {
asum += scores[i];
hsum += 1.0 / scores[i];
gprod *= scores[i];
qsum += scores[i] * scores[i];
csum += scores[i] * scores[i] * scores[i];
}
printf("means: arithmetic %.4f harmonic %.4f geometric %.4f " \
"quadratic %.4f cubic %.4f", asum / n, n / hsum, pow(gprod, 1.0/n),
sqrt(qsum / n), pow(csum / n, 1.0/3.0));
qsort(&scores, n, sizeof(float), compare);
if (n % 2) {
med = scores[n / 2];
} else {
med = (scores[(n / 2) - 1] + scores[n / 2]) / 2;
}
printf(" median: %.4f\n", med);
return 0;
}

File Metadata

Mime Type
text/x-c
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
7710335
Default Alt Text
DictationConfidence.cpp (6 KB)

Event Timeline