DictationConfidence.cpp
Jsalsman (James Salsman)
Actions

Authored By

	• Jsalsman
	Jul 8 2019, 6:25 PM

Size

6 KB

Referenced Files

None

Subscribers

None

DictationConfidence.cpp
View Options

	// DictationConfidence.cpp
	// adapted from https://msdn.microsoft.com/en-us/library/ms717071(v=vs.85).aspx
	// and https://stackoverflow.com/a/40002268
	// This file alone works as a "C++ Console Application" in
	// Visual Studio Community 2017 with Visual C++ 2017 and
	// Microsoft Speech SDK v11.0 (SAPI v5.3) which are free

	#include "stdafx.h"
	#include <sphelper.h>
	#include <string>
	#include <math.h>

	// qsort() comparator for an array of float.
	// a and b point at the two elements being compared; they must be
	// dereferenced (not cast) to obtain the values.
	// Returns -1 if *a < *b, 1 if *a > *b, and 0 otherwise.
	int compare(const void * a, const void * b)
	{
		const float lhs = *static_cast<const float *>(a);
		const float rhs = *static_cast<const float *>(b);

		if (lhs < rhs) return -1;
		if (lhs > rhs) return 1;
		return 0;
	}

	int wmain(int argc, wchar_t **argv)
	{
	CComPtr<ISpStream> cpInputStream;
	CComPtr<ISpRecognizer> cpRecognizer;
	CComPtr<ISpRecoContext> cpRecoContext;
	CComPtr<ISpRecoGrammar> cpRecoGrammar;

	HRESULT hr;
	#define CHECK_HR if (hr != S_OK) { fprintf(stderr, \
	"DictationConfidence: failed at line %d (error %d)\n", \
	__LINE__, hr); return -1; }

	int n = 0, i;
	#define MAX_SCORES 50000
	float scores[MAX_SCORES];
	double asum, hsum, gprod, qsum, csum, med;

	if (argc != 2) {
	fprintf(stderr, "usage: DictationConfidence [filename.wav]\n");
	return -1;
	}

	hr = CoInitialize(NULL);
	CHECK_HR;

	// Create basic SAPI stream object
	// NOTE: The helper SpBindToFile can be used to perform the following operations
	hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
	CHECK_HR;

	CSpStreamFormat sInputFormat;
	// generate WaveFormatEx structure, assuming the wav format is 22kHz, 16-bit, Stereo
	hr = sInputFormat.AssignFormat(SPSF_32kHz16BitStereo);
	CHECK_HR;

	// setup stream object with wav file MY_WAVE_AUDIO_FILENAME
	// for read-only access, since it will only be access by the SR engine
	hr = cpInputStream->BindToFile(
	(LPCWSTR) argv[1], // first command linke argument is filename
	SPFM_OPEN_READONLY,
	&sInputFormat.FormatId(), // had to add the '&'
	sInputFormat.WaveFormatExPtr(),
	SPFEI_ALL_EVENTS);
	if (hr != S_OK) {
	fprintf(stderr, "DictationConfidence: can't open %ws\n", argv[1]);
	return -1;
	}

	// Create in-process speech recognition engine
	hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
	CHECK_HR;

	// Use all available CPU
	hr = cpRecognizer->SetPropertyNum(L"ResourceUsage", 50);
	// CHECK_HR; // don't care if unsupported

	// Don't adapt to the speakers' voices
	hr = cpRecognizer->SetPropertyNum(L"PersistedBackgroundAdaptation", 0);
	CHECK_HR;

	// connect wav input to recognizer
	// SAPI will negotiate mismatched engine/input audio formats using system audio codecs, so second parameter is not important - use default of TRUE
	hr = cpRecognizer->SetInput(cpInputStream, TRUE);
	CHECK_HR;

	// Create recognition context to receive events
	hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
	CHECK_HR;

	// Create grammar, and load dictation
	// ignore grammar ID for simplicity's sake
	// NOTE: Voice command apps would load CFG here
	hr = cpRecoContext->CreateGrammar(NULL, &cpRecoGrammar);
	CHECK_HR;

	hr = cpRecoGrammar->LoadDictation(NULL, SPLO_STATIC);
	CHECK_HR;

	// check for recognitions and end of stream event
	hr = cpRecoContext->SetInterest(SPFEI(SPEI_RECOGNITION) \| SPFEI(SPEI_END_SR_STREAM), SPFEI(SPEI_RECOGNITION) \| SPFEI(SPEI_END_SR_STREAM));
	CHECK_HR;

	// use Win32 events for command-line style application
	hr = cpRecoContext->SetNotifyWin32Event();
	CHECK_HR;

	// activate dictation, and begin recognition
	hr = cpRecoGrammar->SetDictationState(SPRS_ACTIVE);
	CHECK_HR;

	// while events occur, continue processing
	// timeout should be greater than the audio stream length, or a reasonable amount of time expected to pass before no more recognitions are expected in an audio stream
	BOOL fEndStreamReached = FALSE;
	while (!fEndStreamReached && S_OK == cpRecoContext->WaitForNotifyEvent(20000)) // 20 seconds
	{
	CSpEvent spEvent;
	// pull all queued events from the reco context's event queue

	while (!fEndStreamReached && S_OK == spEvent.GetFrom(cpRecoContext))
	{
	// Check event type
	switch (spEvent.eEventId)
	{
	ISpRecoResult *pPhrase;
	SPRECORESULTTIMES pTimes;
	SPPHRASE *phrase;

	// speech recognition engine recognized some audio
	case SPEI_RECOGNITION:

	pPhrase = spEvent.RecoResult();
	pPhrase->GetPhrase(&phrase);
	pPhrase->GetResultTimes(&pTimes);

	if (phrase != NULL && phrase->pElements != NULL) {
	for (ULONG i = 0; i < (ULONG)phrase->Rule.ulCountOfElements; ++i) {
	if (phrase->pElements[i].pszDisplayText != NULL) {
	std::wstring outString = phrase->pElements[i].pszDisplayText;
	std::string soutString = std::string(outString.begin(), outString.end());
	printf("%s -- confidence: %.4f at %.2fs for %.2fs\n",
	soutString.c_str(), phrase->pElements[i].SREngineConfidence,
	(pTimes.ullStart + phrase->pElements[i].ulAudioTimeOffset)
	/ 10000000.0, phrase->pElements[i].ulAudioSizeTime / 10000000.0);
	scores[n++] = phrase->pElements[i].SREngineConfidence;
	if (n == MAX_SCORES) {
	fprintf(stderr, "DictationConfidence: too many scores");
	return -1;
	}
	}
	}
	}

	break;

	// end of the wav file was reached by the speech recognition engine
	case SPEI_END_SR_STREAM:
	fEndStreamReached = TRUE;
	break;
	}

	// clear any event data/object references
	spEvent.Clear();
	}// END event pulling loop - break on empty event queue OR end stream
	}// END event polling loop - break on event timeout OR end stream

	// deactivate dictation
	hr = cpRecoGrammar->SetDictationState(SPRS_INACTIVE);
	CHECK_HR;

	// unload dictation topic
	hr = cpRecoGrammar->UnloadDictation();
	CHECK_HR;

	// close the input stream, since we're done with it
	// NOTE: smart pointer will call SpStream's destructor, and consequently ::Close, but code may want to check for errors on ::Close operation
	hr = cpInputStream->Close();
	CHECK_HR;

	if (n < 1) return 0;

	asum = 0.0; hsum = 0.0; gprod = 1.0; qsum = 0.0; csum = 0.0;

	for (i = 0; i < n; i++) {
	asum += scores[i];
	hsum += 1.0 / scores[i];
	gprod *= scores[i];
	qsum += scores[i] * scores[i];
	csum += scores[i] * scores[i] * scores[i];
	}
	printf("means: arithmetic %.4f harmonic %.4f geometric %.4f " \
	"quadratic %.4f cubic %.4f", asum / n, n / hsum, pow(gprod, 1.0/n),
	sqrt(qsum / n), pow(csum / n, 1.0/3.0));

	qsort(&scores, n, sizeof(float), compare);
	if (n % 2) {
	med = scores[n / 2];
	} else {
	med = (scores[(n / 2) - 1] + scores[n / 2]) / 2;
	}
	printf(" median: %.4f\n", med);

	return 0;
	}

File Metadata

Mime Type: text/x-c
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 7710335
Default Alt Text: DictationConfidence.cpp (6 KB)

DictationConfidence.cppJsalsman (James Salsman)Actions

DictationConfidence.cppView Options

File Metadata

Event Timeline

DictationConfidence.cpp
Jsalsman (James Salsman)
Actions

DictationConfidence.cpp
View Options