Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Add Custom Speech integration to SDK and sample #82

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 57 additions & 12 deletions samples/browser/Sample.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,22 @@ <h1 style="font-weight:500;">Speech Recognition</h1>
<h2 style="font-weight:500;">Microsoft Cognitive Services</h2>
</td>
</tr>
<tr>
<td align="right">Recognition API:</td>
<td align="left">
<select id="recognitionAPI">
<option value="Bing Speech" selected="selected">Bing Speech</option>
<option value="Custom Speech">Custom Speech</option>
</select>
</td>
</tr>
<tr>
<td align="right"><a href="https://www.microsoft.com/cognitive-services/en-us/sign-up" target="_blank">Custom Speech Endpoint ID<br/>(leave blank for Bing Speech)</a>:</td>
<td><input id="endpointId" type="text" size="40" value="YOUR_CUSTOM_SPEECH_ENDPOINT_ID"></td>
</tr>
<tr>
<td align="right"><a href="https://www.microsoft.com/cognitive-services/en-us/sign-up" target="_blank">Subscription</a>:</td>
<td><input id="key" type="text" size="40" value="YOUR_BING_SPEECH_API_KEY"></td>
<td><input id="key" type="text" size="40" value="YOUR_API_SUBSCRIPTION_KEY"></td>
</tr>
<tr>
<td align="right">Language:</td>
Expand Down Expand Up @@ -137,7 +150,7 @@ <h2 style="font-weight:500;">Microsoft Cognitive Services</h2>
}

// Setup the recognizer
function RecognizerSetup(SDK, recognitionMode, language, format, subscriptionKey) {
function RecognizerSetup(SDK, recognitionMode, language, format, subscriptionKey, recognitionAPI, endpointId) {

switch (recognitionMode) {
case "Interactive" :
Expand All @@ -153,15 +166,28 @@ <h2 style="font-weight:500;">Microsoft Cognitive Services</h2>
recognitionMode = SDK.RecognitionMode.Interactive;
}

switch (recognitionAPI) {
case "Bing Speech":
recognitionAPI = SDK.RecognitionAPI.BingSpeech;
break;
case "Custom Speech":
recognitionAPI = SDK.RecognitionAPI.CustomSpeech;
break;
default:
recognitionAPI = SDK.RecognitionAPI.BingSpeech;
break;
}

// Build the recognizer configuration: platform/device context, recognition mode,
// language, result format, target recognition API and Custom Speech endpoint id.
var recognizerConfig = new SDK.RecognizerConfig(
new SDK.SpeechConfig(
new SDK.Context(
new SDK.OS(navigator.userAgent, "Browser", null),
new SDK.Device("SpeechSample", "SpeechSample", "1.0.00000"))),
recognitionMode,
language, // Supported languages are specific to each recognition mode. Refer to docs.
format); // SDK.SpeechResultFormat.Simple (Options - Simple/Detailed)

format, // SDK.SpeechResultFormat.Simple (Options - Simple/Detailed)
recognitionAPI, // SDK.RecognitionAPI.BingSpeech or SDK.RecognitionAPI.CustomSpeech
endpointId); // Custom Speech endpoint id (not the subscription key), see https://cris.ai portal

var useTokenAuth = false;

Expand Down Expand Up @@ -267,10 +293,10 @@ <h2 style="font-weight:500;">Microsoft Cognitive Services</h2>
<!-- Browser Hooks -->
<script>
var startBtn, stopBtn, hypothesisDiv, phraseDiv, statusDiv;
var key, languageOptions, formatOptions, recognitionMode, inputSource, filePicker;
var endpointId, key, languageOptions, formatOptions, recognitionMode, inputSource, filePicker;
var SDK;
var recognizer;
var previousSubscriptionKey;
var previousEndpointId, previousSubscriptionKey;

document.addEventListener("DOMContentLoaded", function () {
createBtn = document.getElementById("createBtn");
Expand All @@ -279,6 +305,7 @@ <h2 style="font-weight:500;">Microsoft Cognitive Services</h2>
phraseDiv = document.getElementById("phraseDiv");
hypothesisDiv = document.getElementById("hypothesisDiv");
statusDiv = document.getElementById("statusDiv");
endpointId = document.getElementById("endpointId");
key = document.getElementById("key");
languageOptions = document.getElementById("languageOptions");
formatOptions = document.getElementById("formatOptions");
Expand All @@ -289,16 +316,22 @@ <h2 style="font-weight:500;">Microsoft Cognitive Services</h2>
languageOptions.addEventListener("change", Setup);
formatOptions.addEventListener("change", Setup);
recognitionMode.addEventListener("change", Setup);
recognitionAPI.addEventListener("change", Setup);

startBtn.addEventListener("click", function () {
if (key.value == "" || key.value == "YOUR_BING_SPEECH_API_KEY") {
alert("Please enter your Bing Speech subscription key!");
if (recognitionAPI.value === "Custom Speech" && (endpointId.value == "" || endpointId.value == "YOUR_CUSTOM_SPEECH_ENDPOINT_ID")) {
alert("Please enter your Custom Speech endpoint id!");
return;
}
if (key.value == "" || key.value == "YOUR_API_SUBSCRIPTION_KEY") {
alert("Please enter your Bing Speech or Custom Speech API subscription key!");
return;
}
if (inputSource.value === "File") {
filePicker.click();
} else {
if (!recognizer || previousSubscriptionKey != key.value) {
if (!recognizer || previousSubscriptionKey != key.value || previousEndpointId != endpointId.value) {
previousEndpointId = endpointId.value;
previousSubscriptionKey = key.value;
Setup();
}
Expand All @@ -311,15 +344,27 @@ <h2 style="font-weight:500;">Microsoft Cognitive Services</h2>
}
});

endpointId.addEventListener("focus", function () {
if (endpointId.value == "YOUR_CUSTOM_SPEECH_ENDPOINT_ID") {
endpointId.value = "";
}
});

endpointId.addEventListener("focusout", function () {
if (endpointId.value == "") {
endpointId.value = "YOUR_CUSTOM_SPEECH_ENDPOINT_ID";
}
});

key.addEventListener("focus", function () {
if (key.value == "YOUR_BING_SPEECH_API_KEY") {
if (key.value == "YOUR_API_SUBSCRIPTION_KEY") {
key.value = "";
}
});

key.addEventListener("focusout", function () {
if (key.value == "") {
key.value = "YOUR_BING_SPEECH_API_KEY";
key.value = "YOUR_API_SUBSCRIPTION_KEY";
}
});

Expand Down Expand Up @@ -349,7 +394,7 @@ <h2 style="font-weight:500;">Microsoft Cognitive Services</h2>
if (recognizer != null) {
RecognizerStop(SDK, recognizer);
}
recognizer = RecognizerSetup(SDK, recognitionMode.value, languageOptions.value, SDK.SpeechResultFormat[formatOptions.value], key.value);
recognizer = RecognizerSetup(SDK, recognitionMode.value, languageOptions.value, SDK.SpeechResultFormat[formatOptions.value], key.value, recognitionAPI.value, endpointId.value);
}

function UpdateStatus(status) {
Expand Down
7 changes: 7 additions & 0 deletions src/common.browser/WebsocketConnection.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import {
PlatformEvent,
Promise,
} from "../common/Exports";
import { RecognitionAPI } from "../sdk/speech/Exports";
import { WebsocketMessageAdapter } from "./WebsocketMessageAdapter";

export class WebsocketConnection implements IConnection {
Expand All @@ -27,6 +28,8 @@ export class WebsocketConnection implements IConnection {
queryParameters: IStringDictionary<string>,
headers: IStringDictionary<string>,
messageFormatter: IWebsocketMessageFormatter,
recognitionAPI: RecognitionAPI,
endpointId: string,
connectionId?: string) {

if (!uri) {
Expand All @@ -41,6 +44,10 @@ export class WebsocketConnection implements IConnection {

let queryParams = "";
let i = 0;
// Custom Speech selects the deployed model through the `cid` query parameter.
// The endpoint id is caller-supplied text, so percent-encode it to keep characters
// such as `&`, `=` or `#` from corrupting or injecting into the query string.
if (recognitionAPI === RecognitionAPI.CustomSpeech) {
    queryParams = "?cid=" + encodeURIComponent(endpointId);
    // Count `cid` as the first parameter already written.
    // NOTE(review): presumably the loop below uses `i` to choose between "?" and "&" - confirm.
    i++;
}

if (queryParameters) {
for (const paramName in queryParameters) {
Expand Down
15 changes: 14 additions & 1 deletion src/sdk/speech.browser/SpeechConnectionFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {
AuthInfo,
IAuthentication,
IConnectionFactory,
RecognitionAPI,
RecognitionMode,
RecognizerConfig,
SpeechResultFormat,
Expand All @@ -25,6 +26,18 @@ export class SpeechConnectionFactory implements IConnectionFactory {
authInfo: AuthInfo,
connectionId?: string): IConnection => {

switch (config.RecognitionAPI) {
case RecognitionAPI.BingSpeech:
Storage.Local.Set("Host", "wss://speech.platform.bing.com");
break;
case RecognitionAPI.CustomSpeech:
Storage.Local.Set("Host", "wss://westus.stt.speech.microsoft.com");
break;
default:
Storage.Local.Set("Host", "wss://speech.platform.bing.com");
break;
}

let endpoint = "";
switch (config.RecognitionMode) {
case RecognitionMode.Conversation:
Expand All @@ -51,7 +64,7 @@ export class SpeechConnectionFactory implements IConnectionFactory {
headers[authInfo.HeaderName] = authInfo.Token;
headers[ConnectionIdHeader] = connectionId;

return new WebsocketConnection(endpoint, queryParams, headers, new WebsocketMessageFormatter(), connectionId);
return new WebsocketConnection(endpoint, queryParams, headers, new WebsocketMessageFormatter(), config.RecognitionAPI, config.EndpointId, connectionId);
}

private get Host(): string {
Expand Down
21 changes: 20 additions & 1 deletion src/sdk/speech/RecognizerConfig.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@

/**
 * Speech recognition services a recognizer can be configured to talk to.
 *
 * Member values are spelled out explicitly so that inserting or reordering
 * members later cannot silently renumber the existing ones.
 */
export enum RecognitionAPI {
    /** The Bing Speech service. */
    BingSpeech = 0,
    /** The Custom Speech service; used together with an endpoint id. */
    CustomSpeech = 1,
}

export enum RecognitionMode {
Interactive,
Conversation,
Expand All @@ -16,17 +21,23 @@ export class RecognizerConfig {
private format: SpeechResultFormat;
private speechConfig: SpeechConfig;
private recognitionActivityTimeout: number;
private recognitionAPI: RecognitionAPI;
private endpointId: string;

/**
 * Creates a recognizer configuration.
 *
 * @param platformConfig Platform/device description; when falsy, a SpeechConfig
 *        wrapping an empty Context is used instead.
 * @param recognitionMode Recognition mode. It also selects the recognition
 *        activity timeout: 8000 for Interactive, 25000 for every other mode.
 * @param language Recognition language tag.
 * @param format Result format requested from the service.
 * @param recognitionAPI Service to connect to. Defaults to Bing Speech so callers
 *        written against the earlier four-argument signature keep compiling and
 *        keep their behavior.
 * @param endpointId Custom Speech endpoint id. Optional; left undefined when omitted.
 */
constructor(
    platformConfig: SpeechConfig,
    recognitionMode: RecognitionMode = RecognitionMode.Interactive,
    language: string = "en-us",
    format: SpeechResultFormat = SpeechResultFormat.Simple,
    recognitionAPI: RecognitionAPI = RecognitionAPI.BingSpeech,
    endpointId?: string) {
    this.speechConfig = platformConfig ? platformConfig : new SpeechConfig(new Context(null, null));
    this.recognitionMode = recognitionMode;
    this.language = language;
    this.format = format;
    this.recognitionActivityTimeout = recognitionMode === RecognitionMode.Interactive ? 8000 : 25000;
    this.recognitionAPI = recognitionAPI;
    this.endpointId = endpointId;
}

public get RecognitionMode(): RecognitionMode {
Expand All @@ -52,6 +63,14 @@ export class RecognizerConfig {
public get IsContinuousRecognition(): boolean {
return this.recognitionMode !== RecognitionMode.Interactive;
}

/** The recognition API (Bing Speech or Custom Speech) this configuration was constructed with. */
public get RecognitionAPI(): RecognitionAPI {
return this.recognitionAPI;
}

/** The endpoint id passed to the constructor, returned unchanged. */
public get EndpointId(): string {
return this.endpointId;
}
}

// tslint:disable-next-line:max-classes-per-file
Expand Down