From 7289bbc9c96670fdcc407cbdc751af7623d7c40c Mon Sep 17 00:00:00 2001 From: Stypox Date: Fri, 26 Jul 2024 14:06:16 +0200 Subject: [PATCH 1/5] Implement RecognitionService Dicio now appears in Voice Input --- app/src/main/AndroidManifest.xml | 26 +++++++ .../stypox/dicio/di/SttInputDeviceWrapper.kt | 12 +++- .../stypox/dicio/io/input/SttInputDevice.kt | 4 +- .../dicio/io/input/stt_service/SttService.kt | 72 +++++++++++++++++++ .../dicio/io/input/vosk/VoskInputDevice.kt | 19 ++++- app/src/main/res/xml/stt_service_metadata.xml | 4 ++ 6 files changed, 132 insertions(+), 5 deletions(-) create mode 100644 app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt create mode 100644 app/src/main/res/xml/stt_service_metadata.xml diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml index e1d23d0b..a83cf9c3 100644 --- a/app/src/main/AndroidManifest.xml +++ b/app/src/main/AndroidManifest.xml @@ -22,6 +22,15 @@ + + + + + + + + + + + + + + + diff --git a/app/src/main/kotlin/org/stypox/dicio/di/SttInputDeviceWrapper.kt b/app/src/main/kotlin/org/stypox/dicio/di/SttInputDeviceWrapper.kt index d063b748..d645e831 100644 --- a/app/src/main/kotlin/org/stypox/dicio/di/SttInputDeviceWrapper.kt +++ b/app/src/main/kotlin/org/stypox/dicio/di/SttInputDeviceWrapper.kt @@ -31,7 +31,9 @@ import javax.inject.Singleton interface SttInputDeviceWrapper { val uiState: StateFlow - fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?) + fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?): Boolean + + fun stopListening() fun onClick(eventListener: (InputEvent) -> Unit) } @@ -98,8 +100,12 @@ class SttInputDeviceWrapperImpl( } - override fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?) { - sttInputDevice?.tryLoad(thenStartListeningEventListener) + override fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?): Boolean { + return sttInputDevice?.tryLoad(thenStartListeningEventListener) ?: false + } + + override fun stopListening() { + sttInputDevice?.stopListening() } override fun onClick(eventListener: (InputEvent) -> Unit) { diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/SttInputDevice.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/SttInputDevice.kt index be67a900..4c6eb80e 100644 --- a/app/src/main/kotlin/org/stypox/dicio/io/input/SttInputDevice.kt +++ b/app/src/main/kotlin/org/stypox/dicio/io/input/SttInputDevice.kt @@ -6,7 +6,9 @@ import org.stypox.dicio.ui.home.SttState interface SttInputDevice { val uiState: StateFlow - fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?) + fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?): Boolean + + fun stopListening() fun onClick(eventListener: (InputEvent) -> Unit) diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt new file mode 100644 index 00000000..4edd7d11 --- /dev/null +++ b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt @@ -0,0 +1,72 @@ +package org.stypox.dicio.io.input.stt_service + +import android.content.Intent +import android.os.Bundle +import android.speech.RecognitionService +import android.speech.SpeechRecognizer +import dagger.hilt.android.AndroidEntryPoint +import org.stypox.dicio.di.SttInputDeviceWrapper +import org.stypox.dicio.io.input.InputEvent +import javax.inject.Inject + + +@AndroidEntryPoint +class SttService : RecognitionService() { + + @Inject + lateinit var sttInputDevice: SttInputDeviceWrapper + + override fun onStartListening(recognizerIntent: Intent, listener: Callback) { + var beginningOfSpeech = true + val willStartListening = sttInputDevice.tryLoad { inputEvent -> + when (inputEvent) { + is InputEvent.Error -> { + listener.error(SpeechRecognizer.ERROR_SERVER) + } + is InputEvent.Final -> { + val results = Bundle() + results.putStringArrayList( + SpeechRecognizer.RESULTS_RECOGNITION, + ArrayList(inputEvent.utterances.map { it.first }) + ) + results.putFloatArray( + SpeechRecognizer.CONFIDENCE_SCORES, + inputEvent.utterances.map { it.second }.toFloatArray() + ) + listener.results(results) + listener.endOfSpeech() + } + InputEvent.None -> { + listener.error(SpeechRecognizer.ERROR_NO_MATCH) + listener.endOfSpeech() + } + is InputEvent.Partial -> { + if (beginningOfSpeech) { + listener.beginningOfSpeech() + beginningOfSpeech = false + } + val partResult = Bundle() + partResult.putStringArrayList( + SpeechRecognizer.RESULTS_RECOGNITION, + arrayListOf(inputEvent.utterance) + ) + listener.partialResults(partResult) + } + } + } + + if (!willStartListening) { + // TODO choose better error to indicate that manual intervention is required to + // download the Vosk model + listener.error(SpeechRecognizer.ERROR_NETWORK) + } + } + + override fun onCancel(listener: Callback) { + sttInputDevice.stopListening() + } + + override fun onStopListening(listener: Callback) { + sttInputDevice.stopListening() + } +} diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/vosk/VoskInputDevice.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/vosk/VoskInputDevice.kt index b05fa198..b4f1594a 100644 --- a/app/src/main/kotlin/org/stypox/dicio/io/input/vosk/VoskInputDevice.kt +++ b/app/src/main/kotlin/org/stypox/dicio/io/input/vosk/VoskInputDevice.kt @@ -211,13 +211,20 @@ class VoskInputDevice( * * @param thenStartListeningEventListener if not `null`, causes the [VoskInputDevice] to start * listening after it has finished loading, and the received input events are sent there + * @return `true` if the input device will start listening (or be ready to do so in case + * `thenStartListeningEventListener == null`) at some point, + * `false` if manual user intervention is required to start listening */ - override fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?) { + override fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?): Boolean { val s = _state.value if (s == NotLoaded) { load(thenStartListeningEventListener) + return true } else if (thenStartListeningEventListener != null && s is Loaded) { startListening(s.speechService, thenStartListeningEventListener) + return true + } else { + return false } } @@ -252,6 +259,16 @@ class VoskInputDevice( } } + /** + * If the recognizer is currently listening, stops listening. Otherwise does nothing. + */ + override fun stopListening() { + when (val s = _state.value) { + is Listening -> stopListening(s.speechService, s.eventListener, true) + else -> {} + } + } + /** * Downloads the model zip file. Sets the state to [Downloading], and periodically updates it * with downloading progress, until either [ErrorDownloading] or [Downloaded] are set as state. diff --git a/app/src/main/res/xml/stt_service_metadata.xml b/app/src/main/res/xml/stt_service_metadata.xml new file mode 100644 index 00000000..fd24ec31 --- /dev/null +++ b/app/src/main/res/xml/stt_service_metadata.xml @@ -0,0 +1,4 @@ + + From 635c408f981251ff384633a488023f4fea6d8996 Mon Sep 17 00:00:00 2001 From: Stypox Date: Fri, 26 Jul 2024 14:32:29 +0200 Subject: [PATCH 2/5] Improve error reporting in STT service --- .../dicio/io/input/stt_service/SttService.kt | 44 +++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt index 4edd7d11..dbde83bf 100644 --- a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt +++ b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt @@ -1,12 +1,19 @@ package org.stypox.dicio.io.input.stt_service import android.content.Intent +import android.os.Build import android.os.Bundle import android.speech.RecognitionService +import android.speech.RecognizerIntent import android.speech.SpeechRecognizer +import android.util.Log +import androidx.core.os.LocaleListCompat import dagger.hilt.android.AndroidEntryPoint +import org.stypox.dicio.di.LocaleManager import org.stypox.dicio.di.SttInputDeviceWrapper import org.stypox.dicio.io.input.InputEvent +import org.stypox.dicio.util.LocaleUtils +import java.util.Locale import javax.inject.Inject @@ -16,7 +23,24 @@ class SttService : RecognitionService() { @Inject lateinit var sttInputDevice: SttInputDeviceWrapper + @Inject + lateinit var localeManager: LocaleManager + override fun onStartListening(recognizerIntent: Intent, listener: Callback) { + val wantedLanguageExtra = recognizerIntent.getStringExtra(RecognizerIntent.EXTRA_LANGUAGE) + // "und" is "Undetermined", see https://www.loc.gov/standards/iso639-2/php/code_list.php + if (wantedLanguageExtra != null && wantedLanguageExtra != "und") { + val appLanguage = localeManager.locale.value.language + val wantedLanguage = Locale(wantedLanguageExtra).language + if (appLanguage != wantedLanguage) { + Log.e(TAG, "Unsupported language: app=$appLanguage wanted=$wantedLanguageExtra") + // From the javadoc of ERROR_LANGUAGE_UNAVAILABLE: Requested language is supported, + // but not available currently (e.g. not downloaded yet). + listener.error(ERROR_LANGUAGE_UNAVAILABLE) + return + } + } + var beginningOfSpeech = true val willStartListening = sttInputDevice.tryLoad { inputEvent -> when (inputEvent) { @@ -37,7 +61,7 @@ class SttService : RecognitionService() { listener.endOfSpeech() } InputEvent.None -> { - listener.error(SpeechRecognizer.ERROR_NO_MATCH) + listener.error(SpeechRecognizer.ERROR_SPEECH_TIMEOUT) listener.endOfSpeech() } is InputEvent.Partial -> { @@ -56,9 +80,7 @@ class SttService : RecognitionService() { } if (!willStartListening) { - // TODO choose better error to indicate that manual intervention is required to - // download the Vosk model - listener.error(SpeechRecognizer.ERROR_NETWORK) + listener.error(ERROR_LANGUAGE_UNAVAILABLE) } } @@ -69,4 +91,18 @@ class SttService : RecognitionService() { override fun onStopListening(listener: Callback) { sttInputDevice.stopListening() } + + companion object { + val TAG = SttService::class.simpleName + + /** + * From the javadoc of [SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE]: Requested language is + * supported, but not available currently (e.g. not downloaded yet). + */ + val ERROR_LANGUAGE_UNAVAILABLE = if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) { + SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE + } else { + SpeechRecognizer.ERROR_SERVER + } + } } From 45cd9e38622ad3f0d7a27ee5df139705e4cf844d Mon Sep 17 00:00:00 2001 From: Stypox Date: Fri, 26 Jul 2024 14:43:19 +0200 Subject: [PATCH 3/5] Log remote exceptions in STT service --- .../dicio/io/input/stt_service/SttService.kt | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt index dbde83bf..89a68aeb 100644 --- a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt +++ b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt @@ -3,16 +3,15 @@ package org.stypox.dicio.io.input.stt_service import android.content.Intent import android.os.Build import android.os.Bundle +import android.os.RemoteException import android.speech.RecognitionService import android.speech.RecognizerIntent import android.speech.SpeechRecognizer import android.util.Log -import androidx.core.os.LocaleListCompat import dagger.hilt.android.AndroidEntryPoint import org.stypox.dicio.di.LocaleManager import org.stypox.dicio.di.SttInputDeviceWrapper import org.stypox.dicio.io.input.InputEvent -import org.stypox.dicio.util.LocaleUtils import java.util.Locale import javax.inject.Inject @@ -36,7 +35,7 @@ class SttService : RecognitionService() { Log.e(TAG, "Unsupported language: app=$appLanguage wanted=$wantedLanguageExtra") // From the javadoc of ERROR_LANGUAGE_UNAVAILABLE: Requested language is supported, // but not available currently (e.g. not downloaded yet). - listener.error(ERROR_LANGUAGE_UNAVAILABLE) + logRemoteExceptions { listener.error(ERROR_LANGUAGE_UNAVAILABLE) } return } } @@ -45,9 +44,15 @@ class SttService : RecognitionService() { val willStartListening = sttInputDevice.tryLoad { inputEvent -> when (inputEvent) { is InputEvent.Error -> { - listener.error(SpeechRecognizer.ERROR_SERVER) + logRemoteExceptions { listener.error(SpeechRecognizer.ERROR_SERVER) } } + is InputEvent.Final -> { + if (beginningOfSpeech) { + logRemoteExceptions { listener.beginningOfSpeech() } + beginningOfSpeech = false + } + val results = Bundle() results.putStringArrayList( SpeechRecognizer.RESULTS_RECOGNITION, @@ -57,30 +62,35 @@ class SttService : RecognitionService() { SpeechRecognizer.CONFIDENCE_SCORES, inputEvent.utterances.map { it.second }.toFloatArray() ) - listener.results(results) - listener.endOfSpeech() + + logRemoteExceptions { listener.results(results) } + logRemoteExceptions { listener.endOfSpeech() } } + InputEvent.None -> { - listener.error(SpeechRecognizer.ERROR_SPEECH_TIMEOUT) - listener.endOfSpeech() + logRemoteExceptions { listener.error(SpeechRecognizer.ERROR_SPEECH_TIMEOUT) } + logRemoteExceptions { listener.endOfSpeech() } } + is InputEvent.Partial -> { if (beginningOfSpeech) { - listener.beginningOfSpeech() + logRemoteExceptions { listener.beginningOfSpeech() } beginningOfSpeech = false } + val partResult = Bundle() partResult.putStringArrayList( SpeechRecognizer.RESULTS_RECOGNITION, arrayListOf(inputEvent.utterance) ) - listener.partialResults(partResult) + + logRemoteExceptions { listener.partialResults(partResult) } } } } if (!willStartListening) { - listener.error(ERROR_LANGUAGE_UNAVAILABLE) + logRemoteExceptions { listener.error(ERROR_LANGUAGE_UNAVAILABLE) } } } @@ -104,5 +114,13 @@ class SttService : RecognitionService() { } else { SpeechRecognizer.ERROR_SERVER } + + fun logRemoteExceptions(f: () -> Unit) { + try { + return f() + } catch (e: RemoteException) { + Log.e(TAG, "Remote exception", e) + } + } } } From 726f2d49027ff2fa67dd313a884b1451e60278d3 Mon Sep 17 00:00:00 2001 From: Stypox Date: Fri, 26 Jul 2024 15:04:49 +0200 Subject: [PATCH 4/5] Improve manifest metadata for STT service --- app/src/main/AndroidManifest.xml | 18 +++++------------- .../dicio/io/input/stt_service/SttService.kt | 1 + app/src/main/res/values/strings.xml | 1 + 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml index a83cf9c3..b0544d67 100644 --- a/app/src/main/AndroidManifest.xml +++ b/app/src/main/AndroidManifest.xml @@ -22,15 +22,6 @@ - - - - - - - + android:label="@string/stt_service_label" + android:permission="android.permission.RECORD_AUDIO"> diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt index 89a68aeb..768381ec 100644 --- a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt +++ b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt @@ -90,6 +90,7 @@ class SttService : RecognitionService() { } if (!willStartListening) { + Log.w(TAG, "Could not start STT recognizer") logRemoteExceptions { listener.error(ERROR_LANGUAGE_UNAVAILABLE) } } } diff --git a/app/src/main/res/values/strings.xml b/app/src/main/res/values/strings.xml index 5abf4568..96a7324f 100644 --- a/app/src/main/res/values/strings.xml +++ b/app/src/main/res/values/strings.xml @@ -162,4 +162,5 @@ Timer %1$s expires in %2$s The last timer expires in %1$s OK, no timer was canceled + Dicio offline speech recognition From d2229d37a9e0fd05bcc6166ee3a55df9106d5794 Mon Sep 17 00:00:00 2001 From: Stypox Date: Fri, 26 Jul 2024 15:09:17 +0200 Subject: [PATCH 5/5] Add TODOs --- .../stypox/dicio/io/input/stt_service/SttService.kt | 12 ++++++++++++ app/src/main/res/xml/stt_service_metadata.xml | 1 + 2 files changed, 13 insertions(+) diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt index 768381ec..9396c939 100644 --- a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt +++ b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt @@ -16,6 +16,18 @@ import java.util.Locale import javax.inject.Inject +// TODO this class is really simple at the moment, but many more things could be implemented, e.g.: +// - allowing an SttInputDevice to download/support multiple languages +// - handling more EXTRAs, e.g. EXTRA_LANGUAGE, EXTRA_LANGUAGE_PREFERENCE, +// EXTRA_ONLY_RETURN_LANGUAGE_PREFERENCE, EXTRA_LANGUAGE_MODEL, LANGUAGE_MODEL_FREE_FORM, +// LANGUAGE_MODEL_WEB_SEARCH, EXTRA_SEGMENTED_SESSION, +// EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS, +// EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS, +// EXTRA_SPEECH_INPUT_MINIMUM_LENGTH_MILLIS, EXTRA_AUDIO_SOURCE, EXTRA_AUDIO_SOURCE_CHANNEL_COUNT, +// EXTRA_AUDIO_SOURCE_ENCODING, EXTRA_AUDIO_SOURCE_SAMPLING_RATE, EXTRA_BIASING_STRINGS, +// EXTRA_ENABLE_BIASING_DEVICE_CONTEXT +// - if the SttInputDevice is already busy (e.g. another service is using it, or another part of +// Dicio is using it), that needs to be reported with ERROR_BUSY @AndroidEntryPoint class SttService : RecognitionService() { diff --git a/app/src/main/res/xml/stt_service_metadata.xml b/app/src/main/res/xml/stt_service_metadata.xml index fd24ec31..d3965b3b 100644 --- a/app/src/main/res/xml/stt_service_metadata.xml +++ b/app/src/main/res/xml/stt_service_metadata.xml @@ -1,4 +1,5 @@ +