Merge pull request #227 from Stypox/stt-service-kt

Add RecognitionService, so Dicio appears under Voice Input
Stypox · Jul 28, 2024 · cf5dac2 · cf5dac2
2 parents fa2bc9f + d2229d3
commit cf5dac2
Show file tree

Hide file tree

Showing 7 changed files with 193 additions and 5 deletions.
diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml
@@ -76,5 +76,23 @@
                 <action android:name="android.speech.action.RECOGNIZE_SPEECH" />
             </intent-filter>
         </activity>
+
+        <service
+            android:name=".io.input.stt_service.SttService"
+            android:description="@string/stt_service_label"
+            android:directBootAware="true"
+            android:exported="true"
+            android:foregroundServiceType="microphone"
+            android:icon="@mipmap/ic_launcher"
+            android:label="@string/stt_service_label"
+            android:permission="android.permission.RECORD_AUDIO"> <!-- clients need RECORD_AUDIO to bind -->
+            <intent-filter>
+                <action android:name="android.speech.RecognitionService"/> <!-- offers SttService as a speech recognizer -->
+                <category android:name="android.intent.category.DEFAULT" />
+            </intent-filter>
+            <meta-data
+                android:name="android.speech"
+                android:resource="@xml/stt_service_metadata" /> <!-- <recognition-service> config, incl. settingsActivity -->
+        </service>
     </application>
 </manifest>
diff --git a/app/src/main/kotlin/org/stypox/dicio/di/SttInputDeviceWrapper.kt b/app/src/main/kotlin/org/stypox/dicio/di/SttInputDeviceWrapper.kt
@@ -31,7 +31,9 @@ import javax.inject.Singleton
 interface SttInputDeviceWrapper {
     val uiState: StateFlow<SttState?>
 
-    fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?)
+    fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?): Boolean // will listen?
+
+    fun stopListening() // stop an ongoing listening session; impls here are no-ops otherwise
 
     fun onClick(eventListener: (InputEvent) -> Unit)
 }
@@ -98,8 +100,12 @@ class SttInputDeviceWrapperImpl(
     }
 
 
-    override fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?) {
-        sttInputDevice?.tryLoad(thenStartListeningEventListener)
+    override fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?): Boolean {
+        return sttInputDevice?.tryLoad(thenStartListeningEventListener) ?: false // no device set
+    }
+
+    override fun stopListening() {
+        sttInputDevice?.stopListening() // does nothing when no input device is set
     }
 
     override fun onClick(eventListener: (InputEvent) -> Unit) {

diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/SttInputDevice.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/SttInputDevice.kt
@@ -6,7 +6,9 @@ import org.stypox.dicio.ui.home.SttState
 interface SttInputDevice {
     val uiState: StateFlow<SttState>
 
-    fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?)
+    fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?): Boolean
+
+    fun stopListening()
 
     fun onClick(eventListener: (InputEvent) -> Unit)
 

diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/stt_service/SttService.kt
@@ -0,0 +1,139 @@
+package org.stypox.dicio.io.input.stt_service
+
+import android.content.Intent
+import android.os.Build
+import android.os.Bundle
+import android.os.RemoteException
+import android.speech.RecognitionService
+import android.speech.RecognizerIntent
+import android.speech.SpeechRecognizer
+import android.util.Log
+import dagger.hilt.android.AndroidEntryPoint
+import org.stypox.dicio.di.LocaleManager
+import org.stypox.dicio.di.SttInputDeviceWrapper
+import org.stypox.dicio.io.input.InputEvent
+import java.util.Locale
+import javax.inject.Inject
+
+
+// TODO this class is really simple at the moment, but many more things could be implemented, e.g.:
+//  - allowing an SttInputDevice to download/support multiple languages
+//  - handling more EXTRAs, e.g. EXTRA_LANGUAGE, EXTRA_LANGUAGE_PREFERENCE,
+//  EXTRA_ONLY_RETURN_LANGUAGE_PREFERENCE, EXTRA_LANGUAGE_MODEL, LANGUAGE_MODEL_FREE_FORM,
+//  LANGUAGE_MODEL_WEB_SEARCH, EXTRA_SEGMENTED_SESSION,
+//  EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS,
+//  EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS,
+//  EXTRA_SPEECH_INPUT_MINIMUM_LENGTH_MILLIS, EXTRA_AUDIO_SOURCE, EXTRA_AUDIO_SOURCE_CHANNEL_COUNT,
+//  EXTRA_AUDIO_SOURCE_ENCODING, EXTRA_AUDIO_SOURCE_SAMPLING_RATE, EXTRA_BIASING_STRINGS,
+//  EXTRA_ENABLE_BIASING_DEVICE_CONTEXT
+//  - if the SttInputDevice is already busy (e.g. another service is using it, or another part of
+//  Dicio is using it), that needs to be reported with ERROR_BUSY
+@AndroidEntryPoint
+class SttService : RecognitionService() {
+
+    @Inject
+    lateinit var sttInputDevice: SttInputDeviceWrapper
+
+    @Inject
+    lateinit var localeManager: LocaleManager
+
+    /**
+     * Starts listening on [sttInputDevice], forwarding its [InputEvent]s to [listener]. Reports
+     * [ERROR_LANGUAGE_UNAVAILABLE] if another language is requested or listening cannot start.
+     */
+    override fun onStartListening(recognizerIntent: Intent, listener: Callback) {
+        val wantedLanguageExtra = recognizerIntent.getStringExtra(RecognizerIntent.EXTRA_LANGUAGE)
+        // "und" is "Undetermined", see https://www.loc.gov/standards/iso639-2/php/code_list.php
+        if (wantedLanguageExtra != null && wantedLanguageExtra != "und") {
+            val appLanguage = localeManager.locale.value.language
+            // EXTRA_LANGUAGE is an IETF tag like "en-US", which Locale(String) would not parse
+            val wantedLanguage =
+                Locale.forLanguageTag(wantedLanguageExtra.replace('_', '-')).language
+            if (appLanguage != wantedLanguage) {
+                Log.e(TAG, "Unsupported language: app=$appLanguage wanted=$wantedLanguageExtra")
+                logRemoteExceptions { listener.error(ERROR_LANGUAGE_UNAVAILABLE) }
+                return
+            }
+        }
+
+        var speechStarted = false
+        fun reportSpeechStart() {
+            if (speechStarted) return
+            logRemoteExceptions { listener.beginningOfSpeech() }
+            speechStarted = true
+        }
+
+        val willStartListening = sttInputDevice.tryLoad { inputEvent ->
+            when (inputEvent) {
+                is InputEvent.Error -> {
+                    logRemoteExceptions { listener.error(SpeechRecognizer.ERROR_SERVER) }
+                }
+
+                is InputEvent.Final -> {
+                    reportSpeechStart()
+                    val texts = ArrayList(inputEvent.utterances.map { it.first })
+                    val scores = inputEvent.utterances.map { it.second }.toFloatArray()
+                    val results = Bundle().apply {
+                        putStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION, texts)
+                        putFloatArray(SpeechRecognizer.CONFIDENCE_SCORES, scores)
+                    }
+                    // results() and error() conclude the session, so endOfSpeech() goes first
+                    logRemoteExceptions { listener.endOfSpeech() }
+                    logRemoteExceptions { listener.results(results) }
+                }
+
+                InputEvent.None -> {
+                    logRemoteExceptions { listener.endOfSpeech() }
+                    logRemoteExceptions { listener.error(SpeechRecognizer.ERROR_SPEECH_TIMEOUT) }
+                }
+
+                is InputEvent.Partial -> {
+                    reportSpeechStart()
+                    val partial = arrayListOf(inputEvent.utterance)
+                    val partResult = Bundle().apply {
+                        putStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION, partial)
+                    }
+                    logRemoteExceptions { listener.partialResults(partResult) }
+                }
+            }
+        }
+
+        if (!willStartListening) {
+            Log.w(TAG, "Could not start STT recognizer")
+            logRemoteExceptions { listener.error(ERROR_LANGUAGE_UNAVAILABLE) }
+        }
+    }
+
+    /** Cancels recognition by making [sttInputDevice] stop listening. */
+    override fun onCancel(listener: Callback) {
+        sttInputDevice.stopListening()
+    }
+
+    /** Makes [sttInputDevice] stop listening. */
+    override fun onStopListening(listener: Callback) {
+        sttInputDevice.stopListening()
+    }
+
+    companion object {
+        val TAG = SttService::class.simpleName
+
+        /**
+         * From the javadoc of [SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE]: Requested language is
+         * supported, but not available currently (e.g. not downloaded yet).
+         */
+        val ERROR_LANGUAGE_UNAVAILABLE = if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
+            SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE
+        } else {
+            SpeechRecognizer.ERROR_SERVER
+        }
+
+        /** Runs [f], logging any [RemoteException] it throws instead of propagating it. */
+        fun logRemoteExceptions(f: () -> Unit) {
+            try {
+                f()
+            } catch (e: RemoteException) {
+                Log.e(TAG, "Remote exception", e)
+            }
+        }
+    }
+}
diff --git a/app/src/main/kotlin/org/stypox/dicio/io/input/vosk/VoskInputDevice.kt b/app/src/main/kotlin/org/stypox/dicio/io/input/vosk/VoskInputDevice.kt
@@ -211,13 +211,20 @@ class VoskInputDevice(
      *
      * @param thenStartListeningEventListener if not `null`, causes the [VoskInputDevice] to start
      * listening after it has finished loading, and the received input events are sent there
+     * @return `true` if the input device will start listening (or be ready to do so in case
+     * `thenStartListeningEventListener == null`) at some point,
+     * `false` if manual user intervention is required to start listening
      */
-    override fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?) {
+    override fun tryLoad(thenStartListeningEventListener: ((InputEvent) -> Unit)?): Boolean {
         val s = _state.value
         if (s == NotLoaded) {
             load(thenStartListeningEventListener)
+            return true // load() was triggered, passing the listener (if any) along
         } else if (thenStartListeningEventListener != null && s is Loaded) {
             startListening(s.speechService, thenStartListeningEventListener)
+            return true // startListening() was called on the loaded speech service
+        } else {
+            return false // NOTE(review): Loaded + null listener ends here; KDoc says true?
         }
     }
 
@@ -252,6 +259,16 @@ class VoskInputDevice(
         }
     }
 
+    /**
+     * Stops the recognizer if it is listening at the moment, and is a no-op in any other state.
+     */
+    override fun stopListening() {
+        val current = _state.value
+        if (current is Listening) {
+            stopListening(current.speechService, current.eventListener, true)
+        }
+    }
+
     /**
      * Downloads the model zip file. Sets the state to [Downloading], and periodically updates it
      * with downloading progress, until either [ErrorDownloading] or [Downloaded] are set as state.

diff --git a/app/src/main/res/values/strings.xml b/app/src/main/res/values/strings.xml
@@ -162,4 +162,5 @@
     <string name="skill_timer_query_name">Timer %1$s expires in %2$s</string>
     <string name="skill_timer_query_last">The last timer expires in %1$s</string>
     <string name="skill_timer_none_canceled">OK, no timer was canceled</string>
+    <string name="stt_service_label">Dicio offline speech recognition</string>
 </resources>
diff --git a/app/src/main/res/xml/stt_service_metadata.xml b/app/src/main/res/xml/stt_service_metadata.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- TODO replace MainActivity below with a proper settings activity for the recognition service -->
+<recognition-service
+    xmlns:android="http://schemas.android.com/apk/res/android"
+    android:settingsActivity="org.stypox.dicio.MainActivity" />