From 45515b8760785d487f43da084cbc5f36c4ea01fb Mon Sep 17 00:00:00 2001 From: kridneb <109475719+nebkrid@users.noreply.github.com> Date: Mon, 23 Jan 2023 23:31:50 +0100 Subject: [PATCH 1/5] First implementation of RecognitionService for testing and reviewing so that dicio / vosk is registered in system as speech recognition service which can be queried by other apps without any dicio UI. - splitted VoskInputDevice.java in 3 parts: The dicio recognition service SttService.java using vosk, the SpeechRecogServiceInputDevice.java as a more generalized Input for Dicio and the VoskInputDevice.java which handles downloading of vosk models - added preference option to use system provided stt service for dicio instead of vosk --- app/src/main/AndroidManifest.xml | 59 ++- .../java/org/stypox/dicio/MainActivity.java | 36 +- .../input/SpeechRecogServiceInputDevice.java | 248 +++++++++++ .../stypox/dicio/input/VoskInputDevice.java | 301 +++---------- .../dicio/input/stt_service/SttService.java | 394 ++++++++++++++++++ .../org/stypox/dicio/settings/IOFragment.java | 15 +- app/src/main/res/values/arrays.xml | 2 + app/src/main/res/values/strings.xml | 1 + app/src/main/res/values/strings_keys.xml | 1 + app/src/main/res/xml/stt_service_metadata.xml | 5 + 10 files changed, 782 insertions(+), 280 deletions(-) create mode 100644 app/src/main/java/org/stypox/dicio/input/SpeechRecogServiceInputDevice.java create mode 100644 app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java create mode 100644 app/src/main/res/xml/stt_service_metadata.xml diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml index bce8401cc..c88bf7958 100644 --- a/app/src/main/AndroidManifest.xml +++ b/app/src/main/AndroidManifest.xml @@ -1,47 +1,49 @@ + android:installLocation="auto" > + - - - + - - + android:maxSdkVersion="28" /> - - + tools:ignore="QueryAllPackagesPermission" /> + + + + + + + tools:ignore="GoogleAppIndexingWarning" > + 
android:windowSoftInputMode="stateUnspecified|adjustResize" > - @@ -51,28 +53,45 @@ android:name="com.android.systemui.action_assist_icon" android:resource="@mipmap/ic_launcher" /> - - - - + android:windowSoftInputMode="adjustResize" > + + + + + + + + + + + + \ No newline at end of file diff --git a/app/src/main/java/org/stypox/dicio/MainActivity.java b/app/src/main/java/org/stypox/dicio/MainActivity.java index 0c1fe170e..90bc56c4d 100644 --- a/app/src/main/java/org/stypox/dicio/MainActivity.java +++ b/app/src/main/java/org/stypox/dicio/MainActivity.java @@ -1,8 +1,5 @@ package org.stypox.dicio; -import static android.Manifest.permission.RECORD_AUDIO; -import static android.content.pm.PackageManager.PERMISSION_GRANTED; - import android.content.Intent; import android.content.SharedPreferences; import android.os.Bundle; @@ -13,23 +10,16 @@ import android.widget.ProgressBar; import android.widget.ScrollView; -import androidx.annotation.NonNull; -import androidx.annotation.Nullable; -import androidx.appcompat.app.ActionBarDrawerToggle; -import androidx.appcompat.widget.SearchView; -import androidx.appcompat.widget.Toolbar; -import androidx.core.app.ActivityCompat; -import androidx.core.view.GravityCompat; -import androidx.drawerlayout.widget.DrawerLayout; -import androidx.preference.PreferenceManager; - import com.google.android.material.floatingactionbutton.ExtendedFloatingActionButton; import com.google.android.material.navigation.NavigationView; +import org.dicio.skill.output.GraphicalOutputDevice; +import org.dicio.skill.output.SpeechOutputDevice; import org.stypox.dicio.eval.SkillEvaluator; import org.stypox.dicio.eval.SkillRanker; import org.stypox.dicio.input.InputDevice; import org.stypox.dicio.input.SpeechInputDevice; +import org.stypox.dicio.input.SpeechRecogServiceInputDevice; import org.stypox.dicio.input.ToolbarInputDevice; import org.stypox.dicio.input.VoskInputDevice; import org.stypox.dicio.input.stt_service.SttServiceActivity; @@ -42,8 +32,19 @@ import 
org.stypox.dicio.skills.SkillHandler; import org.stypox.dicio.util.BaseActivity; import org.stypox.dicio.util.PermissionUtils; -import org.dicio.skill.output.GraphicalOutputDevice; -import org.dicio.skill.output.SpeechOutputDevice; + +import androidx.annotation.NonNull; +import androidx.annotation.Nullable; +import androidx.appcompat.app.ActionBarDrawerToggle; +import androidx.appcompat.widget.SearchView; +import androidx.appcompat.widget.Toolbar; +import androidx.core.app.ActivityCompat; +import androidx.core.view.GravityCompat; +import androidx.drawerlayout.widget.DrawerLayout; +import androidx.preference.PreferenceManager; + +import static android.Manifest.permission.RECORD_AUDIO; +import static android.content.pm.PackageManager.PERMISSION_GRANTED; public class MainActivity extends BaseActivity implements NavigationView.OnNavigationItemSelectedListener { @@ -304,6 +305,11 @@ private InputDevice buildPrimaryInputDevice() { .getString(getString(R.string.pref_key_input_method), ""); if (preference.equals(getString(R.string.pref_val_input_method_text))) { return new ToolbarInputDevice(); + } else if (preference.equals(getString(R.string.pref_val_input_method_systemStt))) { + //TODO make a hint/data privacy warning etc. 
in preference when this one is chosen that + // the speech dicio records is given to a third party app according to system + // settings + return new SpeechRecogServiceInputDevice(this); } else { // default return new VoskInputDevice(this); } diff --git a/app/src/main/java/org/stypox/dicio/input/SpeechRecogServiceInputDevice.java b/app/src/main/java/org/stypox/dicio/input/SpeechRecogServiceInputDevice.java new file mode 100644 index 000000000..13dbd7742 --- /dev/null +++ b/app/src/main/java/org/stypox/dicio/input/SpeechRecogServiceInputDevice.java @@ -0,0 +1,248 @@ +package org.stypox.dicio.input; + +import android.app.Activity; +import android.content.Intent; +import android.os.Bundle; +import android.speech.RecognizerIntent; +import android.speech.SpeechRecognizer; +import android.util.Log; +import android.widget.Toast; + +import org.stypox.dicio.R; + +import java.util.ArrayList; + +import androidx.annotation.StringRes; +import androidx.preference.PreferenceManager; + +import static org.stypox.dicio.util.StringUtils.isNullOrEmpty; + +public class SpeechRecogServiceInputDevice extends SpeechInputDevice + implements android.speech.RecognitionListener { + + public static final String TAG = SpeechRecogServiceInputDevice.class.getSimpleName(); + private Activity activity; + + private boolean startListeningOnLoaded = false; + + private SpeechRecognizer speechRecognizer; + private boolean currentlyListening = false; + + + ///////////////////// + // Exposed methods // + ///////////////////// + + public SpeechRecogServiceInputDevice(final Activity activity) { + this.activity = activity; + } + + @Override + public void load() { + load(false); // the user did not press on a button, so manual=false + } + + /** + * @param manual if this is true and the model is not already downloaded, do not start + * downloading it. See {@link #tryToGetInput(boolean)}. 
+ */ + protected void load(final boolean manual) { + if (speechRecognizer == null) { + onLoading(); + speechRecognizer = getRecognizer(); + speechRecognizer.setRecognitionListener(this); + + if (startListeningOnLoaded) { + startListeningOnLoaded = false; + tryToGetInput(manual); + } else { + onInactive(); + } + } + } + + /** + * Initializes the recognizer by calling the appropriate + * {@link SpeechRecognizer}.createSpeechRecognizer(). The default is the system-provided recognizer. + * Override this in case you want to specify a different one. + * @return the {@link SpeechRecognizer} + */ + protected SpeechRecognizer getRecognizer() { + return SpeechRecognizer.createSpeechRecognizer(activity); + } + + /** + * Override this to specify which Intent shall be used in + * {@link SpeechRecognizer}.startListening() + * @return the {@link Intent} according to {@link RecognizerIntent} + */ + protected Intent getRecognizerIntent() { + final Intent i = new Intent(); + i.putExtra(RecognizerIntent.EXTRA_LANGUAGE, PreferenceManager + .getDefaultSharedPreferences(activity) + .getString(activity.getString(R.string.pref_key_language), "en")); + return i; + } + + @Override + public void cleanup() { + super.cleanup(); + cancelGettingInput(); + + activity = null; + } + + @Override + public synchronized void tryToGetInput(final boolean manual) { + if (speechRecognizer == null) { + startListeningOnLoaded = true; + load(manual); // not loaded before, retry + return; // recognizer not ready + } + + super.tryToGetInput(manual); + + Log.d(TAG, "starting recognizer"); + + onLoading(); + speechRecognizer.startListening(getRecognizerIntent()); + currentlyListening = true; + } + + @Override + public void cancelGettingInput() { + if (speechRecognizer != null && currentlyListening) { + //call stoplistening only if it is running!
Otherwise ERROR_CLIENT will be reported + speechRecognizer.cancel(); + } + startListeningOnLoaded = false; + } + + ///////////////////// + // Other utilities // + ///////////////////// + + protected void asyncMakeToast(@StringRes final int message) { + activity.runOnUiThread(() -> + Toast.makeText(activity, activity.getString(message), Toast.LENGTH_SHORT).show()); + } + + + /////////////////////////// + // Recognition Callbacks // + /////////////////////////// + + @Override + public void onReadyForSpeech(final Bundle bundle) { + Log.d(TAG, "onReadyForSpeech"); + onListening(); + currentlyListening = true; + } + + @Override + public void onBeginningOfSpeech() { + //no usecase for dicio + Log.d(TAG, "onBeginningOfSpeech"); + } + + @Override + public void onRmsChanged(final float v) { + //no usecase for dicio + Log.d(TAG, "onRmsChanged"); + } + + @Override + public void onBufferReceived(final byte[] bytes) { + //no usecase for dicio + Log.d(TAG, "onBufferReceived"); + } + + @Override + public void onEndOfSpeech() { + Log.d(TAG, "onEndOfSpeech"); + currentlyListening = false; + onInactive(); + } + + @Override + public void onError(final int i) { + Log.d(TAG, "onError called with error code = " + i); + switch (i) { + case SpeechRecognizer.ERROR_AUDIO: + notifyError(new Throwable("ERROR_AUDIO")); + break; + case SpeechRecognizer.ERROR_CLIENT: + notifyError(new Throwable("ERROR_CLIENT")); + break; + case SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS: + notifyError(new Throwable("ERROR_INSUFFICIENT_PERMISSIONS")); + break; + case SpeechRecognizer.ERROR_LANGUAGE_NOT_SUPPORTED: + notifyError(new Throwable("ERROR_LANGUAGE_NOT_SUPPORTED")); + break; + case SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE: + notifyError(new Throwable("ERROR_LANGUAGE_UNAVAILABLE")); + break; + case SpeechRecognizer.ERROR_NETWORK: + notifyError(new Throwable("ERROR_NETWORK")); + break; + case SpeechRecognizer.ERROR_NETWORK_TIMEOUT: + notifyError(new Throwable("ERROR_NETWORK_TIMEOUT")); + break; + 
case SpeechRecognizer.ERROR_NO_MATCH: + Log.d(TAG, "ERROR_NO_MATCH"); + notifyNoInputReceived(); + break; + case SpeechRecognizer.ERROR_RECOGNIZER_BUSY: + notifyError(new Throwable("ERROR_RECOGNIZER_BUSY")); + break; + case SpeechRecognizer.ERROR_SERVER: + notifyError(new Throwable("ERROR_SERVER")); + break; + case SpeechRecognizer.ERROR_SERVER_DISCONNECTED: + notifyError(new Throwable("ERROR_SERVER_DISCONNECTED")); + break; + case SpeechRecognizer.ERROR_SPEECH_TIMEOUT: + notifyError(new Throwable("ERROR_SPEECH_TIMEOUT")); + break; + case SpeechRecognizer.ERROR_TOO_MANY_REQUESTS: + notifyError(new Throwable("ERROR_TOO_MANY_REQUESTS")); + break; + default: + Log.w(TAG, "onError called with unexpected error code = " + i); + notifyError(new Throwable("Unexpected error code = " + i)); + } + //reset views + onEndOfSpeech(); // e.g. Google does not send this after error like No_Match + + + } + + @Override + public void onResults(final Bundle bundle) { + final ArrayList results = bundle.getStringArrayList( + SpeechRecognizer.RESULTS_RECOGNITION); + Log.d(TAG, "onResult called with s = " + results.toString()); + notifyInputReceived(results); + } + + @Override + public void onPartialResults(final Bundle bundle) { + final ArrayList results = bundle.getStringArrayList( + SpeechRecognizer.RESULTS_RECOGNITION); + Log.d(TAG, "onPartialResult called with s = " + results.toString()); + final String partialInput = results.get(0); + if (!isNullOrEmpty(partialInput)) { + notifyPartialInputReceived(partialInput); + } + } + + @Override + public void onEvent(final int i, final Bundle bundle) { + //android docs: "Reserved for adding future events" + Log.d(TAG, "onEvent"); + } + + + +} diff --git a/app/src/main/java/org/stypox/dicio/input/VoskInputDevice.java b/app/src/main/java/org/stypox/dicio/input/VoskInputDevice.java index eb5011fba..0c9089694 100644 --- a/app/src/main/java/org/stypox/dicio/input/VoskInputDevice.java +++ b/app/src/main/java/org/stypox/dicio/input/VoskInputDevice.java 
@@ -1,60 +1,48 @@ package org.stypox.dicio.input; -import static org.stypox.dicio.util.LocaleUtils.LocaleResolutionResult; -import static org.stypox.dicio.util.LocaleUtils.UnsupportedLocaleException; -import static org.stypox.dicio.util.LocaleUtils.resolveSupportedLocale; -import static org.stypox.dicio.util.StringUtils.isNullOrEmpty; - import android.app.Activity; import android.app.DownloadManager; import android.content.BroadcastReceiver; +import android.content.ComponentName; import android.content.Context; import android.content.Intent; import android.content.IntentFilter; import android.content.SharedPreferences; import android.net.Uri; +import android.speech.SpeechRecognizer; import android.util.Log; -import android.widget.Toast; - -import androidx.annotation.Nullable; -import androidx.annotation.StringRes; -import androidx.core.os.LocaleListCompat; -import androidx.preference.PreferenceManager; -import org.stypox.dicio.BuildConfig; import org.stypox.dicio.R; import org.stypox.dicio.Sections; -import org.json.JSONException; -import org.json.JSONObject; -import org.vosk.LibVosk; -import org.vosk.LogLevel; -import org.vosk.Model; -import org.vosk.Recognizer; -import org.vosk.android.RecognitionListener; -import org.vosk.android.SpeechService; +import org.stypox.dicio.input.stt_service.SttService; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; -import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import androidx.annotation.Nullable; +import androidx.core.os.LocaleListCompat; +import androidx.preference.PreferenceManager; import io.reactivex.rxjava3.android.schedulers.AndroidSchedulers; import io.reactivex.rxjava3.core.Completable; import io.reactivex.rxjava3.disposables.CompositeDisposable; import io.reactivex.rxjava3.schedulers.Schedulers; -public class 
VoskInputDevice extends SpeechInputDevice { +import static org.stypox.dicio.util.LocaleUtils.LocaleResolutionResult; +import static org.stypox.dicio.util.LocaleUtils.UnsupportedLocaleException; +import static org.stypox.dicio.util.LocaleUtils.resolveSupportedLocale; + +public class VoskInputDevice extends SpeechRecogServiceInputDevice { public static final String TAG = VoskInputDevice.class.getSimpleName(); public static final String MODEL_PATH = "/vosk-model"; public static final String MODEL_ZIP_FILENAME = "model.zip"; - public static final float SAMPLE_RATE = 44100.0f; /** * All small models from Vosk @@ -92,18 +80,13 @@ public class VoskInputDevice extends SpeechInputDevice { private final CompositeDisposable disposables = new CompositeDisposable(); @Nullable private BroadcastReceiver downloadingBroadcastReceiver = null; private Long currentModelDownloadId = null; - @Nullable private SpeechService speechService = null; - - private boolean currentlyInitializingRecognizer = false; - private boolean startListeningOnLoaded = false; - private boolean currentlyListening = false; - ///////////////////// // Exposed methods // ///////////////////// public VoskInputDevice(final Activity activity) { + super(activity); this.activity = activity; } @@ -116,84 +99,71 @@ public void load() { * @param manual if this is true and the model is not already downloaded, do not start * downloading it. See {@link #tryToGetInput(boolean)}. 
*/ - private void load(final boolean manual) { - if (speechService == null && !currentlyInitializingRecognizer) { - if (new File(getModelDirectory(), "ivector").exists()) { - // one directory is in the correct place, so everything should be ok - Log.d(TAG, "Vosk model in place"); - - currentlyInitializingRecognizer = true; - onLoading(); - - disposables.add(Completable.fromAction(this::initializeRecognizer) - .subscribeOn(Schedulers.io()) - .observeOn(AndroidSchedulers.mainThread()) - .subscribe(() -> { - currentlyInitializingRecognizer = false; - if (startListeningOnLoaded) { - startListeningOnLoaded = false; - tryToGetInput(manual); - } else { - onInactive(); - } - }, throwable -> { - currentlyInitializingRecognizer = false; - if ("Failed to initialize recorder. Microphone might be already in use." - .equals(throwable.getMessage())) { - notifyError(new UnableToAccessMicrophoneException()); - } else { - notifyError(throwable); - } - onInactive(); - })); - - } else { - Log.d(TAG, "Vosk model not in place"); - final DownloadManager downloadManager = - (DownloadManager) activity.getSystemService(Context.DOWNLOAD_SERVICE); - - if (currentModelDownloadId == null) { - Log.d(TAG, "Vosk model is not already being downloaded"); - - if (manual) { - // the model needs to be downloaded and no download has already started; - // the user manually triggered the input device, so he surely wants the - // model to be downloaded, so we can proceed - onLoading(); - try { - final LocaleResolutionResult result = resolveSupportedLocale( - LocaleListCompat.create(Sections.getCurrentLocale()), - MODEL_URLS.keySet()); - startDownloadingModel(downloadManager, result.supportedLocaleString); - } catch (final UnsupportedLocaleException e) { - asyncMakeToast(R.string.vosk_model_unsupported_language); - e.printStackTrace(); - onRequiresDownload(); - } + protected void load(final boolean manual) { + if (new File(getModelDirectory(), "ivector").exists()) { + // one directory is in the correct 
place, so everything should be ok + Log.d(TAG, "Vosk model in place"); + super.load(manual); + } else { + Log.d(TAG, "Vosk model not in place"); + final DownloadManager downloadManager = + (DownloadManager) activity.getSystemService(Context.DOWNLOAD_SERVICE); - } else { - // loading the model would require downloading it, but the user didn't - // explicitly tell the voice recognizer to download files, so notify them - // that a download is required + if (currentModelDownloadId == null) { + Log.d(TAG, "Vosk model is not already being downloaded"); + + if (manual) { + // the model needs to be downloaded and no download has already started; + // the user manually triggered the input device, so he surely wants the + // model to be downloaded, so we can proceed + onLoading(); + try { + final LocaleResolutionResult result = resolveSupportedLocale( + LocaleListCompat.create(Sections.getCurrentLocale()), + MODEL_URLS.keySet()); + startDownloadingModel(downloadManager, result.supportedLocaleString); + } catch (final UnsupportedLocaleException e) { + asyncMakeToast(R.string.vosk_model_unsupported_language); + e.printStackTrace(); onRequiresDownload(); } } else { - Log.d(TAG, "Vosk model already being downloaded: " + currentModelDownloadId); + // loading the model would require downloading it, but the user didn't + // explicitly tell the voice recognizer to download files, so notify them + // that a download is required + onRequiresDownload(); } + + } else { + Log.d(TAG, "Vosk model already being downloaded: " + currentModelDownloadId); } } } + @Override + protected SpeechRecognizer getRecognizer() { + SpeechRecognizer sr = SpeechRecognizer.createSpeechRecognizer(activity, + new ComponentName(activity, SttService.class)); + //additionally call startService so that service is not directly destroyed after + //speech recognizer is unbound (especially important if SttServiceActivity is + // only called from other apps. 
If dicio app is closed, service is destroyed anyway, + // too. Avoid destroying it in order to avoid re-initialization of SpeechService + //(observed when manually closed - check if this happens too when closed by system + // due to inactivity) + //works also when battery optimization is enabled + //TODO check long term behaviour with and without battery optimization + //TODO check how to call startService if neither Dicio Main app nor + // Dicio's SttServiceActivity is called but directly + // SpeechRecognizer.createSpeechRecognizer by a 3rd party app + activity.startService(new Intent(activity, SttService.class)); + return sr; + } + @Override public void cleanup() { super.cleanup(); disposables.clear(); - if (speechService != null) { - speechService.stop(); - speechService.shutdown(); - speechService = null; - } if (currentModelDownloadId != null) { final DownloadManager downloadManager = @@ -209,117 +179,6 @@ public void cleanup() { activity = null; } - @Override - public synchronized void tryToGetInput(final boolean manual) { - if (currentlyInitializingRecognizer) { - startListeningOnLoaded = true; - return; - } else if (speechService == null) { - startListeningOnLoaded = true; - load(manual); // not loaded before, retry - return; // recognizer not ready - } - - if (currentlyListening) { - return; - } - currentlyListening = true; - super.tryToGetInput(manual); - - Log.d(TAG, "starting recognizer"); - - speechService.startListening(new RecognitionListener() { - - @Override - public void onPartialResult(final String s) { - Log.d(TAG, "onPartialResult called with s = " + s); - if (!currentlyListening) { - return; - } - - String partialInput = null; - try { - partialInput = new JSONObject(s).getString("partial"); - } catch (final JSONException e) { - e.printStackTrace(); - } - - if (!isNullOrEmpty(partialInput)) { - notifyPartialInputReceived(partialInput); - } - } - - @Override - public void onResult(final String s) { - Log.d(TAG, "onResult called with s = " + s); - if
(!currentlyListening) { - return; - } - - stopRecognizer(); - - final ArrayList inputs = new ArrayList<>(); - try { - final JSONObject jsonResult = new JSONObject(s); - final int size = jsonResult.getJSONArray("alternatives").length(); - for (int i = 0; i < size; i++) { - final String text = jsonResult.getJSONArray("alternatives") - .getJSONObject(i).getString("text"); - if (!isNullOrEmpty(text)) { - inputs.add(text); - } - } - } catch (final JSONException e) { - e.printStackTrace(); - } - - if (inputs.isEmpty()) { - notifyNoInputReceived(); - } else { - notifyInputReceived(inputs); - } - } - - @Override - public void onFinalResult(final String s) { - Log.d(TAG, "onFinalResult called with s = " + s); - // TODO - } - - @Override - public void onError(final Exception e) { - Log.d(TAG, "onError called"); - stopRecognizer(); - notifyError(e); - } - - @Override - public void onTimeout() { - Log.d(TAG, "onTimeout called"); - stopRecognizer(); - notifyNoInputReceived(); - } - }); - onListening(); - } - - @Override - public void cancelGettingInput() { - if (currentlyListening) { - if (speechService != null) { - speechService.stop(); - } - notifyNoInputReceived(); - - // call onInactive() only if we really were listening, so that the SpeechInputDevice - // state icon is preserved if something different from "microphone on" was being shown - onInactive(); - } - - startListeningOnLoaded = false; - currentlyListening = false; - } - /** * Deletes the Vosk model downloaded in the {@link Context#getFilesDir()} if it exists. It also * stops any Vosk model download currently in progress based on the id stored in settings. @@ -337,31 +196,6 @@ public static void deleteCurrentModel(final Context context) { } - //////////////////// - // Initialization // - //////////////////// - - private synchronized void initializeRecognizer() throws IOException { - Log.d(TAG, "initializing recognizer"); - - LibVosk.setLogLevel(BuildConfig.DEBUG ? 
LogLevel.DEBUG : LogLevel.WARNINGS); - final Model model = new Model(getModelDirectory().getAbsolutePath()); - final Recognizer recognizer = new Recognizer(model, SAMPLE_RATE); - recognizer.setMaxAlternatives(5); - this.speechService = new SpeechService(recognizer, SAMPLE_RATE); - } - - private void stopRecognizer() { - currentlyListening = false; - - if (speechService != null) { - speechService.stop(); - } - - onInactive(); - } - - //////////////////// // Model download // //////////////////// @@ -560,13 +394,4 @@ private void updateCurrentDownloadId(final Context context, final Long id) { } } - - ///////////////////// - // Other utilities // - ///////////////////// - - private void asyncMakeToast(@StringRes final int message) { - activity.runOnUiThread(() -> - Toast.makeText(activity, activity.getString(message), Toast.LENGTH_SHORT).show()); - } } diff --git a/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java b/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java new file mode 100644 index 000000000..244fb75d2 --- /dev/null +++ b/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java @@ -0,0 +1,394 @@ +package org.stypox.dicio.input.stt_service; + +import android.content.Intent; +import android.os.Build; +import android.os.Bundle; +import android.os.RemoteException; +import android.speech.RecognitionService; +import android.speech.RecognizerIntent; +import android.speech.SpeechRecognizer; +import android.util.Log; +import android.widget.Toast; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import org.stypox.dicio.BuildConfig; +import org.stypox.dicio.R; +import org.vosk.LibVosk; +import org.vosk.LogLevel; +import org.vosk.Model; +import org.vosk.Recognizer; +import org.vosk.android.SpeechService; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; + +import androidx.annotation.Nullable; +import 
io.reactivex.rxjava3.android.schedulers.AndroidSchedulers; +import io.reactivex.rxjava3.core.Completable; +import io.reactivex.rxjava3.disposables.CompositeDisposable; +import io.reactivex.rxjava3.schedulers.Schedulers; + +import static org.stypox.dicio.util.StringUtils.isNullOrEmpty; + +public class SttService extends RecognitionService { + protected class RecognitionListener implements org.vosk.android.RecognitionListener { + private boolean firstPartialResultReceived = false; + + @Override + public void onPartialResult(final String s) { + Log.d(TAG, "onPartialResult called with s = " + s); + + String partialInput = null; + try { + partialInput = new JSONObject(s).getString("partial"); + } catch (final JSONException e) { + e.printStackTrace(); + } + + if (!isNullOrEmpty(partialInput)) { + if (!firstPartialResultReceived) { + firstPartialResultReceived = true; + try { + callback.beginningOfSpeech(); + } catch (final RemoteException e) { + logRemoteException(e); + } + } + final String[] partialInputArray = {partialInput}; + final Bundle partResult = new Bundle(); + partResult.putStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION, + new ArrayList<>(Arrays.asList(partialInputArray))); + try { + callback.partialResults(partResult); + } catch (final RemoteException e) { + logRemoteException(e); + } + } + } + + @Override + public void onResult(final String s) { + Log.d(TAG, "onResult called with s = " + s); + + stopRecognizer(); + + final ArrayList inputs = new ArrayList<>(); + float[] confidences = null; + try { + final JSONObject jsonResult = new JSONObject(s); + final JSONArray alternatives = jsonResult.getJSONArray("alternatives"); + int size = alternatives.length(); + for (int i = 0; i < size; i++) { + final String text = alternatives.getJSONObject(i).getString("text"); + if (!isNullOrEmpty(text)) { + inputs.add(text); + } + } + //final size may change if empty entries exist + size = inputs.size(); + confidences = new float[size]; + for (int i = 0; i < size; i++) 
{ + confidences[i] = (float) alternatives.getJSONObject(i) + .getDouble("confidence"); + } + + } catch (final JSONException e) { + e.printStackTrace(); + } + + if (inputs.isEmpty()) { + try { + callback.error(SpeechRecognizer.ERROR_NO_MATCH); + } catch (final RemoteException e) { + logRemoteException(e); + } + } else { + final Bundle results = new Bundle(); + results.putStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION, inputs); + results.putFloatArray(SpeechRecognizer.CONFIDENCE_SCORES, confidences); + try { + callback.results(results); + } catch (final RemoteException e) { + logRemoteException(e); + } + } + } + + @Override + public void onFinalResult(final String s) { + Log.d(TAG, "onFinalResult called with s = " + s); + firstPartialResultReceived = false; //reset for next input + try { + //only notify endOfSpeech because s is currently always empty - even if onResult + // was not empty before + callback.endOfSpeech(); + } catch (final RemoteException e) { + logRemoteException(e); + } + } + + @Override + public void onError(final Exception e) { + Log.e(TAG, "onError", e); + stopRecognizer(); + try { + //The Error message is quite general because there is no "generic error code" + callback.error(SpeechRecognizer.ERROR_SERVER); + } catch (final RemoteException ex) { + Log.e(TAG, "onError", e); + } + } + + @Override + public void onTimeout() { + Log.d(TAG, "onTimeout called"); + stopRecognizer(); + try { + callback.error(SpeechRecognizer.ERROR_SPEECH_TIMEOUT); + } catch (final RemoteException e) { + logRemoteException(e); + } + } + } + + /** + docs of SpeechService + ... 
+ */ + @Nullable + private SpeechService speechService = null; + private boolean currentlyInitializingRecognizer = false; + public static final String MODEL_PATH = "/vosk-model"; + public static final String TAG = SttService.class.getSimpleName(); + private final CompositeDisposable disposables = new CompositeDisposable(); + public static final float SAMPLE_RATE = 44100.0f; + private boolean currentlyListening = false; + private boolean startListeningOnLoaded = false; + private boolean onStartCommandCalled = false; + private Intent lastRequestedIntent = null; + Callback callback; + +//TODO support onCheckRecognitionSupport +//TODO support onTriggerModelDownload + + @Override + public void onCreate() { + super.onCreate(); + load(); + Log.d(TAG, "onCreate"); + } + + @Override + public int onStartCommand(final Intent intent, final int flags, final int startId) { + Log.d(TAG, "onStartCommand"); + onStartCommandCalled = true; + return super.onStartCommand(intent, flags, startId); + } + + @Override + public boolean onUnbind(final Intent intent) { + Log.d(TAG, "onUnbind"); + return super.onUnbind(intent); + } + + + @Override + public void onRebind(final Intent intent) { + Log.d(TAG, "onRebind"); + super.onRebind(intent); + } + + + @Override + public void onDestroy() { + Log.d(TAG, "onDestroy"); + disposables.clear(); + if (speechService != null) { + stopRecognizer(); + speechService.shutdown(); + speechService = null; + } + super.onDestroy(); + } + + @Override + protected void onStartListening(final Intent intent, final Callback newCallback) { + Log.d(TAG, "onStartListening"); + Log.d(TAG, "onStartCommand called is " + onStartCommandCalled); + //TODO remove toast or make different type of speech recognition hint or a preference option + // to disable + Toast.makeText(this, this.getString(R.string.pref_input_method_vosk), + Toast.LENGTH_SHORT).show(); + //TODO maybe check here for audio permission of the caller (but already in manifest of this + // service declared => 
should not happen?): Need a test app without permission + // https://developer.android.com/reference/android/speech/RecognitionService# + // onStartListening(android.content.Intent,%20android.speech.RecognitionService.Callback) + this.callback = newCallback; + lastRequestedIntent = intent; + tryToGetInput(); + + //TODO support Intent Extras if possible with vosk + // EXTRA_LANGUAGE / EXTRA_LANGUAGE_PREFERENCE / EXTRA_ONLY_RETURN_LANGUAGE_PREFERENCE + // Further Extras which may be interesting + // EXTRA_LANGUAGE_MODEL / LANGUAGE_MODEL_FREE_FORM / LANGUAGE_MODEL_WEB_SEARCH + // EXTRA_SEGMENTED_SESSION + // EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS / + // EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS + // EXTRA_SPEECH_INPUT_MINIMUM_LENGTH_MILLIS + // EXTRA_AUDIO_SOURCE / EXTRA_AUDIO_SOURCE_CHANNEL_COUNT / + // EXTRA_AUDIO_SOURCE_ENCODING / EXTRA_AUDIO_SOURCE_SAMPLING_RATE + // EXTRA_BIASING_STRINGS + // EXTRA_ENABLE_BIASING_DEVICE_CONTEXT + + } + + @Override + protected void onCancel(final Callback newCallback) { + Log.d(TAG, "onCancel"); + stopRecognizer(); + } + + @Override + protected void onStopListening(final Callback newCallback) { + Log.d(TAG, "onStopListening"); + if (currentlyListening) { + stopRecognizer(); + } + } + + + + + + private void load() { + if (speechService == null && !currentlyInitializingRecognizer) { + if (new File(getModelDirectory(), "ivector").exists()) { + // one directory is in the correct place, so everything should be ok + Log.d(TAG, "Vosk model in place"); + + currentlyInitializingRecognizer = true; + + disposables.add(Completable.fromAction(this::initializeRecognizer) + .subscribeOn(Schedulers.io()) + .observeOn(AndroidSchedulers.mainThread()) + .subscribe(() -> { + currentlyInitializingRecognizer = false; + if (startListeningOnLoaded) { + startListeningOnLoaded = false; + tryToGetInput(); + } + }, throwable -> { + currentlyInitializingRecognizer = false; + if ("Failed to initialize recorder. 
Microphone might be already in use." + .equals(throwable.getMessage())) { + callback.error(SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS); + } else { + Log.e(TAG, "load()->initializeRecognizer", throwable); + callback.error(SpeechRecognizer.ERROR_SERVER); + } + })); + + } else { + try { + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) { + callback.error(SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE); + } else { + callback.error(SpeechRecognizer.ERROR_SERVER); + } + } catch (final RemoteException e) { + logRemoteException(e); + } + } + } + } + public synchronized void tryToGetInput() { + if (currentlyInitializingRecognizer) { + startListeningOnLoaded = true; + return; + } else if (speechService == null) { + try { + callback.error(SpeechRecognizer.ERROR_SERVER); + } catch (final RemoteException e) { + logRemoteException(e); + } + return; // recognizer not ready + } + //(only one client can be connected via system to speech recognizer (otherwise + // ERROR_BUSY seems to be reported) - check whether currently listening checks are + // necessary at all) - on the other hand they do not harm + if (currentlyListening) { + try { + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) { + callback.error(SpeechRecognizer.ERROR_TOO_MANY_REQUESTS); + } else { + //more generic + callback.error(SpeechRecognizer.ERROR_SERVER); + } + } catch (final RemoteException e) { + logRemoteException(e); + } + return; + } + + currentlyListening = true; + Log.d(TAG, "starting recognizer"); + + speechService.startListening(new RecognitionListener()); + + try { + callback.readyForSpeech(null); + } catch (final RemoteException e) { + logRemoteException(e); + } + } + + private void logRemoteException(final RemoteException e) { + Log.e(TAG, "Remote exception on callback information", e); + } + + private File getModelDirectory() { + return new File(this.getFilesDir(), MODEL_PATH); + } + + + //////////////////// + // Vosk Initialization // + //////////////////// + + private synchronized void 
initializeRecognizer() throws IOException { + Log.d(TAG, "initializing recognizer"); + + LibVosk.setLogLevel(BuildConfig.DEBUG ? LogLevel.DEBUG : LogLevel.WARNINGS); + final Model model = new Model(getModelDirectory().getAbsolutePath()); + final Recognizer recognizer = new Recognizer(model, SAMPLE_RATE); + recognizer.setMaxAlternatives( + lastRequestedIntent.getIntExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 5)); + this.speechService = new SpeechService(recognizer, SAMPLE_RATE); + } + + /** + * safe to call even if speechService is null or recognition is not active + */ + private void stopRecognizer() { + if (speechService != null) { + speechService.stop(); //does nothing if recognition is not active. + } else if (currentlyListening) { + //(actually currentlyListening should never be true at this point-however does not harm) + //means SpeechRecognizer.startListening was called, but endOfSpeech not yet + // make sure to free resources so that speech recognizer is not supposed to be busy + try { + callback.endOfSpeech(); + } catch (final RemoteException e) { + logRemoteException(e); + } + } + currentlyListening = false; + + + } +} diff --git a/app/src/main/java/org/stypox/dicio/settings/IOFragment.java b/app/src/main/java/org/stypox/dicio/settings/IOFragment.java index b0c5e19b5..2f50cc16d 100644 --- a/app/src/main/java/org/stypox/dicio/settings/IOFragment.java +++ b/app/src/main/java/org/stypox/dicio/settings/IOFragment.java @@ -2,11 +2,11 @@ import android.os.Bundle; -import androidx.preference.PreferenceFragmentCompat; - import org.stypox.dicio.R; import org.stypox.dicio.input.VoskInputDevice; +import androidx.preference.PreferenceFragmentCompat; + public class IOFragment extends PreferenceFragmentCompat { @Override public void onCreatePreferences(final Bundle savedInstanceState, final String rootKey) { @@ -20,11 +20,12 @@ public void onCreatePreferences(final Bundle savedInstanceState, final String ro } return true; }); - findPreference(getString(R.string.pref_key_input_method)) -
.setOnPreferenceChangeListener((preference, newValue) -> { - VoskInputDevice.deleteCurrentModel(requireContext()); - return true; - }); +//TODO Discuss whether this is needed. At least for debugging commented +// findPreference(getString(R.string.pref_key_input_method)) +// .setOnPreferenceChangeListener((preference, newValue) -> { +// VoskInputDevice.deleteCurrentModel(requireContext()); +// return true; +// }); } @Override diff --git a/app/src/main/res/values/arrays.xml b/app/src/main/res/values/arrays.xml index 3fc982afc..c220794a8 100644 --- a/app/src/main/res/values/arrays.xml +++ b/app/src/main/res/values/arrays.xml @@ -39,10 +39,12 @@ @string/pref_input_method_vosk @string/pref_input_method_text + @string/pref_input_method_systemStt @string/pref_val_input_method_vosk @string/pref_val_input_method_text + @string/pref_val_input_method_systemStt diff --git a/app/src/main/res/values/strings.xml b/app/src/main/res/values/strings.xml index a25938f2e..2f35808f8 100644 --- a/app/src/main/res/values/strings.xml +++ b/app/src/main/res/values/strings.xml @@ -53,6 +53,7 @@ Input method Choose the service to use to talk to Dicio - %1$s Text box + System provided text-to-speech-service (speech is handled outside Dicio) Vosk offline speech recognition Speech output method Choose the service Dicio should use to talk to you - %1$s diff --git a/app/src/main/res/values/strings_keys.xml b/app/src/main/res/values/strings_keys.xml index dcbab3204..03bb7adfd 100644 --- a/app/src/main/res/values/strings_keys.xml +++ b/app/src/main/res/values/strings_keys.xml @@ -9,6 +9,7 @@ input_method text vosk + sytemStt speech_output_method android diff --git a/app/src/main/res/xml/stt_service_metadata.xml b/app/src/main/res/xml/stt_service_metadata.xml new file mode 100644 index 000000000..776b6335d --- /dev/null +++ b/app/src/main/res/xml/stt_service_metadata.xml @@ -0,0 +1,5 @@ + + + \ No newline at end of file From 2eb87ebb8a0509efb522fdf6892099cdaf1d3aaa Mon Sep 17 00:00:00 2001 From: 
kridneb <109475719+nebkrid@users.noreply.github.com> Date: Sun, 5 Feb 2023 10:36:25 +0100 Subject: [PATCH 2/5] - Bugfix: Crashes in background - Bugfix: Load new model when language changed - Bugfix: Crash when no model is downloaded - Implemented error message notifications for analyzing errors when in background - Audio Permission requirement in manifest declaration of the STT service removed, since it may cause crashes in calling app instead of reporting ERROR_INSUFFICIENT_PERMISSIONS --- app/src/main/AndroidManifest.xml | 10 +- .../dicio/input/stt_service/SttService.java | 246 +++++++++++++----- 2 files changed, 187 insertions(+), 69 deletions(-) diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml index dfc685c0c..82fc06f32 100644 --- a/app/src/main/AndroidManifest.xml +++ b/app/src/main/AndroidManifest.xml @@ -5,13 +5,16 @@ - + + + android:maxSdkVersion="28" /> + + tools:ignore="QueryAllPackagesPermission" /> + @@ -77,7 +80,6 @@ android:exported="true" android:description="@string/pref_input_method_vosk" android:icon="@mipmap/ic_launcher" - android:permission="android.permission.RECORD_AUDIO" > diff --git a/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java b/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java index 244fb75d2..b660171b0 100644 --- a/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java +++ b/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java @@ -15,6 +15,9 @@ import org.json.JSONObject; import org.stypox.dicio.BuildConfig; import org.stypox.dicio.R; +import org.stypox.dicio.error.ErrorInfo; +import org.stypox.dicio.error.ErrorUtils; +import org.stypox.dicio.error.UserAction; import org.vosk.LibVosk; import org.vosk.LogLevel; import org.vosk.Model; @@ -101,11 +104,7 @@ public void onResult(final String s) { } if (inputs.isEmpty()) { - try { - callback.error(SpeechRecognizer.ERROR_NO_MATCH); - } catch (final RemoteException e) { -
logRemoteException(e); - } + callbackErrorReport(SpeechRecognizer.ERROR_NO_MATCH); } else { final Bundle results = new Bundle(); results.putStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION, inputs); @@ -134,24 +133,16 @@ public void onFinalResult(final String s) { @Override public void onError(final Exception e) { Log.e(TAG, "onError", e); + showErrorNotification(e); stopRecognizer(); - try { - //The Error message is quite general because there is no "generic error code" - callback.error(SpeechRecognizer.ERROR_SERVER); - } catch (final RemoteException ex) { - Log.e(TAG, "onError", e); - } + callbackErrorReport(SpeechRecognizer.ERROR_SERVER); } @Override public void onTimeout() { Log.d(TAG, "onTimeout called"); stopRecognizer(); - try { - callback.error(SpeechRecognizer.ERROR_SPEECH_TIMEOUT); - } catch (final RemoteException e) { - logRemoteException(e); - } + callbackErrorReport(SpeechRecognizer.ERROR_SPEECH_TIMEOUT); } } @@ -162,6 +153,8 @@ public void onTimeout() { */ @Nullable private SpeechService speechService = null; + private Model model; + private long modelDownloadDate; private boolean currentlyInitializingRecognizer = false; public static final String MODEL_PATH = "/vosk-model"; public static final String TAG = SttService.class.getSimpleName(); @@ -173,13 +166,11 @@ public void onTimeout() { private Intent lastRequestedIntent = null; Callback callback; -//TODO support onCheckRecognitionSupport -//TODO support onTriggerModelDownload - @Override public void onCreate() { super.onCreate(); - load(); + LibVosk.setLogLevel(BuildConfig.DEBUG ? 
LogLevel.DEBUG : LogLevel.WARNINGS); + initialize(); Log.d(TAG, "onCreate"); } @@ -208,11 +199,7 @@ public void onRebind(final Intent intent) { public void onDestroy() { Log.d(TAG, "onDestroy"); disposables.clear(); - if (speechService != null) { - stopRecognizer(); - speechService.shutdown(); - speechService = null; - } + shutdownSpeechService(); super.onDestroy(); } @@ -220,18 +207,80 @@ public void onDestroy() { protected void onStartListening(final Intent intent, final Callback newCallback) { Log.d(TAG, "onStartListening"); Log.d(TAG, "onStartCommand called is " + onStartCommandCalled); + this.callback = newCallback; + //TODO check permission. Actually it seems this is already done by the system interface + // (reports SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS) , but it is + // explicitly recommended in the SpeechRecognizer documentation. However the way it is in + // the docs does not work here due to API Level for requested calls (and since Audio + // Recorder is not directly implemented here but by vosk library) + // https://developer.android.com/reference/android/speech/RecognitionService + // However even if there is a way for app without permission, not a security issue since + // stt service notifies user when speech input is started + if (android.os.Build.VERSION.SDK_INT >= android.os.Build.VERSION_CODES.M) { + final String callingPackageName = getPackageManager().getPackagesForUid( + newCallback.getCallingUid())[0]; +//Not working this way - check fails even for dicio +// int permissionState = PermissionChecker.checkCallingPermission(this, +// "android.permission.RECORD_AUDIO", callingPackageName); +// if (permissionState != PermissionChecker.PERMISSION_GRANTED){ +// callbackErrorReport(SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS); +// return; +// } + } + if (speechService != null && !recogIntentExtrasEquals(lastRequestedIntent, intent)) { + shutdownSpeechService(); + if (intent.hasExtra(RecognizerIntent.EXTRA_LANGUAGE)) { + //check if 
language change is the reason + Log.d(TAG, "requested language = " + + intent.getStringExtra(RecognizerIntent.EXTRA_LANGUAGE)); + if (!lastRequestedIntent.hasExtra(RecognizerIntent.EXTRA_LANGUAGE) + || !lastRequestedIntent.getStringExtra(RecognizerIntent.EXTRA_LANGUAGE) + .equals(intent.getStringExtra(RecognizerIntent.EXTRA_LANGUAGE))) { + //Since at the moment only one language at the time is supported, just check + // whether the downloaded model has changed. Otherwise use the language which + // is installed anyway + if (getModelDirectory().lastModified() != modelDownloadDate) { + Log.d(TAG, "model last modified " + getModelDirectory().lastModified()); + Log.d(TAG, "model_download_date " + modelDownloadDate); + model = null; //forces reloading + shutdownSpeechService(); //forces reloading + } + } + } + } + lastRequestedIntent = intent; + //TODO remove toast or make different type of speech recognition hint or a preference option // to disable Toast.makeText(this, this.getString(R.string.pref_input_method_vosk), Toast.LENGTH_SHORT).show(); - //TODO maybe check here for audio permission of the caller (but already in manifest of this - // service declared => should not happen?): Need a test app without permission - // https://developer.android.com/reference/android/speech/RecognitionService# - // onStartListening(android.content.Intent,%20android.speech.RecognitionService.Callback) - this.callback = newCallback; - lastRequestedIntent = intent; tryToGetInput(); + } + + /** + * in order to identify whether a new recognizer has to be loaded or not + * @return true if all Extras, which are supported by this STT service, are equal + */ + protected boolean recogIntentExtrasEquals(final Intent i1, final Intent i2) { + final Bundle ie1 = i1.getExtras(); + final Bundle ie2 = i2.getExtras(); + final String[] supportedExtras = {RecognizerIntent.EXTRA_LANGUAGE, + RecognizerIntent.EXTRA_MAX_RESULTS}; + for (final String key: supportedExtras) { + final Object extra1 = 
ie1.get(key); + final Object extra2 = ie2.get(key); + //return false if they are not equal or one (but not both) is null + if (extra1 != null) { + if (!extra1.equals(extra2)) { + return false; + } + } else if (extra2 != null) { + return false; + } + } + return true; + //TODO support Intent Extras if possible with vosk // EXTRA_LANGUAGE / EXTRA_LANGUAGE_PREFERENCE / EXTRA_ONLY_RETURN_LANGUAGE_PREFERENCE // Further Extras which may be interesting @@ -244,9 +293,9 @@ protected void onStartListening(final Intent intent, final Callback newCallback) // EXTRA_AUDIO_SOURCE_ENCODING / EXTRA_AUDIO_SOURCE_SAMPLING_RATE // EXTRA_BIASING_STRINGS // EXTRA_ENABLE_BIASING_DEVICE_CONTEXT - } + @Override protected void onCancel(final Callback newCallback) { Log.d(TAG, "onCancel"); @@ -265,7 +314,7 @@ protected void onStopListening(final Callback newCallback) { - private void load() { + private void initialize() { if (speechService == null && !currentlyInitializingRecognizer) { if (new File(getModelDirectory(), "ivector").exists()) { // one directory is in the correct place, so everything should be ok @@ -273,7 +322,7 @@ private void load() { currentlyInitializingRecognizer = true; - disposables.add(Completable.fromAction(this::initializeRecognizer) + disposables.add(Completable.fromAction(this::loadModel) .subscribeOn(Schedulers.io()) .observeOn(AndroidSchedulers.mainThread()) .subscribe(() -> { @@ -284,25 +333,19 @@ private void load() { } }, throwable -> { currentlyInitializingRecognizer = false; - if ("Failed to initialize recorder. Microphone might be already in use."
- .equals(throwable.getMessage())) { - callback.error(SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS); - } else { - Log.e(TAG, "load()->initializeRecognizer", throwable); - callback.error(SpeechRecognizer.ERROR_SERVER); - } + showErrorNotification(throwable); })); } else { - try { + if (callback != null) { if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) { - callback.error(SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE); + callbackErrorReport(SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE); } else { - callback.error(SpeechRecognizer.ERROR_SERVER); + callbackErrorReport(SpeechRecognizer.ERROR_SERVER); } - } catch (final RemoteException e) { - logRemoteException(e); } + showErrorNotification( + new Throwable(getString(R.string.vosk_model_unsupported_language))); } } } @@ -310,27 +353,45 @@ public synchronized void tryToGetInput() { if (currentlyInitializingRecognizer) { startListeningOnLoaded = true; return; + } else if (model == null) { + Log.w(TAG, "tryToGetInput model==null"); + initialize(); //try to load anew + startListeningOnLoaded = true; + return; // recognizer not ready + } else if (getModelDirectory().lastModified() != modelDownloadDate) { + //if model has changed / updated / etc... + Log.i(TAG, "model directory modified date changed - load it anew"); + Log.d(TAG, "model last modified " + getModelDirectory().lastModified()); + Log.d(TAG, "model_download_date " + modelDownloadDate); + model = null; //reset + shutdownSpeechService(); + initialize(); //load new one + startListeningOnLoaded = true; + return; // recognizer not ready } else if (speechService == null) { try { - callback.error(SpeechRecognizer.ERROR_SERVER); - } catch (final RemoteException e) { - logRemoteException(e); + loadSpeechService(); + } catch (final IOException e) { + if ("Failed to initialize recorder. Microphone might be already in use." 
+ .equals(e.getMessage())) { + callbackErrorReport(SpeechRecognizer.ERROR_AUDIO); + } else { + Log.e(TAG, "load()->initializeRecognizer", e); + showErrorNotification(e); + callbackErrorReport(SpeechRecognizer.ERROR_SERVER); + } + return; } - return; // recognizer not ready } //(only one client can be connected via system to speech recognizer (otherwise // ERROR_BUSY seems to be reported) - check whether currently listening checks are // necessary at all) - on the other hand they do not harm if (currentlyListening) { - try { - if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) { - callback.error(SpeechRecognizer.ERROR_TOO_MANY_REQUESTS); - } else { - //more generic - callback.error(SpeechRecognizer.ERROR_SERVER); - } - } catch (final RemoteException e) { - logRemoteException(e); + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) { + callbackErrorReport(SpeechRecognizer.ERROR_TOO_MANY_REQUESTS); + } else { + //more generic + callbackErrorReport(SpeechRecognizer.ERROR_SERVER); } return; } @@ -349,26 +410,81 @@ public synchronized void tryToGetInput() { private void logRemoteException(final RemoteException e) { Log.e(TAG, "Remote exception on callback information", e); + showErrorNotification(e); + } + + /** + * wrapper for + * calling {@link RecognitionService.Callback#error(int)} and catches the remote exception + * @param errorType see {@link RecognitionService.Callback#error(int)} + */ + protected void callbackErrorReport(final int errorType) { + try { + callback.error(errorType); + } catch (final RemoteException e) { + logRemoteException(e); + } catch (final NullPointerException e) { + showErrorNotification(e); + } } private File getModelDirectory() { return new File(this.getFilesDir(), MODEL_PATH); } + protected void showErrorNotification(final Throwable t) { + final ErrorInfo ei = new ErrorInfo(t, UserAction.STT_SERVICE_SPEECH_TO_TEXT); + ErrorUtils.createNotification(this, ei); + } + //////////////////// // Vosk Initialization // //////////////////// - 
private synchronized void initializeRecognizer() throws IOException { - Log.d(TAG, "initializing recognizer"); + /** + * load the vosk model. Most time-consuming step of recognizer initialization + */ + private synchronized void loadModel() { + Log.d(TAG, "load Model"); + final long t0 = System.currentTimeMillis(); + model = new Model(getModelDirectory().getAbsolutePath()); + modelDownloadDate = getModelDirectory().lastModified(); + final long t1 = (System.currentTimeMillis() - t0); + Log.i(TAG, "Loading Model takes " + t1 + " ms"); + } - LibVosk.setLogLevel(BuildConfig.DEBUG ? LogLevel.DEBUG : LogLevel.WARNINGS); - final Model model = new Model(getModelDirectory().getAbsolutePath()); + /** + * load the recognizer. Call this if an intent with new parameters (compared to the last one) is + * received + */ + private void loadSpeechService() throws IOException { + if (speechService != null) { + //first shutdown the old one, if a new one is requested + shutdownSpeechService(); + } + + final long t0 = System.currentTimeMillis(); final Recognizer recognizer = new Recognizer(model, SAMPLE_RATE); - recognizer.setMaxAlternatives( - lastRequestedIntent.getIntExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 5)); + if (lastRequestedIntent != null) { + recognizer.setMaxAlternatives( + lastRequestedIntent.getIntExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 5)); + + } this.speechService = new SpeechService(recognizer, SAMPLE_RATE); + Log.i(TAG, "Loading SpeechService takes " + (System.currentTimeMillis() - t0) + " ms"); + } + + /** + * only shut down speech service + * this still keeps the language model in cache for faster start of speech service + */ + protected void shutdownSpeechService() { + if (speechService != null) { + stopRecognizer(); + speechService.shutdown(); + speechService = null; + } } /** From 6cb85c58b3e8378f174446ce0566f957ea6fda2d Mon Sep 17 00:00:00 2001 From: kridneb <109475719+nebkrid@users.noreply.github.com> Date: Tue, 7 Feb 2023 20:42:29 +0100 Subject: [PATCH
3/5] - Added Sound Preference (choosable per package, for data privacy security in order to notify user that speech input is started from background). --- .../stt_service/MakeSoundPreference.java | 94 +++++++++++++++++++ .../dicio/input/stt_service/SttService.java | 47 +++++++--- app/src/main/res/values/strings.xml | 10 +- app/src/main/res/values/strings_keys.xml | 5 + app/src/main/res/xml/pref_io.xml | 35 +++++-- 5 files changed, 168 insertions(+), 23 deletions(-) create mode 100644 app/src/main/java/org/stypox/dicio/input/stt_service/MakeSoundPreference.java diff --git a/app/src/main/java/org/stypox/dicio/input/stt_service/MakeSoundPreference.java b/app/src/main/java/org/stypox/dicio/input/stt_service/MakeSoundPreference.java new file mode 100644 index 000000000..49799a1c0 --- /dev/null +++ b/app/src/main/java/org/stypox/dicio/input/stt_service/MakeSoundPreference.java @@ -0,0 +1,94 @@ +package org.stypox.dicio.input.stt_service; + +import android.content.Context; +import android.content.SharedPreferences; +import android.preference.PreferenceManager; +import android.util.AttributeSet; + +import org.stypox.dicio.R; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import androidx.annotation.NonNull; +import androidx.annotation.Nullable; +import androidx.preference.MultiSelectListPreference; + +/** + * a MultiSelectListPreference which uses R.string.pref_key_stt_onbegin_nosound_entries as entries + * and entry values + */ +public class MakeSoundPreference extends MultiSelectListPreference { + final SharedPreferences preferences; + final String helperPrefKey; + final String[] ownPackageName = new String[1]; + + public MakeSoundPreference(@NonNull final Context context, @Nullable final AttributeSet attrs, + final int defStyleAttr, final int defStyleRes) { + super(context, attrs, defStyleAttr, defStyleRes); + preferences = PreferenceManager.getDefaultSharedPreferences(context); + helperPrefKey = 
context.getString(R.string.pref_key_stt_onlisten_sound_entries); + ownPackageName[0] = context.getPackageName(); + } + + public MakeSoundPreference(@NonNull final Context context, @Nullable final AttributeSet attrs, + final int defStyleAttr) { + super(context, attrs, defStyleAttr); + preferences = PreferenceManager.getDefaultSharedPreferences(context); + helperPrefKey = context.getString(R.string.pref_key_stt_onlisten_sound_entries); + ownPackageName[0] = context.getPackageName(); + } + + public MakeSoundPreference(@NonNull final Context context, @Nullable final AttributeSet attrs) { + super(context, attrs); + preferences = PreferenceManager.getDefaultSharedPreferences(context); + helperPrefKey = context.getString(R.string.pref_key_stt_onlisten_sound_entries); + ownPackageName[0] = context.getPackageName(); + } + + public MakeSoundPreference(@NonNull final Context context) { + super(context); + preferences = PreferenceManager.getDefaultSharedPreferences(context); + helperPrefKey = context.getString(R.string.pref_key_stt_onlisten_sound_entries); + ownPackageName[0] = context.getPackageName(); + } + + @Override + public CharSequence[] getEntries() { + final Set entries = preferences.getStringSet(helperPrefKey, + new HashSet<>(Arrays.asList(ownPackageName))); + final String[] back = new String[entries.size()]; + int i = 0; + for (final String e: entries) { + back[i] = e; + i++; + } + return back; + } + + @Override + public CharSequence[] getEntryValues() { + return getEntries(); + } + // +// protected void runtimePopulateEntries(Context context){ +// final SharedPreferences settings = PreferenceManager.getDefaultSharedPreferences(context); +// settings.getStringSet() +// final List entries = new ArrayList<>(Arrays.asList(getEntries())); +// final List entriesValues = new ArrayList<>(Arrays.asList(getEntries())); +// setEntries(entries.toArray(new CharSequence[]{})); +// setEntryValues(entriesValues.toArray(new CharSequence[]{})); +// } +// +// public void 
addEntry(CharSequence newEntry) { +// final Set entries = new HashSet<>(Arrays.asList(getEntries())); +// entries.add(newEntry); +// setEntries(entries.toArray(new CharSequence[]{})); +// } +// public void addEntryValue(CharSequence newEntry) { +// final List entryValues = new ArrayList<>(Arrays.asList(getEntries())); +// entryValues.add(newEntry); +// setEntryValues(entryValues.toArray(new CharSequence[]{})); +// } +} diff --git a/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java b/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java index b660171b0..4936e720c 100644 --- a/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java +++ b/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java @@ -1,6 +1,10 @@ package org.stypox.dicio.input.stt_service; import android.content.Intent; +import android.content.SharedPreferences; +import android.media.Ringtone; +import android.media.RingtoneManager; +import android.net.Uri; import android.os.Build; import android.os.Bundle; import android.os.RemoteException; @@ -8,7 +12,6 @@ import android.speech.RecognizerIntent; import android.speech.SpeechRecognizer; import android.util.Log; -import android.widget.Toast; import org.json.JSONArray; import org.json.JSONException; @@ -28,8 +31,11 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; import androidx.annotation.Nullable; +import androidx.preference.PreferenceManager; import io.reactivex.rxjava3.android.schedulers.AndroidSchedulers; import io.reactivex.rxjava3.core.Completable; import io.reactivex.rxjava3.disposables.CompositeDisposable; @@ -208,7 +214,7 @@ protected void onStartListening(final Intent intent, final Callback newCallback) Log.d(TAG, "onStartListening"); Log.d(TAG, "onStartCommand called is " + onStartCommandCalled); this.callback = newCallback; - //TODO check permission. 
Actually it seems this is already done by the system interface + //Regarding check permission: Actually it seems this is already done by the system interface // (reports SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS) , but it is // explicitly recommended in the SpeechRecognizer documentation. However the way it is in // the docs does not work here due to API Level for requested calls (and since Audio @@ -216,16 +222,33 @@ protected void onStartListening(final Intent intent, final Callback newCallback) // https://developer.android.com/reference/android/speech/RecognitionService // However even if there is a way for app without permission, not a security issue since // stt service notifies user when speech input is started - if (android.os.Build.VERSION.SDK_INT >= android.os.Build.VERSION_CODES.M) { + final SharedPreferences preferences = PreferenceManager.getDefaultSharedPreferences(this); + final boolean makeSound = preferences.getBoolean( + getString(R.string.pref_key_stt_onlisten_sound), true); + if (makeSound && android.os.Build.VERSION.SDK_INT >= android.os.Build.VERSION_CODES.M) { final String callingPackageName = getPackageManager().getPackagesForUid( newCallback.getCallingUid())[0]; -//Not working this way - check fails even for dicio -// int permissionState = PermissionChecker.checkCallingPermission(this, -// "android.permission.RECORD_AUDIO", callingPackageName); -// if (permissionState != PermissionChecker.PERMISSION_GRANTED){ -// callbackErrorReport(SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS); -// return; -// } + final Set exceptedPackages = preferences.getStringSet( + getString(R.string.pref_key_stt_sound_onlisten), new HashSet<>()); + if (exceptedPackages.contains(callingPackageName)) { + Log.i(TAG, "Suppressed stt onbegin sound for package " + callingPackageName); + } else { + final Uri notification = RingtoneManager.getDefaultUri( + RingtoneManager.TYPE_NOTIFICATION); + final Ringtone r = RingtoneManager.getRingtone(this, notification); + 
r.play(); + final Set knownPackages = preferences.getStringSet( + getString(R.string.pref_key_stt_onlisten_sound_entries), new HashSet<>()); + if (!knownPackages.contains(callingPackageName)) { + //add to preference entries to offer to user whether it shall be excepted + final HashSet extendedKnownPackages = new HashSet<>(knownPackages); + extendedKnownPackages.add(callingPackageName); + preferences.edit().putStringSet( + getString(R.string.pref_key_stt_onlisten_sound_entries), + extendedKnownPackages) + .apply(); + } + } } if (speechService != null && !recogIntentExtrasEquals(lastRequestedIntent, intent)) { shutdownSpeechService(); @@ -250,10 +273,6 @@ protected void onStartListening(final Intent intent, final Callback newCallback) } lastRequestedIntent = intent; - //TODO remove toast or make different type of speech recognition hint or a preference option - // to disable - Toast.makeText(this, this.getString(R.string.pref_input_method_vosk), - Toast.LENGTH_SHORT).show(); tryToGetInput(); } diff --git a/app/src/main/res/values/strings.xml b/app/src/main/res/values/strings.xml index 2f35808f8..2557f4295 100644 --- a/app/src/main/res/values/strings.xml +++ b/app/src/main/res/values/strings.xml @@ -69,9 +69,17 @@ DuckDuckGo Default city Set the city to use for weather when you do not explicitly say one. The current behaviour is to get the location from IP info. - Directly send result of speech to text service + Directly send result (for STT with dicio UI) Automatically send speech result to requesting app when listening finishes Wait for manual confirmation before sending speech result to requesting app + Sound + Play a sound when speech input starts + Sound on speech input start is disabled + Exceptions from sound notification + Disable sound per known application + Choose which applications can request speech input without sound notifications. Requires at least Android 6 (Marshmallow). 
+ + The skill \"%1$s\" needs these permissions to work: %2$s Could not evaluate your request Network error diff --git a/app/src/main/res/values/strings_keys.xml b/app/src/main/res/values/strings_keys.xml index 03bb7adfd..b18008f2c 100644 --- a/app/src/main/res/values/strings_keys.xml +++ b/app/src/main/res/values/strings_keys.xml @@ -26,4 +26,9 @@ weather_default_city stt_auto_finish + + pref_key_stt_onlisten_sound + pref_key_stt_sound_onlisten + pref_key_stt_onlisten_sound_entries + \ No newline at end of file diff --git a/app/src/main/res/xml/pref_io.xml b/app/src/main/res/xml/pref_io.xml index 66ea4efc0..55a32408e 100644 --- a/app/src/main/res/xml/pref_io.xml +++ b/app/src/main/res/xml/pref_io.xml @@ -27,13 +27,32 @@ android:key="@string/pref_key_speech_output_method" android:summary="@string/pref_speech_output_method_summary" android:title="@string/pref_speech_output_method" /> + + + - + + \ No newline at end of file From a730a870ef9cdb12b08d6e7aac61d75f8371a863 Mon Sep 17 00:00:00 2001 From: kridneb <109475719+nebkrid@users.noreply.github.com> Date: Thu, 9 Feb 2023 22:25:38 +0100 Subject: [PATCH 4/5] Renamed SpeechRecogServiceInputDevice --- app/src/main/java/org/stypox/dicio/MainActivity.java | 4 ++-- ...ceInputDevice.java => AndroidSttServiceInputDevice.java} | 6 +++--- .../main/java/org/stypox/dicio/input/VoskInputDevice.java | 2 +- .../java/org/stypox/dicio/input/stt_service/SttService.java | 3 +++ 4 files changed, 9 insertions(+), 6 deletions(-) rename app/src/main/java/org/stypox/dicio/input/{SpeechRecogServiceInputDevice.java => AndroidSttServiceInputDevice.java} (97%) diff --git a/app/src/main/java/org/stypox/dicio/MainActivity.java b/app/src/main/java/org/stypox/dicio/MainActivity.java index 90bc56c4d..1f1b40b38 100644 --- a/app/src/main/java/org/stypox/dicio/MainActivity.java +++ b/app/src/main/java/org/stypox/dicio/MainActivity.java @@ -19,7 +19,7 @@ import org.stypox.dicio.eval.SkillRanker; import org.stypox.dicio.input.InputDevice; import 
org.stypox.dicio.input.SpeechInputDevice; -import org.stypox.dicio.input.SpeechRecogServiceInputDevice; +import org.stypox.dicio.input.AndroidSttServiceInputDevice; import org.stypox.dicio.input.ToolbarInputDevice; import org.stypox.dicio.input.VoskInputDevice; import org.stypox.dicio.input.stt_service.SttServiceActivity; @@ -309,7 +309,7 @@ private InputDevice buildPrimaryInputDevice() { //TODO make a hint/data privacy warning etc. in preference when this one is chosen that // the speech dicio records is given to a third party app according to system // settings - return new SpeechRecogServiceInputDevice(this); + return new AndroidSttServiceInputDevice(this); } else { // default return new VoskInputDevice(this); } diff --git a/app/src/main/java/org/stypox/dicio/input/SpeechRecogServiceInputDevice.java b/app/src/main/java/org/stypox/dicio/input/AndroidSttServiceInputDevice.java similarity index 97% rename from app/src/main/java/org/stypox/dicio/input/SpeechRecogServiceInputDevice.java rename to app/src/main/java/org/stypox/dicio/input/AndroidSttServiceInputDevice.java index 13dbd7742..8a15a55b1 100644 --- a/app/src/main/java/org/stypox/dicio/input/SpeechRecogServiceInputDevice.java +++ b/app/src/main/java/org/stypox/dicio/input/AndroidSttServiceInputDevice.java @@ -17,10 +17,10 @@ import static org.stypox.dicio.util.StringUtils.isNullOrEmpty; -public class SpeechRecogServiceInputDevice extends SpeechInputDevice +public class AndroidSttServiceInputDevice extends SpeechInputDevice implements android.speech.RecognitionListener { - public static final String TAG = SpeechRecogServiceInputDevice.class.getSimpleName(); + public static final String TAG = AndroidSttServiceInputDevice.class.getSimpleName(); private Activity activity; private boolean startListeningOnLoaded = false; @@ -33,7 +33,7 @@ public class SpeechRecogServiceInputDevice extends SpeechInputDevice // Exposed methods // ///////////////////// - public SpeechRecogServiceInputDevice(final Activity activity) { 
+ public AndroidSttServiceInputDevice(final Activity activity) { this.activity = activity; } diff --git a/app/src/main/java/org/stypox/dicio/input/VoskInputDevice.java b/app/src/main/java/org/stypox/dicio/input/VoskInputDevice.java index 0c9089694..877a1648a 100644 --- a/app/src/main/java/org/stypox/dicio/input/VoskInputDevice.java +++ b/app/src/main/java/org/stypox/dicio/input/VoskInputDevice.java @@ -38,7 +38,7 @@ import static org.stypox.dicio.util.LocaleUtils.UnsupportedLocaleException; import static org.stypox.dicio.util.LocaleUtils.resolveSupportedLocale; -public class VoskInputDevice extends SpeechRecogServiceInputDevice { +public class VoskInputDevice extends AndroidSttServiceInputDevice { public static final String TAG = VoskInputDevice.class.getSimpleName(); public static final String MODEL_PATH = "/vosk-model"; diff --git a/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java b/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java index 4936e720c..60a65f6b3 100644 --- a/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java +++ b/app/src/main/java/org/stypox/dicio/input/stt_service/SttService.java @@ -512,6 +512,9 @@ protected void shutdownSpeechService() { private void stopRecognizer() { if (speechService != null) { speechService.stop(); //does nothing if recognition is not active. 
+//TODO test whether some devices need shutdown call everytime in order to / conflict with performens if yes +// speechService.shutdown(); +// speechService = null; } else if (currentlyListening) { //(actually currentlyListening should never be true at this point-however does not harm) //means SpeechRecognizer.startListening was called, but endOfSpeech not yet From 9e46d32b92065195a4677e663ed3619715951e0c Mon Sep 17 00:00:00 2001 From: kridneb <109475719+nebkrid@users.noreply.github.com> Date: Fri, 10 Feb 2023 23:49:10 +0100 Subject: [PATCH 5/5] Updated README.md for clarify different ways of STT service in android --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ec207b1a9..3acb4ceb3 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,10 @@ Currently Dicio answers questions about: ## Speech to text Dicio uses [Vosk](https://github.com/alphacep/vosk-api/) as its speech to text (`STT`) engine. In order to be able to run on every phone small models are employed, weighing `~50MB`. The download from [here](https://alphacephei.com/vosk/models) starts automatically whenever needed, so the app language can be changed seamlessly. +Dicio exports Vosk as a speech-to-text service to the Android system. Other apps can query it in different ways: +- [Via an intent](https://developer.android.com/reference/android/speech/RecognizerIntent), which brings up a Dicio UI for speech input. The result is then provided to the requesting app (automatically or after the user has confirmed, as set in the Dicio settings).
+- [From the background](https://developer.android.com/reference/android/speech/SpeechRecognizer), if the requesting app has the record audio permission and Dicio is set as speech input within settings -> apps -> default apps -> assistant (the exact path may vary depending on the Android version) +- If you want to use it as a "speech keyboard" (IME), you currently still need an app which uses the Android speech-to-text service and provides an IME (e.g. [this one](https://github.com/Kaljurand/K6nele)) ## Contributing @@ -57,7 +61,6 @@ When contributing keep in mind that other people may have **needs** and **views If you want to translate Dicio to a new language you have to follow these **steps**: