Add support for Coqui STT and .tflite models
- By default, Coqui will be used for inference, with an option to switch to DeepSpeech
- Coqui supports .tflite models out of the box, whereas DeepSpeech needs a different package. Refer to #41
- English models will be automatically downloaded if run without the model argument
- Updated README and requirements.txt to reflect changes
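
Both packages expose a near-identical `Model` API (the same constructor and scorer hooks, as the `utils.py` diff below shows), which is what makes the engine switch cheap. A minimal sketch of the idea, not the commit's exact code; file paths are hypothetical:

```python
# Hedged sketch of the engine switch; both packages expose a
# compatible Model API, so selection reduces to an import choice.
from stt import Model as SModel          # Coqui STT: loads .tflite out of the box
from deepspeech import Model as DModel   # DeepSpeech: .pbmm (.tflite needs another package)

engine = "stt"                           # the new default; "ds" selects DeepSpeech
model = SModel("model.tflite") if engine == "stt" else DModel("model.pbmm")
```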
abhirooptalasila committed Feb 2, 2022
1 parent af8f0a1 commit 40bb833
Showing 9 changed files with 86 additions and 49 deletions.
15 changes: 9 additions & 6 deletions README.md
@@ -12,7 +12,7 @@

## About

AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt transcript) for any video file using [Mozilla DeepSpeech](https://github.com/mozilla/DeepSpeech). I use the DeepSpeech Python API to run inference on audio segments and [pyAudioAnalysis](https://github.com/tyiannak/pyAudioAnalysis) to split the initial audio on silent segments, producing multiple small files.
AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt transcript) for any video file using either [Mozilla DeepSpeech](https://github.com/mozilla/DeepSpeech) or [Coqui STT](https://github.com/coqui-ai/STT). I use their open-source models to run inference on audio segments and [pyAudioAnalysis](https://github.com/tyiannak/pyAudioAnalysis) to split the initial audio on silent segments, producing multiple smaller files (which makes inference easier).

⭐ Featured in [DeepSpeech Examples](https://github.com/mozilla/DeepSpeech-examples) by Mozilla

@@ -35,15 +35,16 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr
OR
$ pip3 install -r requirements-gpu.txt
```
* Use `getmodels.sh` to download the model and scorer files with the version number as argument
```bash
$ ./getmodels.sh 0.9.3
```
* Install FFMPEG. If you're on Ubuntu, this should work fine
```bash
$ sudo apt-get install ffmpeg
$ ffmpeg -version # I'm running 4.1.4
```
* By default, if no model files are found in the root directory, the script will download the v0.9.3 models for DeepSpeech, or the TFLITE model and huge-vocabulary scorer for Coqui (a sketch of this automatic download follows the list). Use `getmodels.sh` to download the DeepSpeech model and scorer files, with the version number as argument. For Coqui, download from [here](https://coqui.ai/models)
```bash
$ ./getmodels.sh 0.9.3
```
* For .tflite models with DeepSpeech, follow [this](https://github.com/abhirooptalasila/AutoSub/issues/41#issuecomment-968847604)
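
For reference, the automatic download amounts to a couple of `wget` calls. A minimal sketch, assuming `wget` is installed; the URLs mirror the `_models` defaults this commit adds to `autosub/utils.py`:

```python
import subprocess

# Sketch of the automatic Coqui download (English model + scorer).
# URLs are the defaults from the _models table in autosub/utils.py.
COQUI_MODEL = ("https://github.com/coqui-ai/STT-models/releases/download/"
               "english/coqui/v0.9.3/model.tflite")
COQUI_SCORER = ("https://github.com/coqui-ai/STT-models/releases/download/"
                "english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer")

for url in (COQUI_MODEL, COQUI_SCORER):
    subprocess.run(["wget", url, "-q", "--show-progress"], check=True)
```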


## Docker
@@ -72,7 +73,9 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr

## How-to example

* The model files should be in the repo root directory and will be loaded automatically. But in case you have multiple versions, use the `--model` and `--scorer` args while executing
* The model files should be in the repo root directory and will be loaded/downloaded automatically. In case you have multiple versions, use the `--model` and `--scorer` args while executing
* By default, Coqui is used for inference. You can change this by using the `--engine` argument with value `"ds"` for DeepSpeech
* For languages other than English, you'll need to manually download the model and scorer files. Check [here](https://discourse.mozilla.org/t/links-to-pretrained-models/62688) for DeepSpeech and [here](https://coqui.ai/models) for Coqui.
* After following the installation instructions, you can run `autosub/main.py` as given below. The `--file` argument is the video file for which subtitles are to be generated
```bash
$ python3 autosub/main.py --file ~/movie.mp4
```
23 changes: 14 additions & 9 deletions autosub/audioProcessing.py
@@ -2,19 +2,26 @@
# -*- coding: utf-8 -*-

import sys
import shlex
import logger
import subprocess
import numpy as np
from os.path import basename

try:
from shlex import quote
except ImportError:
from pipes import quote

_logger = logger.setup_applevel_logger(__name__)


def extract_audio(input_file, audio_file_name):
"""Extract audio from input video file and save to audio/ in root dir
Args:
input_file: input video file
audio_file_name: save audio WAV file with same filename as video file
input_file : input video file
audio_file_name : save audio WAV file with same filename as video file
"""

try:
@@ -32,11 +39,11 @@ def convert_samplerate(audio_path, desired_sample_rate):
***WON'T be called as extract_audio() converts the audio to 16kHz while saving***
Args:
audio_path: audio file path
desired_sample_rate: DeepSpeech expects 16kHz
audio_path : audio file path
desired_sample_rate : DeepSpeech expects 16kHz
Returns:
numpy buffer: audio signal stored in numpy array
numpy buffer : audio signal stored in numpy array
"""

sox_cmd = "sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer \
@@ -46,9 +53,7 @@
output = subprocess.check_output(
shlex.split(sox_cmd), stderr=subprocess.PIPE)
except subprocess.CalledProcessError as e:
raise RuntimeError("SoX returned non-zero status: {}".format(e.stderr))
raise RuntimeError(f"SoX returned non-zero status: {e.stderr}")
except OSError as e:
raise OSError(e.errno, "SoX not found, use {}hz files or install it: {}".format(
desired_sample_rate, e.strerror))

raise OSError(e.errno, f"SoX not found, use {desired_sample_rate}hz files or install it: {e.strerror}")
return np.frombuffer(output, np.int16)
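
The final `np.frombuffer` call above just reinterprets SoX's raw output bytes as 16-bit samples. A self-contained sketch with illustrative byte values:

```python
import numpy as np

# SoX is asked for raw signed 16-bit mono output, so the byte stream
# maps directly onto int16 samples (assumes a little-endian host).
raw = b"\x00\x00\xff\x7f\x00\x80"         # 0, 32767, -32768 as int16
samples = np.frombuffer(raw, np.int16)
print(samples)                            # [     0  32767 -32768]
```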
8 changes: 0 additions & 8 deletions autosub/featureExtraction.py
@@ -43,7 +43,6 @@ def energy_entropy(frame, n_short_blocks=10):

# Compute entropy of the normalized sub-frame energies:
entropy = -np.sum(s * np.log2(s + eps))

return entropy


@@ -71,7 +70,6 @@ def spectral_centroid_spread(fft_magnitude, sampling_rate):
# Normalize:
centroid = centroid / (sampling_rate / 2.0)
spread = spread / (sampling_rate / 2.0)

return centroid, spread


@@ -98,7 +96,6 @@ def spectral_entropy(signal, n_short_blocks=10):

# compute spectral entropy
entropy = -np.sum(s * np.log2(s + eps))

return entropy


@@ -116,7 +113,6 @@ def spectral_flux(fft_magnitude, previous_fft_magnitude):
sp_flux = np.sum(
(fft_magnitude / fft_sum - previous_fft_magnitude /
previous_fft_sum) ** 2)

return sp_flux


@@ -135,7 +131,6 @@ def spectral_rolloff(signal, c):
sp_rolloff = np.float64(a[0]) / (float(fft_length))
else:
sp_rolloff = 0.0

return sp_rolloff


@@ -220,7 +215,6 @@ def chroma_features_init(num_fft, sampling_rate):
for u in unique_chroma:
idx = np.nonzero(num_chroma == u)
num_freqs_per_chroma[idx] = idx[0].shape

return num_chroma, num_freqs_per_chroma


@@ -262,7 +256,6 @@ def chroma_features(signal, sampling_rate, num_fft):
# ax.set_yticklabels(xaxis)
# plt.show(block=False)
# plt.draw()

return chroma_names, final_matrix


@@ -411,5 +404,4 @@ def feature_extraction(signal, sampling_rate, window, step, deltas=True):
fft_magnitude_previous = fft_magnitude.copy()

features = np.concatenate(features, 1)

return features, feature_names
13 changes: 8 additions & 5 deletions autosub/main.py
@@ -21,6 +21,7 @@
# Line count for SRT file
line_count = 1


def ds_process_audio(ds, audio_file, output_file_handle_dict, split_duration):
"""sttWithMetadata() will run DeepSpeech inference on each audio file
generated after remove_silent_segments. These files contain start and end
@@ -80,17 +81,19 @@ def ds_process_audio(ds, audio_file, output_file_handle_dict, split_duration):
def main():
global line_count
supported_output_formats = ["srt", "vtt", "txt"]
supported_engines = ["ds", "stt"]

parser = argparse.ArgumentParser(description="AutoSub")
parser.add_argument("--format", choices=supported_output_formats, nargs="+",
help="Create only certain output formats rather than all formats",
default=supported_output_formats)
parser.add_argument("--split-duration", dest="split_duration", type=float,
help="Split run-on sentences exceededing this duration (in seconds) into multiple subtitles",
default=5)
parser.add_argument("--split-duration", dest="split_duration", type=float, default=5,
help="Split run-on sentences exceededing this duration (in seconds) into multiple subtitles")
parser.add_argument("--dry-run", dest="dry_run", action="store_true",
help="Perform dry-run to verify options prior to running. Also useful to instantiate \
cuda/tensorflow cache prior to running multiple times")
parser.add_argument("--engine", choices=supported_engines, nargs="?", default="stt",
help="Select either DeepSpeech or Coqui STT for inference. Latter is default")
parser.add_argument("--file", required=False, help="Input video file")
parser.add_argument("--model", required=False, help="Input *.pbmm model file")
parser.add_argument("--scorer", required=False, help="Input *.scorer file")
@@ -104,7 +107,7 @@ def main():
ds_scorer = get_model(args, "scorer")

if args.dry_run:
create_model(ds_model, ds_scorer)
create_model(args.engine, ds_model, ds_scorer)
if args.file is not None:
if not os.path.isfile(args.file):
_logger.warn(f"Invalid file: {args.file}")
@@ -151,7 +154,7 @@ def main():
audiofiles.remove(os.path.basename(audio_file_name))

_logger.info("Running inference...")
ds = create_model(ds_model, ds_scorer)
ds = create_model(args.engine, ds_model, ds_scorer)

for filename in tqdm(audiofiles):
audio_segment_path = os.path.join(audio_directory, filename)
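
A self-contained sketch of how the new `--engine` flag behaves; only the two relevant arguments are reconstructed here:

```python
import argparse

# Reconstruction of just the engine flag added in this diff.
parser = argparse.ArgumentParser(description="AutoSub")
parser.add_argument("--engine", choices=["ds", "stt"], nargs="?", default="stt",
                    help="Select either DeepSpeech or Coqui STT for inference")
parser.add_argument("--file", required=False, help="Input video file")

print(parser.parse_args(["--file", "movie.mp4", "--engine", "ds"]).engine)  # ds
print(parser.parse_args(["--file", "movie.mp4"]).engine)                    # stt (default)
```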
4 changes: 0 additions & 4 deletions autosub/segmentAudio.py
@@ -42,7 +42,6 @@ def read_audio_file(input_file):

if signal.ndim == 2 and signal.shape[1] == 1:
signal = signal.flatten()

return sampling_rate, signal


@@ -58,7 +57,6 @@ def smooth_moving_avg(signal, window=11):
signal, 2 * signal[-1] - signal[-1:-window:-1]]
w = np.ones(window, 'd')
y = np.convolve(w / w.sum(), s, mode='same')

return y[window:-window + 1]


@@ -75,7 +73,6 @@ def stereo_to_mono(signal):
else:
if signal.shape[1] == 2:
signal = (signal[:, 1] / 2) + (signal[:, 0] / 2)

return signal


@@ -185,7 +182,6 @@ def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5,
if s_lim[1] - s_lim[0] > min_duration:
seg_limits_2.append(s_lim)
seg_limits = seg_limits_2

return seg_limits


2 changes: 0 additions & 2 deletions autosub/trainAudio.py
@@ -35,7 +35,6 @@ def train_svm(features, c_param, kernel='linear'):
svm = sklearn.svm.SVC(C=c_param, kernel=kernel, probability=True,
gamma='auto')
svm.fit(feature_matrix, labels)

return svm


@@ -95,5 +94,4 @@ def features_to_matrix(features):
else:
feature_matrix = np.vstack((feature_matrix, f))
labels = np.append(labels, i * np.ones((len(f), 1)))

return feature_matrix, labels
67 changes: 52 additions & 15 deletions autosub/utils.py
@@ -6,9 +6,22 @@
import sys
import shutil
import logger
from deepspeech import Model
import subprocess
from stt import Model as SModel
from deepspeech import Model as DModel

_logger = logger.setup_applevel_logger(__name__)
_models = {
"ds": {
"model": "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm",
"scorer": "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer"
},
"stt": {
"model": "https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v0.9.3/model.tflite",
"scorer": "https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer"
}
}


def sort_alphanumeric(data):
"""Sort function to sort os.listdir() alphanumerically
@@ -20,9 +33,9 @@ def sort_alphanumeric(data):

convert = lambda text: int(text) if text.isdigit() else text.lower()
alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]

return sorted(data, key=alphanum_key)


def clean_folder(folder):
"""Delete everything inside a folder
@@ -40,6 +53,26 @@ def clean_folder(folder):
except Exception as e:
_logger.warn(f"Failed to delete {file_path}. Reason: {e}")


def download_model(engine, fname):
"""Download model files, if not available locally
Args:
engine : "ds" for DeepSpeech and "stt" for Coqui STT
fname : either of "model" or "scorer"
"""

_logger.info(f"{fname.capitalize()} not found locally. Downloading")
try:
_file = _models[engine][fname]
command = ["wget", _file, "-q", "--show-progress"]
ret = subprocess.run(command).returncode
except Exception as e:
_logger.error(str(e))
sys.exit(1)
return _file.split("/")[-1]


def get_model(args, arg_name):
"""Will prioritze supplied arguments but if not, try to find files
@@ -48,10 +81,13 @@ def get_model(args, arg_name):
arg_name : either model or scorer file
"""

if arg_name == 'model':
arg_extension = '.pbmm'
elif arg_name == 'scorer':
arg_extension = '.scorer'
if arg_name == "model":
if args.engine == "ds":
arg_extension = ".pbmm"
else:
arg_extension = ".tflite"
elif arg_name == "scorer":
arg_extension = ".scorer"

arg = args.__getattribute__(arg_name)

@@ -65,12 +101,8 @@ def get_model(args, arg_name):
num_models = len(models)

if num_models == 0:
_logger.warn(f"No {arg_name}s specified via --{arg_name} and none found in local directory. Please run getmodel.sh to get some")
if arg_name == 'model':
_logger.error("Must specify pbmm model")
sys.exit(1)
else:
model = ''
model = download_model(args.engine, arg_name)

elif num_models != 1:
_logger.warn(f"Detected {num_models} {arg_name} files in local dir")
if arg_name == 'model':
@@ -85,16 +117,21 @@
_logger.info(f"{arg_name.capitalize()}: {model}")
return(model)

def create_model(model, scorer):

def create_model(engine, model, scorer):
"""Instantiate model and scorer
Args:
engine : "ds" for DeepSpeech and "stt" for Coqui STT
model : .pbmm model file
scorer : .scorer file
"""

try:
ds = Model(model)
if engine == "ds":
ds = DModel(model)
else:
ds = SModel(model)
except:
_logger.error("Invalid model file")
sys.exit(1)
Expand All @@ -103,4 +140,4 @@ def create_model(model, scorer):
ds.enableExternalScorer(scorer)
except:
_logger.warn("Invalid scorer file. Running inference using only model file")
return(ds)
return(ds)
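
Putting the new helpers together, a hedged end-to-end sketch (not part of the commit): `sample.wav` is a hypothetical 16 kHz mono file, and the import assumes you run from inside `autosub/`:

```python
import wave
import numpy as np
from utils import download_model, create_model   # helpers from autosub/utils.py

# Hypothetical use of the helpers above; Coqui STT is the new default.
engine = "stt"
model_file = download_model(engine, "model")     # model.tflite
scorer_file = download_model(engine, "scorer")   # huge-vocabulary.scorer

ds = create_model(engine, model_file, scorer_file)
with wave.open("sample.wav", "rb") as f:         # hypothetical 16 kHz mono WAV
    audio = np.frombuffer(f.readframes(f.getnframes()), np.int16)
print(ds.stt(audio))                             # plain-text transcript
```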
2 changes: 2 additions & 0 deletions requirements-gpu.txt
@@ -1,5 +1,7 @@
cycler==0.10.0
numpy
stt==1.0.0
tensorflow-gpu==1.15
deepspeech-gpu==0.9.3
joblib==0.16.0
kiwisolver==1.2.0
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
cycler==0.10.0
numpy
stt==1.0.0
deepspeech==0.9.3
joblib==0.16.0
kiwisolver==1.2.0
