diff --git a/CMakeLists.txt b/CMakeLists.txt index ed88e69..c772c03 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,13 @@ _install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/tevr-asr-data/" DESTINATION "/us set(CPACK_GENERATOR "DEB") set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Hajo Nils Krabbenhöft") set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS OFF) -set(CPACK_PACKAGE_DESCRIPTION "German speech recognition with wav2vec 2.0 XLS-R 1B enhanced with TEVR tokens" ) +# Define the version once; the full version string and the package file name are derived from these components. +set(CPACK_PACKAGE_VERSION_MAJOR 1) +set(CPACK_PACKAGE_VERSION_MINOR 0) +set(CPACK_PACKAGE_VERSION_PATCH 0) +set(CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") +# README.md names the resulting .deb file explicitly; update it whenever the version changes. +set(CPACK_PACKAGE_FILE_NAME "tevr_asr_tool-${CPACK_PACKAGE_VERSION}-Linux-x86_64") +set(CPACK_PACKAGE_DESCRIPTION "German speech recognition with TEVR as a command-line tool" ) set(CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.27), libgcc1 (>= 1:4.7), libstdc++6 (>= 6)" ) include(CPack) diff --git a/README.md b/README.md new file mode 100644 index 0000000..de87c78 --- /dev/null +++ b/README.md @@ -0,0 +1,121 @@ +# TEVR ASR Tool + +* state-of-the-art performance +* no GPU needed +* 100% offline +* 100% private +* 100% free +* MIT license +* Linux x86_64 +* command-line tool +* easy to understand + * only 284 lines of C++ code + * AI model on HuggingFace + +In August 2022, we ranked +**#1 on "Speech Recognition on Common Voice German (using extra training data)"**. 
+Accordingly, the performance of this tool is considered to be +the best of what's currently possible +in German speech recognition: +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/tevr-improving-speech-recognition-by-token/speech-recognition-on-common-voice-german)](https://paperswithcode.com/sota/speech-recognition-on-common-voice-german?p=tevr-improving-speech-recognition-by-token) + +## Install the Debian/Ubuntu package +Download `tevr_asr_tool-1.0.0-Linux-x86_64.deb` from GitHub: +```bash +wget "URL_HERE" +``` +Install it: +```bash +sudo dpkg -i tevr_asr_tool-1.0.0-Linux-x86_64.deb +``` + +## Install from Source Code +Download submodules: +```bash +git submodule update --init +``` +CMake configure and build: +```bash +cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DCPACK_CMAKE_GENERATOR=Ninja -S . -B build +cmake --build build --target tevr_asr_tool -j 16 +``` +Create the Debian package: +```bash +(cd build && cpack -G DEB) +``` +Install it: +```bash +sudo dpkg -i build/tevr_asr_tool-1.0.0-Linux-x86_64.deb +``` + +## Usage + +```bash +tevr_asr_tool --target_file=test_audio.wav 2>log.txt +``` +should display the correct transcription +` mückenstiche sollte man nicht aufkratzen `. +`log.txt` will contain the diagnostics and progress messages +that were logged to stderr during execution. + +## GPU Acceleration for Developers + +I plan to release Vulkan & OpenGL-accelerated +real-time low-latency transcription +software for developers soon. +It'll run 100% private + 100% offline +just like this tool, +but instead of processing a WAV file on CPU +it'll stream the real-time GPU transcription +of your microphone input +through a WebRTC-capable REST API +so that you can easily integrate it +with your own voice-controlled projects. +For example, that'll enable +hackable voice typing +together with `pynput.keyboard`. 
+ +If you want to get notified when it launches, +please enter your email at +https://madmimi.com/signups/f0da3b13840d40ce9e061cafea6280d5/join + +## Commercial / GPU Acceleration + +If you have a commercial use-case for this or similar +technology - ideally something that helps +small and medium-sized businesses in northern Germany +become more competitive - +then please contact me at moin@DeutscheKI.de + + +## Research Citation + +If you use this for research, please cite: +```bibtex +@misc{https://doi.org/10.48550/arxiv.2206.12693, + doi = {10.48550/ARXIV.2206.12693}, + url = {https://arxiv.org/abs/2206.12693}, + author = {Krabbenhöft, Hajo Nils and Barth, Erhardt}, + keywords = {Computation and Language (cs.CL), Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering, F.2.1; I.2.6; I.2.7}, + title = {TEVR: Improving Speech Recognition by Token Entropy Variance Reduction}, + publisher = {arXiv}, + year = {2022}, + copyright = {Creative Commons Attribution 4.0 International} +} +``` + +## Replace the AI Model + +The German AI model and my training scripts can be found on HuggingFace: +https://huggingface.co/fxtentacle/wav2vec2-xls-r-1b-tevr + +The model has undergone XLS-R cross-language pre-training. +You can directly fine-tune it with a different +language dataset - for example CommonVoice English - +and then re-export the files in the +`tevr-asr-data` folder. + +Alternatively, you can donate roughly 2 weeks of +A100 GPU credits to me +and I'll train a suitable recognition model +and upload it to HuggingFace.