extract transcription services to distinct modules

`Transcription` now initializes one of the available transcription services based on the user's choice: `Whisper` or `Deepgram`
bitcointranscripts · Dec 6, 2023 · 2764a7f · 2764a7f
1 parent 37c086e
commit 2764a7f
Show file tree

Hide file tree

Showing 10 changed files with 376 additions and 411 deletions.
diff --git a/Readme.md b/Readme.md
@@ -15,7 +15,15 @@ This transcription tool operates through a structured four-stage process:
 
 1. Preprocess: Gathers all the available metadata for each source (supports YouTube videos&playlists, and RSS feeds)
 2. Process: Downloads and converts sources for transcription preparation
-3. Transcription: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts from MP3 files.
+3. Transcription: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts.
+    1. Converts audio to text.
+        - Preserves raw whisper transcript in SRT
+        - Preserves raw deepgram output in JSON
+    2. Summarize: Generates a summary of the transcript. [only available with deepgram]
+    3. Upload: Saves raw transcript files in an AWS S3 Bucket [optional]
+    4. Constructs the resulting transcript.
+        - Process diarization. [deepgram only]
+        - Process chapters.
 4. Postprocess: Offers multiple options for further actions:
     - **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript.
     - **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts.

diff --git a/app/application.py b/app/application.py
@@ -1,28 +1,12 @@
 """This module provides the transcript cli."""
 import errno
-import json
 import logging
-import mimetypes
 import os
-import re
 import shutil
 import subprocess
-import tempfile
-import time
-from datetime import datetime
-from urllib.parse import parse_qs, urlparse
 
 import boto3
-import pytube
-import requests
-import static_ffmpeg
-import whisper
-import yt_dlp
-from clint.textui import progress
-from deepgram import Deepgram
 from dotenv import dotenv_values
-from moviepy.editor import VideoFileClip
-from pytube.exceptions import PytubeError
 
 from app import __app_name__, __version__
 from app.logging import get_logger
@@ -44,184 +28,6 @@ def convert_wav_to_mp3(abs_path, filename, working_dir="tmp/"):
     return os.path.abspath(os.path.join(working_dir, filename[:-4] + ".mp3"))
 
 
-def decimal_to_sexagesimal(dec):
-    sec = int(dec % 60)
-    minu = int((dec // 60) % 60)
-    hrs = int((dec // 60) // 60)
-
-    return f"{hrs:02d}:{minu:02d}:{sec:02d}"
-
-
-def combine_chapter(chapters, transcript, working_dir="tmp/"):
-    logger = logging.getLogger(__app_name__)
-    try:
-        chapters_pointer = 0
-        transcript_pointer = 0
-        result = ""
-        # chapters index, start time, name
-        # transcript start time, end time, text
-
-        while chapters_pointer < len(chapters) and transcript_pointer < len(
-            transcript
-        ):
-            if (
-                chapters[chapters_pointer][1]
-                <= transcript[transcript_pointer][0]
-            ):
-                result = (
-                    result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n"
-                )
-                chapters_pointer += 1
-            else:
-                result = result + transcript[transcript_pointer][2]
-                transcript_pointer += 1
-
-        while transcript_pointer < len(transcript):
-            result = result + transcript[transcript_pointer][2]
-            transcript_pointer += 1
-
-        return result
-    except Exception as e:
-        logger.error("Error combining chapters")
-        logger.error(e)
-
-
-def combine_deepgram_chapters_with_diarization(deepgram_data, chapters):
-    logger.info("(deepgram) Combining transcript with detected chapters...")
-    try:
-        para = ""
-        string = ""
-        curr_speaker = None
-        words = deepgram_data["results"]["channels"][0]["alternatives"][0][
-            "words"
-        ]
-        words_pointer = 0
-        chapters_pointer = 0
-        while chapters_pointer < len(chapters) and words_pointer < len(words):
-            if chapters[chapters_pointer][1] <= words[words_pointer]["start"]:
-                if para != "":
-                    para = para.strip(" ")
-                    string = string + para + "\n\n"
-                para = ""
-                string = string + f"## {chapters[chapters_pointer][2]}\n\n"
-                chapters_pointer += 1
-            else:
-                if words[words_pointer]["speaker"] != curr_speaker:
-                    if para != "":
-                        para = para.strip(" ")
-                        string = string + para + "\n\n"
-                    para = ""
-                    string = (
-                        string
-                        + f'Speaker {words[words_pointer]["speaker"]}: '
-                        + decimal_to_sexagesimal(words[words_pointer]["start"])
-                    )
-                    curr_speaker = words[words_pointer]["speaker"]
-                    string = string + "\n\n"
-
-                para = para + " " + words[words_pointer]["punctuated_word"]
-                words_pointer += 1
-        while words_pointer < len(words):
-            if words[words_pointer]["speaker"] != curr_speaker:
-                if para != "":
-                    para = para.strip(" ")
-                    string = string + para + "\n\n"
-                para = ""
-                string = (
-                    string + f'Speaker {words[words_pointer]["speaker"]}:'
-                    f' {decimal_to_sexagesimal(words[words_pointer]["start"])}'
-                )
-                curr_speaker = words[words_pointer]["speaker"]
-                string = string + "\n\n"
-
-            para = para + " " + words[words_pointer]["punctuated_word"]
-            words_pointer += 1
-        para = para.strip(" ")
-        string = string + para
-        return string
-    except Exception as e:
-        logger.error("Error combining deepgram chapters")
-        logger.error(e)
-
-
-def get_deepgram_transcript(deepgram_data, diarize):
-    logger = logging.getLogger(__app_name__)
-    try:
-        if diarize:
-            logger.info(f"(deepgram) Processing diarization...")
-            para = ""
-            string = ""
-            curr_speaker = None
-            for word in deepgram_data["results"]["channels"][0]["alternatives"][0][
-                "words"
-            ]:
-                if word["speaker"] != curr_speaker:
-                    if para != "":
-                        para = para.strip(" ")
-                        string = string + para + "\n\n"
-                    para = ""
-                    string = (
-                        string + f'Speaker {word["speaker"]}: '
-                        f'{decimal_to_sexagesimal(word["start"])}'
-                    )
-                    curr_speaker = word["speaker"]
-                    string = string + "\n\n"
-
-                para = para + " " + word["punctuated_word"]
-            para = para.strip(" ")
-            string = string + para
-            return string
-        else:
-            return deepgram_data["results"]["channels"][0]["alternatives"][0][
-                "transcript"
-            ]
-    except Exception as e:
-        raise Exception(f"Error while getting deepgram transcript: {e}")
-
-
-def get_deepgram_summary(deepgram_data):
-    logger = logging.getLogger(__app_name__)
-    try:
-        summaries = deepgram_data["results"]["channels"][0]["alternatives"][0][
-            "summaries"
-        ]
-        summary = ""
-        for x in summaries:
-            summary = summary + " " + x["summary"]
-        return summary.strip(" ")
-    except Exception as e:
-        logger.error("Error getting summary")
-        logger.error(e)
-
-
-def process_mp3_deepgram(filename, summarize, diarize):
-    """using deepgram"""
-    logger = logging.getLogger(__app_name__)
-    logger.info("Transcribing audio to text using deepgram...")
-    try:
-        config = dotenv_values(".env")
-        dg_client = Deepgram(config["DEEPGRAM_API_KEY"])
-
-        with open(filename, "rb") as audio:
-            mimeType = mimetypes.MimeTypes().guess_type(filename)[0]
-            source = {"buffer": audio, "mimetype": mimeType}
-            response = dg_client.transcription.sync_prerecorded(
-                source,
-                {
-                    "punctuate": True,
-                    "speaker_labels": True,
-                    "diarize": diarize,
-                    "smart_formatting": True,
-                    "summarize": summarize,
-                    "model": "whisper-large",
-                },
-            )
-            audio.close()
-        return response
-    except Exception as e:
-        raise Exception(f"(deepgram) Error transcribing audio to text: {e}")
-
-
 def create_pr(absolute_path, loc, username, curr_time, title):
     logger = logging.getLogger(__app_name__)
     branch_name = loc.replace("/", "-")
@@ -242,40 +48,6 @@ def create_pr(absolute_path, loc, username, curr_time, title):
     logger.info("Please check the PR for the transcription.")
 
 
-def combine_deepgram_with_chapters(deepgram_data, chapters):
-    logger.info("(deepgram) Combining transcript with detected chapters...")
-    try:
-        chapters_pointer = 0
-        words_pointer = 0
-        result = ""
-        words = deepgram_data["results"]["channels"][0]["alternatives"][0][
-            "words"
-        ]
-        # chapters index, start time, name
-        # transcript start time, end time, text
-        while chapters_pointer < len(chapters) and words_pointer < len(words):
-            if chapters[chapters_pointer][1] <= words[words_pointer]["end"]:
-                result = (
-                    result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n"
-                )
-                chapters_pointer += 1
-            else:
-                result = result + words[words_pointer]["punctuated_word"] + " "
-                words_pointer += 1
-
-        # Append the final chapter heading and remaining content
-        while chapters_pointer < len(chapters):
-            result = result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n"
-            chapters_pointer += 1
-        while words_pointer < len(words):
-            result = result + words[words_pointer]["punctuated_word"] + " "
-            words_pointer += 1
-
-        return result
-    except Exception as e:
-        raise Exception(f"Error combining deepgram with chapters: {e}")
-
-
 def clean_up(tmp_dir):
     try:
         shutil.rmtree(tmp_dir)
@@ -284,31 +56,6 @@ def clean_up(tmp_dir):
             raise
 
 
-def generate_srt(data, filename, model_output_dir):
-    time_in_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
-    if not os.path.isdir(model_output_dir):
-        os.makedirs(model_output_dir)
-    output_file = os.path.join(
-        model_output_dir, filename + "_" + time_in_str + ".srt"
-    )
-    logger.info(f"Writing srt to {output_file}...")
-    with open(output_file, "w") as f:
-        for index, segment in enumerate(data):
-            start_time, end_time, text = segment
-            f.write(f"{index+1}\n")
-            f.write(f"{format_time(start_time)} --> {format_time(end_time)}\n")
-            f.write(f"{text.strip()}\n\n")
-    logger.info("File saved")
-    return output_file
-
-
-def format_time(time):
-    hours = int(time / 3600)
-    minutes = int((time % 3600) / 60)
-    seconds = int(time % 60)
-    milliseconds = int((time % 1) * 1000)
-    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-
 
 def upload_file_to_s3(file_path):
     logger = logging.getLogger(__app_name__)

diff --git a/app/services/__init__.py b/app/services/__init__.py
@@ -0,0 +1,2 @@
+from .whisper import Whisper
+from .deepgram import Deepgram
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .whisper import Whisper
		from .deepgram import Deepgram