Skip to content

Commit

Permalink
extract transcription services to distinct modules
Browse files Browse the repository at this point in the history
`Transcription` now initializes one of the available services for
transcription based on user's choice: `Whisper` or `Deepgram`
  • Loading branch information
kouloumos committed Dec 6, 2023
1 parent 37c086e commit 2764a7f
Show file tree
Hide file tree
Showing 10 changed files with 376 additions and 411 deletions.
10 changes: 9 additions & 1 deletion Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,15 @@ This transcription tool operates through a structured four-stage process:

1. Preprocess: Gathers all the available metadata for each source (supports YouTube videos & playlists, and RSS feeds)
2. Process: Downloads and converts sources for transcription preparation
3. Transcription: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts from MP3 files.
3. Transcription: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts.
1. Converts audio to text.
- Preserves raw whisper transcript in SRT
- Preserves raw deepgram output in JSON
2. Summarize: Generates a summary of the transcript. [only available with deepgram]
3. Upload: Saves raw transcript files in an AWS S3 Bucket [optional]
4. Constructs the resulting transcript.
- Process diarization. [deepgram only]
- Process chapters.
4. Postprocess: Offers multiple options for further actions:
- **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript.
- **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts.
Expand Down
253 changes: 0 additions & 253 deletions app/application.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,12 @@
"""This module provides the transcript cli."""
import errno
import json
import logging
import mimetypes
import os
import re
import shutil
import subprocess
import tempfile
import time
from datetime import datetime
from urllib.parse import parse_qs, urlparse

import boto3
import pytube
import requests
import static_ffmpeg
import whisper
import yt_dlp
from clint.textui import progress
from deepgram import Deepgram
from dotenv import dotenv_values
from moviepy.editor import VideoFileClip
from pytube.exceptions import PytubeError

from app import __app_name__, __version__
from app.logging import get_logger
Expand All @@ -44,184 +28,6 @@ def convert_wav_to_mp3(abs_path, filename, working_dir="tmp/"):
return os.path.abspath(os.path.join(working_dir, filename[:-4] + ".mp3"))


def decimal_to_sexagesimal(dec):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional parts are dropped by ``int()``. The hour field is not wrapped,
    so long durations simply use more than two digits.
    """
    total_minutes, seconds = divmod(dec, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"


def combine_chapter(chapters, transcript, working_dir="tmp/"):
    """Interleave chapter headings with transcript text.

    Chapters are indexed as ``(index, start time, name)`` and transcript
    segments as ``(start time, end time, text)``. A ``## name`` heading is
    written before the first segment whose start time is not earlier than
    the chapter's start time. Chapters still pending once the transcript
    is exhausted are not written.

    Returns the combined text. If anything goes wrong the error is logged
    and the function falls through, returning ``None``. ``working_dir`` is
    accepted but not used by this function.
    """
    logger = logging.getLogger(__app_name__)
    try:
        pieces = []
        chapter_idx = 0
        chapter_count = len(chapters)
        for segment in transcript:
            # Emit every heading that is due before this segment.
            while (
                chapter_idx < chapter_count
                and chapters[chapter_idx][1] <= segment[0]
            ):
                pieces.append("\n\n## " + chapters[chapter_idx][2] + "\n\n")
                chapter_idx += 1
            pieces.append(segment[2])
        return "".join(pieces)
    except Exception as e:
        logger.error("Error combining chapters")
        logger.error(e)


def combine_deepgram_chapters_with_diarization(deepgram_data, chapters):
    """Build a speaker-labelled transcript with chapter headings.

    Words are read from
    ``deepgram_data["results"]["channels"][0]["alternatives"][0]["words"]``;
    each word provides ``start``, ``speaker`` and ``punctuated_word``.
    Chapters are indexed as ``(index, start time, name)``.

    A ``## name`` heading is written before the first word whose start time
    is not earlier than the chapter start, and a ``Speaker N: HH:MM:SS``
    line is written whenever the speaker changes. A heading closes the
    current paragraph but does not reset the current speaker. Chapters left
    over after the last word are not written.

    Returns the combined text. On any error the failure is logged and the
    function falls through, returning ``None``.
    """
    logger.info("(deepgram) Combining transcript with detected chapters...")
    try:
        words = deepgram_data["results"]["channels"][0]["alternatives"][0][
            "words"
        ]
        output = ""
        paragraph = ""
        active_speaker = None
        next_chapter = 0
        chapter_count = len(chapters)

        for word in words:
            word_start = word["start"]

            # Emit every chapter heading that is due before this word.
            while (
                next_chapter < chapter_count
                and chapters[next_chapter][1] <= word_start
            ):
                if paragraph != "":
                    output += paragraph.strip(" ") + "\n\n"
                    paragraph = ""
                output += f"## {chapters[next_chapter][2]}\n\n"
                next_chapter += 1

            # Start a new labelled paragraph when the speaker changes.
            speaker = word["speaker"]
            if speaker != active_speaker:
                if paragraph != "":
                    output += paragraph.strip(" ") + "\n\n"
                    paragraph = ""
                timestamp = decimal_to_sexagesimal(word_start)
                output += f"Speaker {speaker}: {timestamp}\n\n"
                active_speaker = speaker

            paragraph += " " + word["punctuated_word"]

        return output + paragraph.strip(" ")
    except Exception as e:
        logger.error("Error combining deepgram chapters")
        logger.error(e)


def get_deepgram_transcript(deepgram_data, diarize):
    """Extract the transcript text from a Deepgram response.

    Args:
        deepgram_data: Deepgram response; the first alternative of the first
            channel under ``results`` is used.
        diarize: When truthy, rebuild the text from the per-word entries,
            starting a new paragraph headed by ``Speaker N: HH:MM:SS``
            every time the speaker changes. Otherwise return the
            alternative's ``transcript`` field as-is.

    Returns:
        The transcript text.

    Raises:
        Exception: If the response lacks the expected fields; the original
            error is attached as the cause.
    """
    logger = logging.getLogger(__app_name__)
    try:
        if not diarize:
            return deepgram_data["results"]["channels"][0]["alternatives"][
                0
            ]["transcript"]

        logger.info("(deepgram) Processing diarization...")
        words = deepgram_data["results"]["channels"][0]["alternatives"][0][
            "words"
        ]
        # Collect pieces and join once instead of repeated concatenation.
        pieces = []
        paragraph = []
        curr_speaker = None
        for word in words:
            if word["speaker"] != curr_speaker:
                if paragraph:
                    pieces.append(" ".join(paragraph).strip(" ") + "\n\n")
                    paragraph = []
                timestamp = decimal_to_sexagesimal(word["start"])
                pieces.append(f'Speaker {word["speaker"]}: {timestamp}\n\n')
                curr_speaker = word["speaker"]
            paragraph.append(word["punctuated_word"])
        pieces.append(" ".join(paragraph).strip(" "))
        return "".join(pieces)
    except Exception as e:
        # Chain the cause so the traceback shows the underlying failure.
        raise Exception(f"Error while getting deepgram transcript: {e}") from e


def get_deepgram_summary(deepgram_data):
    """Join the summaries of a Deepgram response into a single string.

    Reads the ``summaries`` list of the first alternative of the first
    channel and joins each entry's ``summary`` text with single spaces.

    Returns the joined text with surrounding spaces stripped. On any error
    the failure is logged and the function falls through, returning ``None``.
    """
    logger = logging.getLogger(__app_name__)
    try:
        alternative = deepgram_data["results"]["channels"][0]["alternatives"][
            0
        ]
        joined = " ".join(entry["summary"] for entry in alternative["summaries"])
        return joined.strip(" ")
    except Exception as e:
        logger.error("Error getting summary")
        logger.error(e)


def process_mp3_deepgram(filename, summarize, diarize):
    """Transcribe an audio file with Deepgram.

    The API key is read from the ``DEEPGRAM_API_KEY`` entry of the local
    ``.env`` file, and the file's MIME type is guessed from its name.

    Args:
        filename: Path of the audio file to send.
        summarize: Value passed as Deepgram's ``summarize`` option.
        diarize: Value passed as Deepgram's ``diarize`` option.

    Returns:
        The response returned by ``sync_prerecorded``.

    Raises:
        Exception: If configuration, file access or the request fails; the
            original error is attached as the cause.
    """
    logger = logging.getLogger(__app_name__)
    logger.info("Transcribing audio to text using deepgram...")
    try:
        config = dotenv_values(".env")
        dg_client = Deepgram(config["DEEPGRAM_API_KEY"])
        mime_type = mimetypes.MimeTypes().guess_type(filename)[0]
        options = {
            "punctuate": True,
            "speaker_labels": True,
            "diarize": diarize,
            "smart_formatting": True,
            "summarize": summarize,
            "model": "whisper-large",
        }

        # The context manager closes the file; no explicit close() needed.
        with open(filename, "rb") as audio:
            source = {"buffer": audio, "mimetype": mime_type}
            return dg_client.transcription.sync_prerecorded(source, options)
    except Exception as e:
        # Chain the cause so the traceback shows the underlying failure.
        raise Exception(
            f"(deepgram) Error transcribing audio to text: {e}"
        ) from e


def create_pr(absolute_path, loc, username, curr_time, title):
logger = logging.getLogger(__app_name__)
branch_name = loc.replace("/", "-")
Expand All @@ -242,40 +48,6 @@ def create_pr(absolute_path, loc, username, curr_time, title):
logger.info("Please check the PR for the transcription.")


def combine_deepgram_with_chapters(deepgram_data, chapters):
    """Interleave chapter headings with the words of a Deepgram transcript.

    Words are read from the first alternative of the first channel of
    ``deepgram_data``; each provides ``end`` and ``punctuated_word``.
    Chapters are indexed as ``(index, start time, name)``.

    A ``## name`` heading is written before the first word whose end time
    is not earlier than the chapter start. Every word is followed by a
    single space. Chapters left over after the last word are appended as
    trailing headings.

    Returns:
        The combined text.

    Raises:
        Exception: If the inputs lack the expected structure; the original
            error is attached as the cause.
    """
    logger.info("(deepgram) Combining transcript with detected chapters...")
    try:
        words = deepgram_data["results"]["channels"][0]["alternatives"][0][
            "words"
        ]
        # Collect pieces and join once instead of repeated concatenation.
        pieces = []
        chapters_pointer = 0
        chapter_count = len(chapters)
        for word in words:
            # Emit every heading that is due before this word.
            while (
                chapters_pointer < chapter_count
                and chapters[chapters_pointer][1] <= word["end"]
            ):
                pieces.append(
                    "\n\n## " + chapters[chapters_pointer][2] + "\n\n"
                )
                chapters_pointer += 1
            pieces.append(word["punctuated_word"] + " ")

        # Headings for chapters that start after the last word.
        while chapters_pointer < chapter_count:
            pieces.append("\n\n## " + chapters[chapters_pointer][2] + "\n\n")
            chapters_pointer += 1

        return "".join(pieces)
    except Exception as e:
        # Chain the cause so the traceback shows the underlying failure.
        raise Exception(f"Error combining deepgram with chapters: {e}") from e


def clean_up(tmp_dir):
try:
shutil.rmtree(tmp_dir)
Expand All @@ -284,31 +56,6 @@ def clean_up(tmp_dir):
raise


def generate_srt(data, filename, model_output_dir):
    """Write transcript segments to a timestamped ``.srt`` file.

    Args:
        data: Iterable of ``(start_time, end_time, text)`` segments; the
            times are rendered with ``format_time``.
        filename: Base name of the output file, without extension.
        model_output_dir: Directory for the file; created if missing.

    Returns:
        Path of the written file, named
        ``<filename>_<YYYY-mm-dd-HH-MM-SS>.srt``.
    """
    time_in_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(model_output_dir, exist_ok=True)
    output_file = os.path.join(
        model_output_dir, filename + "_" + time_in_str + ".srt"
    )
    logger.info(f"Writing srt to {output_file}...")
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        for index, (start_time, end_time, text) in enumerate(data, start=1):
            f.write(f"{index}\n")
            f.write(f"{format_time(start_time)} --> {format_time(end_time)}\n")
            f.write(f"{text.strip()}\n\n")
    logger.info("File saved")
    return output_file


def format_time(time):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields. Truncating the fractional part instead loses a millisecond to
    float representation error (for example ``1.001`` would render as
    ``00:00:01,000``).
    """
    total_milliseconds = int(round(time * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def upload_file_to_s3(file_path):
logger = logging.getLogger(__app_name__)
Expand Down
2 changes: 2 additions & 0 deletions app/services/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .whisper import Whisper
from .deepgram import Deepgram
Loading

0 comments on commit 2764a7f

Please sign in to comment.