# Review notes for this patch (text above the first "diff --git" header is
# commentary only; the patch content below is unchanged).
#
# Purpose: replace the three word-level formatters in
# app/services/deepgram.py (process_with_diarization_and_chapters,
# process_with_diarization, process_with_chapters) with a staged pipeline
# driven from finalize_transcript:
#   1. process_segments groups consecutive words that share a speaker into
#      dicts with "speaker", "start", "end", "transcript" and "words"; when
#      diarization is off every word is filed under "single_speaker".
#   2. break_segments_into_sentences is called on those segments; its result
#      is later read through speaker_data["sentences"], each entry exposing
#      "start", "end" and "transcript".
#   3. adjust_chapter_timestamps looks up the sentence containing each
#      chapter's start time (chapter[1]) and rewrites that start time;
#      chapters with no matching sentence are passed through unchanged.
#   4. construct_transcript emits "## <title>" headings, "Speaker <id>: <time>"
#      lines (skipped for "single_speaker") and one sentence per line.
# It also adds "import re", and in app/transcription.py write_to_markdown_file
# stops appending "\n" after meta_data.
#
# NOTE(review): this copy of the patch has lost its line breaks, and the text
#   between "abbreviation_pattern = r'(?" and "= midpoint else sentence_start"
#   appears to be missing (the rest of break_segments_into_sentences and the
#   start of adjust_chapter_timestamps). Restore it from the original commit
#   before applying; the hunk line counts cannot be checked as-is.
# NOTE(review): finalize_transcript writes the intermediate structure to
#   "test.json" in the current working directory on every call - looks like
#   leftover debugging; confirm and drop before merging.
# NOTE(review): construct_transcript advances chapter_index at most once per
#   sentence (an "if", not a loop), so several chapters that start before the
#   same sentence are spread over the following sentences - confirm intended.
# NOTE(review): adjust_chapter_timestamps builds
#   "[chapter[0], adjusted_start_time] + chapter[2:]", which presumably needs
#   each chapter to be a list (list + tuple raises TypeError), and
#   construct_transcript unpacks each chapter into exactly three values -
#   verify against how transcript.source.chapters is produced.
# NOTE(review): the removed finalize_transcript branch passed a bare
#   "chapters" name that is not defined in the visible hunk; this patch
#   removes that call.
diff --git a/app/services/deepgram.py b/app/services/deepgram.py index 5a231f5..069e70b 100644 --- a/app/services/deepgram.py +++ b/app/services/deepgram.py @@ -1,5 +1,6 @@ import json import mimetypes +import re import deepgram from dotenv import dotenv_values @@ -64,124 +65,6 @@ def write_to_json_file(self, transcription_service_output, transcript: Transcrip return transcription_service_output_file - def process_with_diarization_and_chapters(self, transcription_service_output, chapters): - logger.info( - "(deepgram) Processing diarization with detected chapters...") - try: - para = "" - string = "" - curr_speaker = None - words = transcription_service_output["results"]["channels"][0]["alternatives"][0][ - "words" - ] - words_pointer = 0 - chapters_pointer = 0 - while chapters_pointer < len(chapters) and words_pointer < len(words): - if chapters[chapters_pointer][1] <= words[words_pointer]["start"]: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = string + f"## {chapters[chapters_pointer][2]}\n\n" - chapters_pointer += 1 - else: - if words[words_pointer]["speaker"] != curr_speaker: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = ( - string - + f'Speaker {words[words_pointer]["speaker"]}: ' - + utils.decimal_to_sexagesimal(words[words_pointer]["start"]) - ) - curr_speaker = words[words_pointer]["speaker"] - string = string + "\n\n" - - para = para + " " + words[words_pointer]["punctuated_word"] - words_pointer += 1 - while words_pointer < len(words): - if words[words_pointer]["speaker"] != curr_speaker: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = ( - string + f'Speaker {words[words_pointer]["speaker"]}:' - f' {utils.decimal_to_sexagesimal(words[words_pointer]["start"])}' - ) - curr_speaker = words[words_pointer]["speaker"] - string = string + "\n\n" - - para = para + " " + words[words_pointer]["punctuated_word"] 
- words_pointer += 1 - para = para.strip(" ") - string = string + para - return string - except Exception as e: - raise Exception(f"Error combining deepgram chapters: {e}") - - def process_with_diarization(self, transcription_service_output): - logger.info(f"(deepgram) Processing diarization...") - para = "" - string = "" - curr_speaker = None - for word in transcription_service_output["results"]["channels"][0]["alternatives"][0][ - "words" - ]: - if word["speaker"] != curr_speaker: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = ( - string + f'Speaker {word["speaker"]}: ' - f'{utils.decimal_to_sexagesimal(word["start"])}' - ) - curr_speaker = word["speaker"] - string = string + "\n\n" - - para = para + " " + word["punctuated_word"] - para = para.strip(" ") - string = string + para - return string - - def process_with_chapters(self, transcription_service_output, chapters): - logger.info("(deepgram) Combining transcript with detected chapters...") - try: - chapters_pointer = 0 - words_pointer = 0 - result = "" - words = transcription_service_output["results"]["channels"][0]["alternatives"][0][ - "words" - ] - # chapters index, start time, name - # transcript start time, end time, text - while chapters_pointer < len(chapters) and words_pointer < len(words): - if chapters[chapters_pointer][1] <= words[words_pointer]["end"]: - result = ( - result + "\n\n## " + - chapters[chapters_pointer][2] + "\n\n" - ) - chapters_pointer += 1 - else: - result = result + \ - words[words_pointer]["punctuated_word"] + " " - words_pointer += 1 - - # Append the final chapter heading and remaining content - while chapters_pointer < len(chapters): - result = result + "\n\n## " + \ - chapters[chapters_pointer][2] + "\n\n" - chapters_pointer += 1 - while words_pointer < len(words): - result = result + words[words_pointer]["punctuated_word"] + " " - words_pointer += 1 - - return result - except Exception as e: - raise Exception(f"Error 
combining deepgram with chapters: {e}") - def process_summary(self, transcript: Transcript): with open(transcript.transcription_service_output_file, "r") as outfile: transcription_service_output = json.load(outfile) @@ -197,6 +80,141 @@ def process_summary(self, transcript: Transcript): except Exception as e: logger.error(f"Error getting summary: {e}") + def process_segments(self, transcription_service_output, diarization): + try: + words = transcription_service_output["results"]["channels"][0]["alternatives"][0]["words"] + segments = [] + current_segment = None + + for word in words: + speaker_id = word["speaker"] if diarization else "single_speaker" + speaker_text = word["punctuated_word"] + if speaker_id != current_segment: + # change of speaker + current_segment = speaker_id + segments.append({ + "speaker": speaker_id, + "start": word["start"], + "end": word["end"], + "transcript": "", + "words": [] + }) + + segments[-1]["transcript"] += f"{speaker_text} " + segments[-1]["words"].append(word) + segments[-1]["end"] = word["end"] + + for segment in segments: + segment["transcript"] = segment["transcript"].strip() + + return segments + except Exception as e: + raise Exception( + f"(deepgram) Error constructing speaker segments: {e}") + + def break_segments_into_sentences(self, segments): + result = [] + # Define the sentence splitting pattern + abbreviation_pattern = r'(?= midpoint else sentence_start + + adjusted_chapters = [] + + for chapter in chapters: + chapter_start_time = chapter[1] + chapter_sentence = find_sentence_for_timestamp( + transformed_json, chapter_start_time) + + if chapter_sentence: + adjusted_start_time = adjust_timestamp( + chapter_start_time, chapter_sentence["start"], chapter_sentence["end"]) + adjusted_chapter = [chapter[0], + adjusted_start_time] + chapter[2:] + adjusted_chapters.append(adjusted_chapter) + else: + adjusted_chapters.append(chapter) + + return adjusted_chapters + + def construct_transcript(self, speaker_segments, chapters): 
+ try: + formatted_transcript = "" + chapter_index = 0 if chapters else None + + for speaker_data in speaker_segments: + speaker_id = speaker_data["speaker"] + single_speaker = speaker_id == "single_speaker" + + for i, sentence_data in enumerate(speaker_data["sentences"]): + sentence_start = sentence_data["start"] + first_sentence = i == 0 + + if chapter_index is not None and chapter_index < len(chapters): + chapter_id, chapter_start_time, chapter_title = chapters[chapter_index] + + if chapter_start_time <= sentence_start: + # Chapter starts at this sentence + formatted_transcript += "\n" if not first_sentence else "" + formatted_transcript += f"## {chapter_title}\n\n" + if not single_speaker and not first_sentence: + formatted_transcript += f"Speaker {speaker_id}: {utils.decimal_to_sexagesimal(chapter_start_time)}\n\n" + chapter_index += 1 + + if not single_speaker and first_sentence: + formatted_transcript += f"Speaker {speaker_id}: {utils.decimal_to_sexagesimal(sentence_start)}\n\n" + + formatted_transcript += f'{sentence_data["transcript"]}\n' + + formatted_transcript += "\n" + + return formatted_transcript.strip() + except Exception as e: + raise Exception(f"Error creating output format: {e}") + def finalize_transcript(self, transcript: Transcript): try: with open(transcript.transcription_service_output_file, "r") as outfile: @@ -204,24 +222,19 @@ def finalize_transcript(self, transcript: Transcript): has_diarization = any( 'speaker' in word for word in transcription_service_output['results']['channels'][0]['alternatives'][0]['words']) - has_chapters = len(transcript.source.chapters) > 0 - - if has_chapters: - # With chapters - if has_diarization: - # With diarization - return self.process_with_diarization_and_chapters(transcription_service_output, chapters) - else: - # Without diarization - return self.process_with_chapters(transcription_service_output, transcript.source.chapters) - else: - # Without chapters - if has_diarization: - # With diarization - 
return self.process_with_diarization(transcription_service_output) - else: - # Without diarization - return transcription_service_output["results"]["channels"][0]["alternatives"][0]["transcript"] + + logger.info( + f"(deepgram) Finalizing transcript [diarization={has_diarization}, chapters={len(transcript.source.chapters)> 0}]...") + speaker_segments = self.process_segments( + transcription_service_output, has_diarization) + speaker_segements_with_sentences = self.break_segments_into_sentences( + speaker_segments) + with open("test.json", "w") as json_file: + json.dump(speaker_segements_with_sentences, json_file, indent=4) + adjusted_chapters = self.adjust_chapter_timestamps( + speaker_segements_with_sentences, transcript.source.chapters) + result = self.construct_transcript( + speaker_segements_with_sentences, adjusted_chapters) return result except Exception as e: diff --git a/app/transcription.py b/app/transcription.py index 0f0fc3d..94ea1d1 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -360,7 +360,7 @@ def write_to_markdown_file(self, transcript: Transcript, output_dir): # Write to file markdown_file = f"{utils.configure_output_file_path(output_dir, transcript.title, add_timestamp=False)}.md" with open(markdown_file, "w") as opf: - opf.write(meta_data + "\n") + opf.write(meta_data) opf.write(transcript.result + "\n") self.logger.info(f"Markdown file stored at: {markdown_file}") return os.path.abspath(markdown_file)