-
Notifications
You must be signed in to change notification settings - Fork 0
/
configuration.py
108 lines (102 loc) · 4.92 KB
/
configuration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import argparse
def get_configuration(args):
    """Parse the transcriber's command-line arguments into a plain dict.

    Builds an ``argparse`` parser with two positionals (``operationMode`` and
    ``inputDir``) plus three option groups (recognize, assemble, summarize),
    parses ``args`` with it, and returns the result of ``vars()`` on the
    parsed namespace.

    Args:
        args: Sequence of argument strings (without the program name). It is
            handed straight to ``ArgumentParser.parse_args``, so ``None``
            makes argparse read ``sys.argv[1:]``.

    Returns:
        dict with the keys ``operationMode``, ``inputDir``, ``extension``,
        ``fast``, ``noEllipses``, ``disfluentComma``, ``noAsterisks``,
        ``showTimestamps``, ``corrections``, ``names``, ``promptType`` and
        ``openApiKey``. ``store_true`` switches are ``False`` when absent;
        ``extension`` is ``'ogg'`` when absent; the remaining string options
        are ``None`` when absent.

    Raises:
        SystemExit: raised by argparse on invalid arguments or ``-h``.
    """
    # RawTextHelpFormatter keeps the hand-wrapped line breaks in the help
    # strings below instead of re-flowing them.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='''Multi-Stem Conversational Transcriber
''')

    parser.add_argument(
        'operationMode', type=str,
        choices=['recognize', 'assemble', 'summarize', 'semiauto', 'fullauto'],
        help='''Which step to perform:
- recognize: Transcribes all audio files found at the
path using whisper_timestamped and writes
a .words.json file for each.
- assemble: Arranges by timecode the contents of all
.words.json files found at the path,
switching speakers at punctuation, to
produce a readable transcript.txt.
- summarize: Calls OpenAI API to summarize the
transcript.txt at the path using
configurable prompts.
- semiauto: Runs recognize followed immediately by
assemble. (This is the recommended first
pass mode, as it is common to iterate on
assemble multiple times making manual
tweaks to the .words.json files.)
- fullauto: Performs all steps in succession.
''')
    parser.add_argument('inputDir', type=str, help='The path to the files to process.')

    # ---- recognize mode -------------------------------------------------
    recognizeConfigGroup = parser.add_argument_group('recognize mode options')
    # The help text promises a default of "ogg"; declare it here so the
    # returned value matches the documented behavior instead of being None.
    recognizeConfigGroup.add_argument(
        '--extension', type=str, default='ogg',
        help='''File extension of the audio files to transcribe.
Defaults to "ogg", for use with Craig recordings, but
I would think that things like "wav" or "flac" would
work too.
''')
    recognizeConfigGroup.add_argument(
        '--fast', action='store_true',
        help='''Prioritize recognition speed over accuracy.
Results in the following changes:
- Uses the "tiny" model instead of the "small" model
- Uses "efficient" params rather than "accurate" ones
Honestly this really doesn't work well at all and I
do not recommend it.
'''
    )

    # ---- assemble mode --------------------------------------------------
    assembleConfigGroup = parser.add_argument_group('assemble mode options')
    assembleConfigGroup.add_argument(
        '--noEllipses', action='store_true',
        help='''This script normally inserts ellipses (...) into the
transcript whenever a word is more than 5s after its
predecessor, allowing a speaker change (which is done
on punctuation).
The --noEllipses switch suppresses this behavior.
''')
    assembleConfigGroup.add_argument(
        '--disfluentComma', action='store_true',
        help='''Replace detected disfluencies (e.g. "um", "uh") with a
comma in the transcript.
This may help if you are using --noEllipses.
''')
    assembleConfigGroup.add_argument(
        '--noAsterisks', action='store_true',
        help='''When this script inserts ellipses or disfluency commas
into the transcript, it marks them with an asterisk (*)
for reference.
The --noAsterisks switch suppresses this behavior.
''')
    assembleConfigGroup.add_argument(
        '--showTimestamps', action='store_true',
        help='''Include the start and end seconds of the phrase in
front of each line in the transcript.
i.e. [1905.39-1907.05] Joe: "Look a timestamp."
''')
    # --corrections and --names are returned as raw strings; per their help
    # text each may be either inline JSON or a path to a .json file, so any
    # decoding is left to the consumer of this config.
    assembleConfigGroup.add_argument(
        '--corrections', type=str,
        help='''A list of known incorrect values to replace in the
transcript output. This is a quick way to correct
frequently misinterpreted text such as unusual names.
Each entry is the correct word or phrase with a list of
incorrect ones. For example,
'{"Elsalor":["Elcelor", "I'll solar", "else the Lord"],
"A'Dhem" :["Adam"] }'
This can be a path to a .json file or the actual JSON.
''')
    assembleConfigGroup.add_argument(
        '--names', type=str,
        help='''Replacements for the speaker names as recorded in the
filenames by discord/Craig. These should reflect the
names used by speakers to refer to each other in the
recordings. For example:
'{ "joey__0": "Joe",
"randointernet3000_0": "Bob" }'
This can be a path to a .json file or the actual JSON.
You will be prompted individually for any values not
found here (and given the opportunity to skip that
audio stem).
''')

    # ---- summarize mode -------------------------------------------------
    summarizeConfigGroup = parser.add_argument_group('summarize mode options')
    summarizeConfigGroup.add_argument(
        '--promptType', type=str,
        help='''
This script will call OpenAI's GPT-4 API to summarize
the transcript as many times as it is given prompts to
do so. It will attempt to find text files with the name
pattern "prompt_{promptType}_*.txt", in the following
order:
- in the `inputDir`
- one level above the `inputDir`
- in the location of this script
''')
    summarizeConfigGroup.add_argument(
        '--openApiKey', type=str,
        help='''Due to current LLM token limits (Q1 2024) and the very
large number of tokens needed to summarize transcripts
of much length, the summarize operation calls ChatGPT
4 Turbo (128k tokens). As such, an OpenAI API key is
required to run in summarize (or fullauto) mode.
(It'll probably cost you about $0.10 USD per call.)
''')

    # vars() turns the argparse Namespace into an ordinary dict for callers.
    config = vars(parser.parse_args(args))
    return config