-
Notifications
You must be signed in to change notification settings - Fork 0
/
new-md5
executable file
·315 lines (276 loc) · 9.46 KB
/
new-md5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
#!/usr/bin/env bash
# usage: new-md5 [depth] [dir] [--bitrot]
#
# depth maximum depth for the generated MD5 files.
# dir directory for which the MD5 files will be generated.
# --bitrot use bitrot instead of md5sum, used for directories with
# changing files.
#
# Recursively creates MD5 files and notifies if files specified in existing MD5
# files are missing. New files will be appended to the end of the MD5 file.
#
# Can be used as a cronjob or used in a wrapper script to cycle through
# different directories. Mainly used for static directories, where the data
# should not change, eg. directories containing books or movies. Currently
# doesn't verify existing MD5 files, for which a separate script or command
# should be used.
#
# EXAMPLE 1
#
# $ new-md5 0 /dir/to/books
#
# creates one MD5 file, recursively containing every file found in
# /dir/to/books. If the file structure in /dir/to/books is the following:
#
# Book 1.epub
# Book 2.pdf
# Directory/Book 3.txt
#
# ... a .verify.md5 file will be created in /dir/to/books/.verify.md5,
# containing:
#
# f270899858f8204b62043167ac8d9551 Book 1.epub
# 008e6bc48c8eaa5d2855d57e6b0b8594 Book 2.pdf
# cd5e0e3fba97e8b64cf301bbb350d7cf Directory/Book 3.txt
#
# EXAMPLE 2
#
# $ new-md5
#
# translates to ...
#
# $ new-md5 0 "$PWD"
#
# ... which is identical to example 1, except that the current working directory
# will be used instead of /dir/to/books
#
# EXAMPLE 3
#
# $ new-md5 2 /dir/to/tv_shows
#
# creates as many MD5 files as there are directories within depth 2. If the
# file structure is the following:
#
# /dir/to/tv_shows
# ├── TV Show 1
# │ └── Season 01
# │ ├── Episode 01.mkv
# │ ├── Episode 02.mkv
# │ └── Episode 03.mkv
# └── TV Show 2
# ├── Season 01
# │ ├── Episode 01.mkv
# │ ├── Episode 02.mkv
# │ └── Episode 03.mkv
# └── Season 02
# ├── Episode 01.mkv
# ├── Episode 02.mkv
# └── Episode 03.mkv
#
# ... three .verify.md5 files will be created, as there are three directories
# within depth 2:
# TV Show 1/Season 01/.verify.md5
# TV Show 2/Season 01/.verify.md5
# TV Show 2/Season 02/.verify.md5
#
# ... each of the .verify.md5 files containing the respective directories' files
# recursively, as in example 1.
#
# If a file exists above the directories at depth 2 (at depth 1 or 2), eg.
#
# /dir/to/tv_shows
# ├── Episode 10.mkv
# ├── TV Show 1
# │ ├── Episode 00.mkv
# │ └── Season 01
# │ ├── Episode 01.mkv
# │ ├── Episode 02.mkv
# │ └── Episode 03.mkv
#
# ... Episode 00.mkv and Episode 10.mkv will not be included in any MD5 file
# and the user will be notified of this. In this example, the first argument
# (which defines the depth) should be 0 instead of 2.
#
# EXAMPLE 4
#
# $ new-md5 0 /dir/to/documents --bitrot
#
# instead of using md5sum for appending new files to the MD5 file, the program
# 'bitrot' will be used to create _and verify_ the files in the directory
# /dir/to/documents.
#
# Unlike in the previous examples, if a file has been changed, it will be
# checked for corruption according to the specifications of 'bitrot'.
#
# See 'https://github.com/ambv/bitrot/' for more details.
#
#
#
# In summary, omit the --bitrot argument when creating MD5 files for large and
# static directories, where files should not change. If working with a
# directory containing files that can change over time, include the --bitrot
# argument.
#
# When generating MD5 files in a clean directory structure, it may be more
# convenient to specify a depth greater than zero (see example 3).
# TODO: add the ability to verify existing archives
# TODO: implement the counter in a better way
# TODO: add proper parsing for arguments
# TODO: implement an argument to enable/disable colorized output
# TODO: better implementation for cleaning up filenames with diacritics
# Help text printed for -h/--help and after an argument error.
USAGE="usage: new-md5 [depth] [dir] [--bitrot]
depth maximum depth for the generated MD5 files
dir directory for which the MD5 files will be generated
--bitrot use bitrot instead of md5sum, used for directories with
changing files"
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
    echo "$USAGE"
    exit
fi

# The checksum's filename, without the extension
DEST=".verify"
# The color to print potentially missing files in
ERROR_COLOR="\033[1;41m"
# Whether to create a SHA1 file after running bitrot
CREATE_SHA1_FILE=true

# The depth of the directories where the MD5 files will be generated.
# Defaults to 0 when omitted; anything that is not a non-negative integer is
# rejected here instead of surfacing later as an obscure 'find' error.
depth="${1:-0}"
if [[ ! "$depth" =~ ^[0-9]+$ ]]; then
    echo "invalid depth '$depth': expected a non-negative integer" >&2
    echo "$USAGE" >&2
    exit 1
fi

# The root directory for the MD5 files. Defaults to the working directory
# only when no directory was given; a directory that was given but doesn't
# exist is an error, so that a typo never checksums the wrong tree.
dir="${2:-$PWD}"
if [[ ! -d "$dir" ]]; then
    echo "'$dir' is not a directory" >&2
    echo "$USAGE" >&2
    exit 1
fi

# Check whether to use bitrot or not
use_bitrot=false
# Notify the user of missing dependencies if necessary
if [[ "$3" == "--bitrot" ]]; then
    if ! command -v bitrot > /dev/null 2>&1; then
        echo "executable 'bitrot' not found, exiting." \
            "(installation instructions: https://pypi.org/project/bitrot/)" >&2
        exit 1
    fi
    # sqlite3 is only needed for exporting the bitrot database to a SHA1 file
    if [[ "$CREATE_SHA1_FILE" == true ]] \
        && ! command -v sqlite3 > /dev/null 2>&1; then
        echo "executable 'sqlite3' not found, exiting." >&2
        exit 1
    fi
    use_bitrot=true
fi

# Cache file containing the counter for new files
TMPFILE="$(mktemp)" || exit 1
# Cache file containing the filenames that have disappeared from the file
# structure.
MISSING_FILES="$(mktemp)" || { rm -f -- "$TMPFILE"; exit 1; }
# Remove the cache files when the script exits, including early exits
trap 'rm -f -- "$TMPFILE" "$MISSING_FILES"' EXIT

# Arguments for the bitrot program
BITROT_OPTS=(
    --follow-links
)
get_dirs () {
    # Prints the directories (and symlinks) located exactly $depth levels
    # below $dir, one absolute path per line, on stdout.
    #
    # Regular files found at or above that depth are not covered by any MD5
    # file; they are reported on stderr, each prefixed with "EXCLUDED: ".
    #
    # Globals: dir (read), depth (read)
    local root

    # Resolve the root once so both searches operate on the same path
    root="$(readlink -f "$dir")" || return

    # Check for excluded files above the given depth. find's own error
    # messages are merged in, so they end up in the same report.
    find "$root" -maxdepth "$depth" -type f 2>&1 \
        | sed 's,^,EXCLUDED: ,' >&2

    # Get the directories to verify
    find "$root" \
        -mindepth "$depth" -maxdepth "$depth" \( -type d -o -type l \)
}
get_calculated_files () {
    # Prints the sorted, de-duplicated filenames already recorded in the MD5
    # file of the current directory; prints nothing when that file is absent.
    local md5_file="$DEST.md5"

    [[ -f "$md5_file" ]] || return 0

    # Drop the leading checksum and its separator, keeping only the filename
    sed 's,^[0-9a-f]\{32\} [ \*],,g' "$md5_file" | sort -u
}
get_files () {
    # Lists every regular file below the current directory (symlinks are
    # followed), except files named like the MD5 file, one path per line with
    # the leading "./" component cut off.
    local md5_name="$DEST.md5"

    find -L . -type f ! -name "$md5_name" | cut -d '/' -f 2-
}
get_uncalculated_files () {
    # Prints the names that appear in exactly one of the two listings: files
    # on disk that are not yet in the MD5 file, and files recorded in the MD5
    # file that are no longer on disk.
    {
        get_calculated_files
        get_files
    } | sort | uniq -u
}
create_sha1 () {
# Creates a SHA1 file based on the contents of the .bitrot.db file found in
# the current working directory.
#
# Globals: DEST (read) - basename of the output file, written as "$DEST.sha1"
# Outputs: overwrites "$DEST.sha1"; when .bitrot.db is missing, prints an
# error message (on stdout) and returns without writing anything.
if [[ ! -f ".bitrot.db" ]]; then
echo -n "ERROR: $PWD/.bitrot.db doesn't exist, "
echo "skipping sha1sum file creation"
return
fi
# Pipeline, stage by stage:
# 1. sqlite3 selects the hash and path columns of the 'bitrot' table, leaving
#    out the row for the SHA1 file itself.
# 2. The first sed strips the leading "./" from the path that follows a
#    40-digit hexadecimal hash.
# 3. The second sed rewrites a fixed list of accented characters.
#    NOTE(review): both sides of each substitution render identically;
#    presumably the left side is a decomposed sequence and the right side the
#    precomposed character - confirm the exact bytes before editing this list.
sqlite3 .bitrot.db -separator ' ' \
"SELECT hash,path FROM bitrot WHERE path <> './$DEST.sha1';" \
| sed "s,^\([a-f0-9]\{40\}\) \./,\1 ,g" \
| sed -e 's,ä,ä,g' -e 's,ö,ö,g' -e 's,Ä,Ä,g' -e 's,Ã,Ã,g' \
-e 's,é,é,g' -e 's,Ō,Ō,g' -e 's,í,í,g' -e 's,ō,ō,g' \
-e 's,ö,ö,g' -e 's,č,č,g' -e 's,ū,ū,g' -e 's,ï,ï,g' \
-e 's,å,å,g' -e 's,Ö,Ö,g' \
> "$DEST.sha1"
}
update_md5 () {
    # Updates the checksum data in each of the directories passed on stdin
    # (one path per line).
    #
    # With bitrot enabled, runs 'bitrot' in the directory (its output is
    # prefixed with the directory path) and optionally exports a SHA1 file.
    # Otherwise every file reported by get_uncalculated_files is either
    # appended to the MD5 file, or - when it no longer exists on disk -
    # recorded in $MISSING_FILES.
    #
    # Globals: use_bitrot, BITROT_OPTS, CREATE_SHA1_FILE, DEST (read),
    #          MISSING_FILES (appended to)
    # Outputs: "<dir>: <n> new files" on stdout for directories that gained
    #          entries; skipped directories are reported on stderr
    # Returns: the status of changing back to the starting directory
    local original_path="$PWD" # Get the current working directory
    local path file line new_count

    # IFS= and -r keep surrounding whitespace and backslashes in paths intact
    while IFS= read -r path; do
        # If the directory can't be entered, notify the user and skip to the
        # next one
        if ! cd -- "$path"; then
            echo "Couldn't enter $path, skipping" >&2
            continue
        fi

        # If using bitrot, only execute 'bitrot' and skip to the next one
        if [[ "$use_bitrot" == true ]]; then
            # Prefix the output in the shell instead of splicing $path into
            # a sed program, where ',', '&' or '\' in the path would break it
            bitrot "${BITROT_OPTS[@]}" 2>&1 \
                | while IFS= read -r line || [[ -n "$line" ]]; do
                    printf '%s: %s\n' "$path" "$line"
                done
            # After running bitrot, create a static sha1 file
            if [[ "$CREATE_SHA1_FILE" == true ]]; then
                create_sha1
            fi
            continue
        fi

        # Reset the counter
        new_count=0
        # Get new files and append them to the MD5 file. The loop is fed
        # through process substitution so that it runs in this shell and the
        # counter survives it.
        while IFS= read -r file; do
            # If a file exists in the MD5 file but not in the file structure,
            # remember it so the user can be notified before the script exits
            if [[ ! -f "$file" ]]; then
                printf '%s\n' "$path/$file" >> "$MISSING_FILES"
                continue
            fi
            # Calculate the MD5-checksum of the file and append it to the MD5
            # file; '--' protects filenames that start with a dash. Only
            # successfully hashed files are counted.
            if md5sum -- "$file" >> "$DEST.md5"; then
                new_count=$((new_count + 1))
            fi
        done < <(get_uncalculated_files)

        # Only print the counter and directory when new files have been added
        if (( new_count > 0 )); then
            echo "$path: $new_count new files"
        fi
    done

    # Return to the original working directory
    cd -- "$original_path"
}
print_missing () {
    # Prints the list collected in $MISSING_FILES under a colorized
    # "MISSING FILES:" heading, then resets the terminal color. Prints
    # nothing when the list is empty or absent.
    [[ -s "$MISSING_FILES" ]] || return 0

    echo -e "${ERROR_COLOR}MISSING FILES:"
    cat "$MISSING_FILES"
    echo -en "\033[0m"
}
# Feed every directory found at the requested depth into the updater, which
# generates (or extends) the checksum data in each of them
get_dirs | update_md5

# Report files that are listed in an MD5 file but were not found, on stderr
print_missing 1>&2

# Remove the temporary counter file, then the missing files cache
rm "$TMPFILE"
rm "$MISSING_FILES"