-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
executable file
·140 lines (121 loc) · 3.98 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
import docx
import PyPDF2
import os
import re
import sys
import argparse
"""
Class for obtaining summary of keywords found in a
folder of transcripts
Supports .pdf, .docx, and .txt file formats
"""
class KeywordSearch:
    """
    Search transcript files for keywords and print a summary of the hits.

    Files ending in .docx, .pdf, and .txt are searched; every other file
    found under the given directories is reported as skipped.

    Attributes
    ----------
    transcripts: list of strings
        Paths of every file found under the given directories
    keywords: list of strings
        Non-empty lines of the keyword file, in file order
    counts: dict of string -> int
        Running total of matches per keyword across all parsed transcripts
    text: list of strings
        Lines of the transcript most recently read by parse()
    """

    def __init__(self, keyword_file, directories):
        """
        Initialize KeywordSearch object

        Parameters
        ----------
        keyword_file: string
            Path of keyword file of Enter(newline)-separated
            words that are to be searched
        directories: list of strings
            Paths of directories where documents are located
            Can be relative or absolute
        """
        # Find transcripts in each directory, get keywords, and print findings
        self.transcripts = []
        for directory in directories:
            self.traverse_directory(curr_path=directory)
        self.load_keywords(keyword_file)
        # A single parenthesized argument keeps print valid both as a
        # Python 2 statement and as a Python 3 function call.
        print("Folders: {}\nKeywords: {}\nTranscripts: {}\n".format(
            ", ".join(directories),
            ", ".join(self.keywords),
            ", ".join(self.transcripts)))

    def traverse_directory(self, curr_path="."):
        """
        Recursively collect every file under a directory into self.transcripts

        Parameters
        ----------
        curr_path: string
            Current file path of the recursive call

        Returns
        -------
        No return value. A directory that cannot be listed is reported
        on stdout and skipped.
        """
        try:
            # Sorted so the order of transcripts (and therefore of the
            # printed report) does not depend on the filesystem.
            entries = sorted(os.listdir(curr_path))
        except OSError as e:
            print("Couldn't open '{}': {}".format(curr_path, e))
            return
        for item in entries:
            path = os.path.join(curr_path, item)
            if os.path.isdir(path):
                self.traverse_directory(curr_path=path)
            else:
                self.transcripts.append(path)

    def load_keywords(self, keyword_file):
        """
        Open and parse keyword file

        Sets self.keywords to the non-empty lines of the file and
        self.counts to a zeroed counter for each keyword.

        Parameters
        ----------
        keyword_file: string
            Path of the keyword file

        Returns
        -------
        No return value. If the file cannot be read, the error is
        printed and the process exits with status 1.
        """
        try:
            with open(keyword_file, "r") as f:
                lines = f.read().splitlines()
        except (IOError, OSError) as e:
            print("Couldn't open '{}': {}".format(keyword_file, e))
            # sys.exit rather than the builtin exit(), which is only
            # injected by the site module for interactive use.
            sys.exit(1)
        self.keywords = [line for line in lines if line]
        self.counts = {keyword: 0 for keyword in self.keywords}

    @staticmethod
    def _to_native_str(value):
        """Return value as the interpreter's native str type (UTF-8 for conversions)."""
        if isinstance(value, str):
            return value
        if isinstance(value, bytes):
            # Only reachable on Python 3, where bytes is distinct from str.
            return value.decode("utf-8")
        # Python 2 unicode: encode so it can be mixed with byte-string keywords.
        return value.encode("utf-8")

    def _read_transcript(self, transcript):
        """Return the transcript's text as a list of lines, or None if the format is unsupported."""
        if transcript.endswith(".docx"):
            document = docx.Document(transcript)
            return [self._to_native_str(paragraph.text)
                    for paragraph in document.paragraphs]
        if transcript.endswith(".pdf"):
            lines = []
            with open(transcript, "rb") as pdf_file:
                reader = PyPDF2.PdfFileReader(pdf_file)
                for page_number in range(reader.numPages):
                    page_text = reader.getPage(page_number).extractText()
                    # Split page by page so the last line of one page is
                    # not glued onto the first line of the next.
                    lines.extend(
                        self._to_native_str(page_text).strip().split("\n"))
            return lines
        if transcript.endswith(".txt"):
            with open(transcript, "r") as text_file:
                return text_file.read().splitlines()
        return None

    def parse(self):
        """
        Parse all transcripts for keywords

        For each supported transcript, writes to stdout the number of
        occurrences of every keyword followed by each matching line with
        its 1-based line number, and adds the occurrences to self.counts.
        Keywords are matched literally (case-sensitive). After all
        transcripts are processed the per-keyword totals are printed.
        """
        # Escape keywords so characters such as '+' or '.' are matched
        # literally, and compile each pattern once for all transcripts.
        patterns = [(word, re.compile(re.escape(word)))
                    for word in self.keywords]

        for transcript in self.transcripts:
            lines = self._read_transcript(transcript)
            if lines is None:
                print("Skipping {}...".format(transcript))
                continue
            self.text = lines

            # Print file information
            print("Searching through '{}'...".format(transcript))
            sys.stdout.write("File: {}\nKeywords:\n".format(transcript))

            # Check for keywords
            for word, pattern in patterns:
                # enumerate supplies the true line number even when the
                # same line content appears more than once in the file.
                hits = [(number, line, len(pattern.findall(line)))
                        for number, line in enumerate(lines, 1)]
                count = sum(matches for _, _, matches in hits)
                self.counts[word] += count
                sys.stdout.write("\t{}: {}\n".format(word, count))
                for number, line, matches in hits:
                    if matches:
                        sys.stdout.write(
                            "\t\tLine {}: {}\n".format(number, line))
            print("Done searching through '{}'\n".format(transcript))

        # Print totals
        print("Total keyword finds:")
        for key in self.keywords:
            print("\t{}: {}".format(key, self.counts[key]))
        print("Search complete!")
if __name__ == "__main__":
    # Script entry point: take the keyword file and one or more directories
    # from the command line, build the searcher, and run the search.
    cli = argparse.ArgumentParser(
        description='Obtain summary information on keywords in transcripts (.docx, .pdf, .txt)'
    )
    cli.add_argument(
        'keyword_file',
        type=str,
        help='Path of keyword file of Enter(newline)-separated words that are to be searched'
    )
    cli.add_argument(
        'directories',
        type=str,
        nargs="+",
        help='Path of directory where documents are located. Can be relative or absolute'
    )
    options = cli.parse_args()
    searcher = KeywordSearch(options.keyword_file, options.directories)
    searcher.parse()