-
Notifications
You must be signed in to change notification settings - Fork 17
/
invertedIndex.py
109 lines (88 loc) · 3.35 KB
/
invertedIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Example code in python programming language demonstrating some of the features of an inverted index.
# In this example, we scan a directory containing the corpus of files. (In this case the documents are reports on articles
# and authors submitted to the Journal "Communications of the Association for Computing Machinery"
#
# In this example we see each file being read, tokenized (each word or term is extracted) combined into a sorted
# list of unique terms.
#
# We also see the creation of a documents dictionary containing each document in sorted form with an index assigned to it.
# Each unique term is written out into a terms dictionary in sorted order with an index number assigned for each term.
# From our readings we know that to complete the inverted index all that we need to do is create a third file that will
# correlate each term with the list of documents that it was extracted from.
# We can further develop this example by keeping a reference for each term of the documents that it came from and by
# developing a list of the documents thus creating the term and document dictionaries.
#
# As you work with this example, think about how you might enhance it to assign a unique index number to each term and to
# each document and how you might create a data structure that links the term index with the document index.
import sys,os,re
import time
import math
# Global running counters for the corpus statistics: total tokens seen,
# files processed, unique terms found, and the running index numbers
# assigned to terms and documents respectively.
tokens = documents = terms = termindex = docindex = 0

# Accumulators for the raw token stream and the document file names.
alltokens, alldocs = [], []

# Capture the wall-clock start time so the total running time required
# to process the corpus can be reported at the end.
t2 = time.localtime()

# Directory holding the corpus (CACM article/author reports).
dirname = "/Data/SourceCode/infoRetrieval/cacm"
# For each document in the corpus directory: record its file name, read its
# contents, and append every whitespace-delimited token to alltokens.
#
# BUG FIX: the original stripped newlines with replace('\n', ''), which fused
# the last word of each line onto the first word of the next line, corrupting
# those tokens.  Newlines are whitespace, so str.split() with no arguments
# already separates tokens across line boundaries correctly.
filenames = os.listdir(dirname)  # renamed: 'all' shadowed the builtin all()
for f in filenames:
    documents += 1
    with open(dirname + '/' + f, 'r') as myfile:
        alldocs.append(f)
        data = myfile.read()
        for token in data.split():
            alltokens.append(token)
            tokens += 1
# Write the document dictionary: one "<filename>,<index>" line per document,
# with index numbers assigned in sorted filename order.
#
# Fixes: a context manager guarantees the file is closed even if a write
# fails, and '\n' replaces os.linesep -- files opened in text mode already
# translate '\n' to the platform line ending, so writing os.linesep would
# produce '\r\r\n' on Windows.
alldocs.sort()
with open(dirname + '/' + 'documents.dat', 'w') as documentfile:
    for f in alldocs:
        docindex += 1
        documentfile.write(f + ',' + str(docindex) + '\n')
#
# Build the sorted vocabulary of unique terms.
#
# Fix: the original tested membership with "i not in g" against a growing
# list, which is O(n) per token -- O(n^2) over the whole corpus.  Collapsing
# the tokens through a set and sorting yields exactly the same sorted list
# of unique terms in O(n log n).  The original's running "terms += 1" counter
# was redundant (immediately overwritten by terms = len(g)), so it is gone.
alltokens.sort()
g = sorted(set(alltokens))
terms = len(g)
# Output the term dictionary (index) to disk: one "<term>,<index>" line per
# unique term, assigning index numbers in sorted term order.
#
# Fixes (matching documents.dat above): a context manager guarantees the
# file is closed, and '\n' replaces os.linesep -- text mode already maps
# '\n' to the platform line ending, so os.linesep would emit '\r\r\n' on
# Windows.
with open(dirname + '/' + 'index.dat', 'w') as indexfile:
    for i in g:
        termindex += 1
        indexfile.write(i + ',' + str(termindex) + '\n')
# Report the corpus statistics gathered above, plus start/end wall-clock
# times so the total processing time can be read off the output.
#
print('Processing Start Time: %.2d:%.2d' % (t2.tm_hour, t2.tm_min))
print("Documents %i" % documents)
print("Tokens %i" % tokens)
print("Terms %i" % terms)
t2 = time.localtime()
print('Processing End Time: %.2d:%.2d' % (t2.tm_hour, t2.tm_min))
# Heaps' law vocabulary estimate M = k * T^b with k=40 and exponent 0.5
# (written "peta" in the printed label below), where T is the token count.
print("============================================")
M = 40 * tokens ** 0.5
print("size of vocabulary (M) in Heap's law, k=40, peta=0.5")
print("size of vocabulary %i" % M)