-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathshingles.go
192 lines (169 loc) · 5.38 KB
/
shingles.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
package shingles
import (
"fmt"
"hash/crc32"
"regexp"
"sort"
"strings"
"sync"
)
const (
UNIGRAM = iota + 1
BIGRAM
TRIGRAM
FOURGRAM
FIVEGRAM
SIXGRAM
SEVENGRAM
EIGHTGRAM
)
var stopwords = []string{"i", "me", "my", "myself", "we", "our", "ours", "ourselves",
"you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
"her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs",
"themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am",
"is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do",
"does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as",
"until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into",
"through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in",
"out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there",
"when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other",
"some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
"t", "can", "will", "just", "don", "should", "now"}
// NGram is a single n-gram, including it's frequency in the corpora
type NGram struct {
ngram string
occurrences int
}
// Shingles holds a number of NGrams representing a given corpora.
type Shingles struct {
mtx sync.Mutex
n int
ngrams map[uint32]*NGram
hashes []uint32
}
func contains(slice []string, val string) bool {
for _, item := range slice {
if item == val {
return true
}
}
return false
}
func hash(s string) uint32 {
return crc32.ChecksumIEEE([]byte(s))
}
func frequency(count int, total int) (frequency float32) {
return (float32)(count) / (float32)(total)
}
// Extract all sentences from a given input
func sentences(s string) (sentences []string) {
re := regexp.MustCompile(`(?m)\b([^.,;\-\?\!]{1,})\b`) // A sentence extractor... works surprisingly well, but only for latin characters.
for _, match := range re.FindAllString(s, -1) {
sentences = append(sentences, match)
}
return
}
// Extract all words from a given input, removing any english stopwords if normalizing
func words(s string, normalize bool) (words []string) {
re := regexp.MustCompile(`(?m)\b([\w'-]{1,})\b`) // A word extractor... works surprisingly well, but only for latin characters.
for _, match := range re.FindAllString(s, -1) {
switch {
case normalize:
if contains(stopwords, strings.ToLower(match)) {
continue
}
words = append(words, strings.ToLower(match))
default:
words = append(words, match)
}
}
return
}
// Compute n-grams for a set of words given as a slice of strings
func computeNGrams(words []string, length int) (ngrams []string) {
max := len(words) - length
for i := 0; i <= max; i++ {
ngrams = append(ngrams, strings.Join(words[i:i+length], " "))
}
return
}
// Add a given n-gram in the form of a string array
func (sh *Shingles) add(ngrams []string) {
for _, ngram := range ngrams {
hash := hash(ngram)
sh.mtx.Lock()
existing, exists := sh.ngrams[hash]
switch {
case exists: // Update existing n-gram
existing.occurrences = existing.occurrences + 1
default:
sh.ngrams[hash] = &NGram{
ngram: ngram,
occurrences: 1,
}
sh.hashes = append(sh.hashes, hash)
}
sh.mtx.Unlock()
}
}
//
// The following implements the necessary methods for the sort interface
// This allows us to return a list of ngrams sorted by *occurrences*.
//
// Len returns the number of n-grams
func (sh *Shingles) Len() int {
sh.mtx.Lock()
defer sh.mtx.Unlock()
return len(sh.hashes)
}
// Less returns true when the value at index i is less than the value at index j
func (sh *Shingles) Less(i, j int) bool {
sh.mtx.Lock()
defer sh.mtx.Unlock()
return sh.ngrams[sh.hashes[i]].occurrences < sh.ngrams[sh.hashes[j]].occurrences
}
// Swap two values
func (sh *Shingles) Swap(i, j int) {
sh.mtx.Lock()
sh.hashes[i], sh.hashes[j] = sh.hashes[j], sh.hashes[i]
sh.mtx.Unlock()
}
// Initialize the shingles
func (sh *Shingles) Initialize(n int) {
sh.mtx.Lock()
sh.n = n
sh.ngrams = make(map[uint32]*NGram, 10000)
sh.mtx.Unlock()
}
// Count the number of ngrams
func (sh *Shingles) Count() int {
return len(sh.hashes)
}
// Incorporate extracted n-grams. The corpora will be any text, from which sentences and words are extracted.
func (sh *Shingles) Incorporate(corpora string, normalize bool) {
for _, sentence := range sentences(corpora) {
sh.add(computeNGrams(words(sentence, normalize), sh.n))
}
}
// Walk and print all n-grams
func (sh *Shingles) Walk() {
for hash, ngram := range sh.ngrams {
fmt.Printf("hash:%v, ngram:'%v', count:%v, frequency:%v\n", hash, ngram.ngram, ngram.occurrences, frequency(ngram.occurrences, sh.Count()))
}
}
// SortedWalk walks and prints all n-grams, sorted by decreasing frequency
func (sh *Shingles) SortedWalk() {
sort.Sort(sort.Reverse(sh))
for _, hash := range sh.hashes {
fmt.Printf("hash:%v, ngram:'%v', count:%v, frequency:%v\n", hash, sh.ngrams[hash].ngram, sh.ngrams[hash].occurrences, frequency(sh.ngrams[hash].occurrences, sh.Count()))
}
}
// SortedList returns a list of all n-grams, sorted by decreasing frequency
func (sh *Shingles) SortedList() []string {
var ngrams []string
sort.Sort(sort.Reverse(sh))
for _, hash := range sh.hashes {
ngrams = append(ngrams, sh.ngrams[hash].ngram)
}
return ngrams
}