-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathplsa-defn.h
292 lines (242 loc) · 9.57 KB
/
plsa-defn.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
/*
** Probabilistic latent semantic analysis (PLSA, multiprocessor version)
** Copyright (C) 2009-2010 by Raymond Wan ([email protected])
**
** This program is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef PLSA_DEFN_H
#define PLSA_DEFN_H
/*
**  Floating-point configuration.
**
**  PROBNODE is the type used to store probabilities.  MPI_TYPE (the MPI
**  datatype) and DOLOG / DOEXP / DOLOGONE (the <math.h> functions) are
**  selected by the same #if as PROBNODE so that they always match it;
**  using the float variants with a double PROBNODE would silently reduce
**  every log/exp result to single precision.
**
**  Highly recommend to use only double.
**
**  <math.h> must be included before any of the math macros is expanded.
*/
#if 1
/*!  Data type to use for probabilities  */
typedef double PROBNODE;
/*!  Data type to use for probabilities (in MPI functions); must match the definition of PROBNODE  */
#define MPI_TYPE MPI_DOUBLE
/*!  Macro to perform a log (base e)  */
#define DOLOG(X) (log (X))
/*!  Macro to perform the exp function  */
#define DOEXP(X) (exp (X))
/*!  Macro to perform log (1 + x)  */
#define DOLOGONE(X) (log1p (X))
#else
/*!  Data type to use for probabilities  */
typedef float PROBNODE;
/*!  Data type to use for probabilities (in MPI functions); must match the definition of PROBNODE  */
#define MPI_TYPE MPI_FLOAT
/*!  Macro to perform a log (base e)  */
#define DOLOG(X) (logf (X))
/*!  Macro to perform the exp function  */
#define DOEXP(X) (expf (X))
/*!  Macro to perform log (1 + x)  */
#define DOLOGONE(X) (log1pf (X))
#endif
/*
**  Reference table for choosing LN_LIMIT:
**
**  e^(-87.49823353) = 1.0E-38
**  e^(-73.682723)   = 1.0E-32
**  e^(-55.26204223) = 1.0E-24
**  e^(-25.32843602) = 0.00000000001
**  e^(-23.02585093) = 0.0000000001
**  e^(-20.72326584) = 0.000000001
**  e^(-18.42068074) = 0.00000001
**  e^(-16.11809565) = 0.0000001
**  e^(-13.81551056) = 0.000001
**  e^(-11.51292547) = 0.00001
**  e^(-9.210340372) = 0.0001
*/
/*!  Accuracy of floating point values as a log (base e) value, multiplied by -1; the current value is the 1.0E-10 row of the table above  */
#define LN_LIMIT 23.02585093
/*!  Minimum probability  */
#define MIN_PROB (1.0E-24)
/*!  Macro to perform log (1 + exp(x))  */
#define DOLOG1PEXP(x) DOLOGONE(DOEXP(x))
/*!  Generate a random number in [0, 1).  rand () and RAND_MAX are cast to
**   PROBNODE before 1 is added so that RAND_MAX + 1 is never computed in
**   integer arithmetic (where it could overflow).  Requires <stdlib.h>.  */
#define RANDOM_FLOAT ((PROBNODE)rand () / ((PROBNODE)RAND_MAX + (PROBNODE)1.0))
/*!  True when A is less than B by more than DBL_EPSILON, i.e. (B - A) > DBL_EPSILON.
**   Both arguments are parenthesized so that expressions such as
**   DBL_LESS (a - b, c) are compared as written.  Requires <float.h>.  */
#define DBL_LESS(A,B) (((B) - (A)) > DBL_EPSILON)
/*!  Minimum difference between two maximum likelihoods  */
#define ML_DELTA 0.001
/*!  ID of the main processor is always 0  */
#define MAINPROC 0
/*!  Scale factor (10^8) used when rounding; per the original note, used when outputting to binary only  */
#define ROUND_DIGITS 100000000
/********************************************************************/
/* Definitions and functions related to MPI */
/*
**  Block-decomposition helpers, after Quinn (2003), pg.120 [BLOCK_SIZE corrected].
**
**    id    = process rank
**    p     = total number of processes
**    n     = number of items
**    index = position of the item to see who is responsible for it
**
**  These are plain macros: BLOCK_SIZE expands its arguments twice.
*/
/*!  Position of the first item of process id  */
#define BLOCK_LOW(id, p, n)       (((id) * (n)) / (p))
/*!  Position of the last item of process id (one before the next process' first item)  */
#define BLOCK_HIGH(id, p, n)      (BLOCK_LOW ((id) + 1, (p), (n)) - 1)
/*!  Number of items of process id  */
#define BLOCK_SIZE(id, p, n)      (BLOCK_LOW ((id) + 1, (p), (n)) - BLOCK_LOW ((id), (p), (n)))
/*!  Rank of the process responsible for the item at position index  */
#define BLOCK_OWNER(index, p, n)  ((((p) * ((index) + 1)) - 1) / (n))
/*!  Maximum latent state -- value must be a multiple of 10 and the true maximum state is 1 less.  Affects the function macro MSG_TAG.  */
#define MAX_CLUSTERS 1000
/*!  Define the message tag based on the iteration (X), type of message (Y; one
**   of the TAG_* values below), and cluster number (Z).
**
**   Every argument is parenthesized so that expressions such as
**   MSG_TAG (iter + 1, type, z) produce the intended tag.
**
**   The per-iteration stride is the literal 10000: with MAX_CLUSTERS at 1000
**   and message types 1..5, (Y * MAX_CLUSTERS + Z) stays below 10000 as long
**   as Z < MAX_CLUSTERS.  NOTE(review): revisit the stride if MAX_CLUSTERS or
**   the number of message types is ever increased.  */
#define MSG_TAG(X,Y,Z) (((X) * 10000) + ((Y) * MAX_CLUSTERS) + (Z))
/*  Message types (the Y argument of MSG_TAG)  */
#define TAG_PROBW1_Z 1
#define TAG_PROBW2_Z 2
#define TAG_PROBZ 3
#define TAG_PROBZ_W1W2 4
#define TAG_PROBW1W2 5
/*!  Trace macros for message passing.
**
**   V is the current process; W is the recipient (MSG_SEND_STATUS) or the
**   sender (MSG_RECV_STATUS); X, Y, Z are the iteration, message type, and
**   cluster ID.
**
**   When DEBUG evaluates to non-zero, each macro prints one line to stderr;
**   all five arguments are printed with %u.  Otherwise (DEBUG zero or not
**   defined) each macro expands to an empty statement.
**
**   In both cases the expansion already ends with a semicolon.  */
#if DEBUG
#define MSG_SEND_STATUS(V,W,X,Y,Z) fprintf (stderr, "[%u] Send to %u (%u, %u, %u)...\n", V, W, X, Y, Z);
#define MSG_RECV_STATUS(V,W,X,Y,Z) fprintf (stderr, "[%u] Receive from %u (%u, %u, %u)...\n", V, W, X, Y, Z);
#else
/*  Tracing disabled: both macros reduce to a lone semicolon  */
#define MSG_SEND_STATUS(V,W,X,Y,Z) ;
#define MSG_RECV_STATUS(V,W,X,Y,Z) ;
#endif
/********************************************************************/
/* Inline functions */
/*!  Define'd function to indicate program progress: prints the string A to
**   stderr when info -> verbose is set.  A variable named info with a
**   verbose member must be in scope at the call site.
**
**   NOTE: expands to a bare if statement; do not follow it directly with an
**   else.  */
#define PROGRESS_MSG(A) \
  if (info -> verbose) { \
    fprintf (stderr, "==\t%s\n", (A)); \
  }
/*!  Open FILENAME with mode MODE and store the stream in FP.  On failure an
**   error message is written to stderr ("creating" when MODE is "w",
**   "opening" otherwise) and the program exits with EXIT_FAILURE.
**
**   The assignment and the NULL check form a single if statement, so the
**   macro behaves as one unit when it is the body of an unbraced if / for /
**   while (previously only the assignment was guarded and the NULL check
**   ran unconditionally).
**
**   NOTE: expands to a bare if statement; do not follow it directly with an
**   else.  Requires <stdio.h>, <stdlib.h> and <string.h>.  */
#define FOPEN(FILENAME,FP,MODE) \
  if (((FP) = fopen ((char*) (FILENAME), (MODE))) == NULL) { \
    fprintf (stderr, "Error %s %s.\n", (strcmp ((MODE), "w") == 0) ? "creating" : "opening", (FILENAME)); \
    exit (EXIT_FAILURE); \
  }
/*!  Close the stream FP, discarding the return value of fclose.  The
**   expansion already ends with a semicolon.  */
#define FCLOSE(FP) \
  (void) fclose ((FP));
/********************************************************************/
/* Functions for accessing cooccurrence structure */
/*!  Function to store into the cooccurrence array: the entry at
**   info -> cos[W][X] gets column position Y and value Z.  A variable named
**   info must be in scope at the call site.  */
#define SET_COS(W,X,Y,Z) \
  { \
    info -> cos[(W)][(X)].column = (Y); \
    info -> cos[(W)][(X)].x = (Z); \
  }
/*!  Function to retrieve the stored value (member x, the cooccurrence count) from the cooccurrence array  */
#define GET_COS(W,X) (info -> cos[(W)][(X)].x)
/*!  Function to retrieve the column position (member column) from the cooccurrence array  */
#define GET_COS_POSITION(W,X) (info -> cos[(W)][(X)].column)
/********************************************************************/
/* Functions for accessing probabilities */
/*
**  All accessors below expect a variable named info in scope at the call
**  site.  Index arguments are parenthesized so that expressions such as
**  GET_PROBW1_Z_PREV (z + 1, w1) address the intended element.
*/
/*!  Function to retrieve from P(w1|z); translate 2D to 1D co-ordinates (X * info -> m + Y) -- X is z; Y is w1  */
#define GET_PROBW1_Z_PREV(X,Y) (info -> probw1_z_prev[(X) * info -> m + (Y)])
#define GET_PROBW1_Z_CURR(X,Y) (info -> probw1_z_curr[(X) * info -> m + (Y)])
/*!  Function to retrieve from P(w2|z); translate 2D to 1D co-ordinates (X * info -> n + Y) -- X is z; Y is w2  */
#define GET_PROBW2_Z_PREV(X,Y) (info -> probw2_z_prev[(X) * info -> n + (Y)])
#define GET_PROBW2_Z_CURR(X,Y) (info -> probw2_z_curr[(X) * info -> n + (Y)])
/*!  Function to retrieve from P(z) -- X is z  */
#define GET_PROBZ_PREV(X) (info -> probz_prev[(X)])
#define GET_PROBZ_CURR(X) (info -> probz_curr[(X)])
/*!  Function to map P(z|w1,w2) to (P(w1|z) * P(w2|z) * P(z)) -- W is z; X is w1; Y is w2.
**   The three stored values are added; presumably they are kept as logs, so
**   the sum is the log of the product -- NOTE(review): confirm.  */
#define GET_PROBZ_W1W2_PREV(W,X,Y) (GET_PROBW1_Z_PREV((W),(X)) + GET_PROBW2_Z_PREV((W),(Y)) + GET_PROBZ_PREV((W)))
#define GET_PROBZ_W1W2_CURR(W,X,Y) (GET_PROBW1_Z_CURR((W),(X)) + GET_PROBW2_Z_CURR((W),(Y)) + GET_PROBZ_CURR((W)))
/*!  Function to retrieve from P(w1,w2); translate 2D to 1D co-ordinates (X * info -> n + Y) -- X is w1; Y is w2  */
#define GET_PROB_W1W2(X, Y) (info -> prob_w1w2[(X) * info -> n + (Y)])
/*!  In-place log-sum: replaces A with log (e^A + e^B), where A and B are
**   natural-log values.  A must be a modifiable lvalue.
**
**   With hi = max (A, B) and lo = min (A, B), the result is
**   hi + log (1 + e^(lo - hi)); when the two values differ by more than
**   LN_LIMIT the smaller term is dropped and the result is simply hi.
**
**   A and B are each read once into temporaries.  The temporaries carry a
**   logsum_ prefix so that they cannot capture caller variables (the
**   previous locals were named x and y, which broke calls such as
**   logSumsInline (x, y)).
**
**   The expansion is a brace-enclosed block, not a do { } while (0).
**   Requires <math.h> for fabs, plus PROBNODE, LN_LIMIT and DOLOG1PEXP.  */
#define logSumsInline(A,B) \
  { \
    const PROBNODE logsum_a_ = (A); \
    const PROBNODE logsum_b_ = (B); \
    const PROBNODE logsum_hi_ = (logsum_a_ > logsum_b_) ? logsum_a_ : logsum_b_; \
    const PROBNODE logsum_lo_ = (logsum_a_ > logsum_b_) ? logsum_b_ : logsum_a_; \
    \
    /*  logsum_hi_ >= logsum_lo_ from here on  */ \
    \
    (A) = (fabs (logsum_lo_ - logsum_hi_) > LN_LIMIT) ? logsum_hi_ : logsum_hi_ + DOLOG1PEXP (logsum_lo_ - logsum_hi_); \
  }
/********************************************************************/
/*!  One stored co-occurrence entry: a value together with the column it belongs to  */
struct cooccur {
  /*!  Co-occurrence count, kept as a log value  */
  PROBNODE x;
  /*!  Column position of the count above  */
  unsigned int column;
};
typedef struct cooccur COOCCUR;
/*!  Program-wide state: run options, input data, probability tables,
**   parallel-execution bookkeeping and timing figures.  The accessor macros
**   in this header expect a pointer to this structure named info.
**   The members use bool and time_t, so <stdbool.h> and <time.h> must be
**   included before this header.  */
struct info {
  /* ---- Run options ---- */

  /*!  Print verbose output?  */
  bool verbose;
  /*!  Print debugging output?  */
  bool debug;
  /*!  Use text I/O  */
  bool textio;
  /*!  Round the output values?  */
  bool rounding;
  /*!  Suppress output  */
  bool no_output;
  /*!  Seed for the random number generator  */
  unsigned int seed;
  /*!  Number of clusters  */
  unsigned int num_clusters;
  /*!  Base filename of the output file  */
  char *base_fn;
  /*!  Upper limit on the number of iterations  */
  unsigned int maxiter;
  /*!  Interval at which p(x,y) is output; UINT_MAX means do not output  */
  unsigned int snapshot;

  /* ---- Input data ---- */

  /*!  Number of unique query terms  */
  unsigned int m;
  /*!  Number of terms in the document collection  */
  unsigned int n;
  /*!  Filename of the co-occurrence data  */
  char *co_fn;
  /*!  Co-occurrence counts, held as COOCCUR entries  */
  COOCCUR **cos;
  /*!  List of row identifiers (m of them)  */
  unsigned int *row_ids;
  /*!  List of column identifiers (the original note says m of them;
  **   NOTE(review): possibly n -- confirm against the allocation)  */
  unsigned int *column_ids;

  /* ---- EM state ---- */

  /*!  Current iteration; only calculated by the main process and broadcast to the others  */
  unsigned int iter;
  /*!  P(w1|z), size (k * m)  */
  PROBNODE *probw1_z_curr;
  /*!  P(w2|z), size (k * n)  */
  PROBNODE *probw2_z_curr;
  /*!  P(z), size (k)  */
  PROBNODE *probz_curr;
  /*!  P'(w1|z), size (k * m)  */
  PROBNODE *probw1_z_prev;
  /*!  P'(w2|z), size (k * n)  */
  PROBNODE *probw2_z_prev;
  /*!  P'(z), size (k)  */
  PROBNODE *probz_prev;
  /*!  P(w1,w2), size (m * n)  */
  PROBNODE *prob_w1w2;

  /* ---- Variables specific to Open MP ---- */

  int threads;

  /* ---- Variables specific to MPI ---- */

  /*!  ID of this process  */
  int world_id;
  /*!  Total number of processes  */
  int world_size;
  /*!  First block (cluster) this process handles  */
  unsigned int block_start;
  /*!  Last block (cluster) this process handles  */
  unsigned int block_end;
  /*!  Size of the block this process handles  */
  unsigned int block_size;

  /*!  Count of floating point exception errors  */
  unsigned int sigfpe_count;

  /* ---- Various times; the *_time members are named after program steps ---- */

  time_t program_start;
  double run_time;
  double readCO_time;
  double initEM_time;
  double calculateProbW1W2_time;
  double calculateML_time;
  double swapPrevCurr_time;
  double applyEMStep_time;
  double gatherProbs_time;
  double normalizeProbs_time;
  double distributeProbs_time;
  double printCoProbs_time;
  time_t program_end;
};
typedef struct info INFO;
#endif