-
Notifications
You must be signed in to change notification settings - Fork 5
/
create_words.cpp
123 lines (115 loc) · 4.75 KB
/
create_words.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#include <array>
#include <atomic>
#include <random>
#include <string>
#include <string_view>
//---------------------------------------------------------------------------
#include <udo/UDOperator.hpp>
//---------------------------------------------------------------------------
using namespace std;
using namespace std::literals::string_view_literals;
//---------------------------------------------------------------------------
/// The base phrases that will be selected randomly. Despite the name, every
/// entry is a whole topic phrase and not a single word. The phrases are taken
/// from "Topics of Interest" at http://vldb.org/pvldb/vol15-contributions/
/// CreateWords::postProduce picks an entry by a uniformly distributed index in
/// [0, words.size() - 1], so every entry is equally likely to be chosen.
static constexpr array words = {
"Data Mining and Analytics"sv,
"Data Warehousing, OLAP, Parallel and Distributed Data Mining"sv,
"Mining and Analytics for Scientific and Business data, Social Networks, Time Series, Streams, Text, Web, Graphs, Rules, Patterns, Logs, and Spatio-temporal Data"sv,
"Data Privacy and Security"sv,
"Blockchain"sv,
"Access Control and Privacy"sv,
"Database Engines"sv,
"Access Methods, Concurrency Control, Recovery and Transactions"sv,
"Hardware Accelerators"sv,
"Query Processing and Optimization"sv,
"Storage Management, Multi-core Databases, In-memory Data Management"sv,
"Views, Indexing and Search"sv,
"Database Performance"sv,
"Tuning, Benchmarking and Performance Measurement"sv,
"Administration and Manageability"sv,
"Distributed Database Systems"sv,
"Content Delivery Networks, Database-as-a-service, and Resource Management"sv,
"Cloud Data Management"sv,
"Distributed Analytics"sv,
"Distributed Transactions"sv,
"Graphs, Networks, and Semistructured Data"sv,
"Graph Data Management, Recommendation Systems, Social Networks"sv,
"Hierarchical, Non-relational, and other Modern Data Models"sv,
"Information Integration and Data Quality"sv,
"Data Cleaning, Data Discovery and Data Exploration"sv,
"Heterogeneous and Federated DBMS, Metadata Management"sv,
"Web Data Management and Semantic Web"sv,
"Knowledge Graphs and Knowledge Management"sv,
"Languages"sv,
"Data Models and Query Languages"sv,
"Schema Management and Design"sv,
"Machine Learning, AI and Databases"sv,
"Data Management Issues and Support for Machine Learning and AI"sv,
"Machine Learning and Applied AI for Data Management"sv,
"Novel DB Architectures"sv,
"Embedded and Mobile Databases"sv,
"Data management on novel hardware"sv,
"Real-time databases, Sensors and IoT, Stream Databases"sv,
"Crowd-sourcing"sv,
"Provenance and Workflows"sv,
"Profile-based and Context-Aware Data Management"sv,
"Process Mining"sv,
"Provenance analytics"sv,
"Debugging"sv,
"Specialized and Domain-Specific Data Management"sv,
"Spatial Databases and Temporal Databases"sv,
"Crowdsourcing"sv,
"Ethical Data Management"sv,
"Fuzzy, Probabilistic and Approximate Data"sv,
"Image and Multimedia Databases"sv,
"Scientific and Medical Data Management"sv,
"Text, Semi-Structured Data, and IR"sv,
"Information Retrieval"sv,
"Text in Databases"sv,
"Data Extraction"sv,
"User Interfaces"sv,
"Database Usability"sv,
"Database support for Visual Analytics"sv,
"Visualization"sv,
};
//---------------------------------------------------------------------------
/// The output of this operator. It is used as the output tuple type of
/// CreateWords and carries exactly one string attribute.
struct Output {
/// The generated string: a randomly chosen entry of `words` with a random
/// number prepended and appended, separated by single spaces
udo::String word;
};
//---------------------------------------------------------------------------
/// Operator without input that generates `numWords` pseudo-random strings.
/// Every string is a randomly chosen entry of `words` with a random 32-bit
/// number prepended and appended, separated by single spaces.
class CreateWords : public udo::UDOperator<udo::EmptyTuple, Output> {
   private:
   /// The total number of words that should be generated
   uint64_t numWords;
   /// The counter to track the number of words that were generated. It is
   /// advanced by a whole batch on every call of postProduce, so its final
   /// value can be larger than numWords.
   atomic<uint64_t> wordCount = 0;

   public:
   /// Constructor
   /// @param numWords the total number of words that should be generated
   explicit CreateWords(uint64_t numWords) : numWords(numWords) {}

   /// Produce the output. Every call claims the next batch of word positions
   /// from the atomic counter and emits one output tuple per claimed position.
   /// Because the claim is a single atomic fetch_add, two calls never receive
   /// the same batch.
   /// @return true if all words had already been claimed (this call emitted
   ///         nothing), false if this call processed a batch.
   ///         NOTE(review): presumably `true` tells the UDO runtime that
   ///         production is finished - confirm against udo/UDOperator.hpp.
   bool postProduce(LocalState& /*localState*/) {
      // The number of words claimed by one call. The counter increment and
      // the loop bound below have to use the same value, otherwise words
      // would be skipped or generated twice.
      constexpr uint64_t batchSize = 10000;

      const uint64_t batchBegin = wordCount.fetch_add(batchSize);
      if (batchBegin >= numWords)
         return true;

      // The last batch may be shorter than batchSize. The subtraction cannot
      // underflow because batchBegin < numWords was checked above.
      const uint64_t remainingWords = numWords - batchBegin;
      const uint64_t batchLength = (remainingWords < batchSize) ? remainingWords : batchSize;

      // Seed the generator with the start of the batch so that the content of
      // a batch only depends on its position and not on the caller.
      const uint64_t seed = 42 + batchBegin;
      mt19937_64 gen(seed);
      uniform_int_distribution<size_t> wordIndexDistr(0, words.size() - 1);
      uniform_int_distribution<uint32_t> randomNumberDistr;

      // The string is declared outside of the loop and cleared for every word
      // so that its buffer can be reused instead of being reallocated.
      string word;
      for (uint64_t i = 0; i < batchLength; ++i) {
         // The order of the three random draws (index, prefix, suffix) must
         // not change, otherwise the generated data changes.
         const string_view baseWord = words[wordIndexDistr(gen)];
         // Add a random number as prefix and suffix to the string so that it's
         // not just a bunch of identical strings.
         word.clear();
         word += to_string(randomNumberDistr(gen));
         word += ' ';
         word += baseWord;
         word += ' ';
         word += to_string(randomNumberDistr(gen));

         // The output refers to the local buffer, which stays valid for the
         // duration of the produceOutputTuple call.
         Output output;
         output.word = string_view(word);
         produceOutputTuple(output);
      }
      return false;
   }
};
//---------------------------------------------------------------------------