-
Notifications
You must be signed in to change notification settings - Fork 185
/
query_model_other.py
executable file
·152 lines (132 loc) · 7.94 KB
/
query_model_other.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import numpy as np
from .aad_globals import QUERY_EUCLIDEAN_DIST_MIN
from ..common.utils import get_sample_feature_ranges, append, logger
from .query_model import Query, get_first_vals_not_marked
from .query_model_euclidean import filter_by_euclidean_distance
from .forest_aad_detector import is_forest_detector
from .forest_description import (
get_regions_for_description, get_region_volumes, get_compact_regions, get_region_memberships
)
"""
Custom Query models kept here to decouple the base APIs from custom APIs.
"""
class QueryTopDiverseSubspace(Query):
    """Batch query strategy that prefers instances from different regions.

    get_next_query() takes the first ``opts.n_explore`` not-yet-queried
    instances, derives a set of compact regions (subspaces) for them from a
    forest-based model, and then greedily picks a batch whose members overlap
    as little as possible in region membership (see filter_by_diversity()).
    """

    def __init__(self, opts=None, **kwargs):
        """Initialize the query model.

        :param opts: options object
            Forwarded to Query.__init__(). This class reads the attributes
            n_explore, num_query_batch, detector_type, describe_n_top and
            describe_volume_p from it.
        :param kwargs: accepted for signature compatibility; not used here.
        """
        Query.__init__(self, opts)
        # When True, candidates are first passed through
        # filter_by_euclidean_distance() before the region-based selection.
        self.order_by_euclidean_diversity = False

    def update_query_state(self, **kwargs):
        """Intentionally a no-op; nothing is updated by this implementation."""
        pass

    def filter_by_diversity(self, instance_ids, region_memberships, queried=None, n_select=3):
        """ Return the n most diverse instances

        Greedy selection: at each step the remaining instance whose region
        memberships overlap least (smallest dot product) with the regions of
        everything selected/queried so far is chosen. Ties go to the instance
        that appears earliest in instance_ids.

        :param instance_ids: np.array(int)
            The indexes to instances in the order they appear in the training dataset.
            Row i of region_memberships must describe instance_ids[i].
        :param region_memberships: np.ndarray(int)
            Bit-map of region memberships
        :param queried: np.array(int)
            The instance indexes (might be included in instance_ids) whose labels have
            already been queried and therefore should be ignored in output
        :param n_select: int
            Maximum number of instances to output
        :return: list
            At most n_select entries taken from instance_ids, in the order
            in which they were selected.
        """
        regions = region_memberships
        # Per-region count of how many selected (or previously queried)
        # instances are members of that region.
        selected_regions = np.zeros(regions.shape[1])
        selected_instances = list()

        if queried is not None and len(queried) > 0:
            queried_set = set(queried)
            ignore_indexes = [i for i, inst in enumerate(instance_ids) if inst in queried_set]
            if len(ignore_indexes) > 0:
                # Already-queried instances still "occupy" their regions, so
                # record their memberships before dropping them from the
                # candidate pool.
                selected_regions = selected_regions + regions[ignore_indexes, :].sum(axis=0)
                regions = np.delete(regions, ignore_indexes, axis=0)
                instance_ids = np.delete(instance_ids, ignore_indexes)

        curr_inst_ids = instance_ids
        while len(selected_instances) < n_select and regions.shape[0] > 0:
            # Overlap of every remaining candidate with the regions used so far.
            overlaps = np.dot(regions, selected_regions)
            # np.argmin returns the FIRST minimal index, i.e. the same element
            # a stable sort would put first, without sorting the whole array.
            # The original author notes that instance_ids arrive ordered by
            # anomaly score, so ties resolve towards the earlier entry.
            best = int(np.argmin(overlaps))
            selected_instances.append(curr_inst_ids[best])
            selected_regions = selected_regions + regions[best, :]
            curr_inst_ids = np.delete(curr_inst_ids, best)
            regions = np.delete(regions, best, axis=0)
        return selected_instances

    def get_next_query(self, **kwargs):
        """Select the next batch of instances to query.

        Recognized keyword arguments:

        :param ordered_indexes: instance indexes passed to
            get_first_vals_not_marked() as the candidate ordering.
        :param queried_items: instance indexes that were already queried;
            excluded from the candidates and from the output.
        :param remaining_budget: int or None
            Upper bound on the batch size. When absent/None only
            opts.num_query_batch limits the batch size.
        :param ensemble: object whose ``samples`` (and, in the disabled
            consider_queried path, ``labels``) attributes are read.
        :param feature_ranges: optional; computed from ensemble.samples with
            get_sample_feature_ranges() when None.
        :param model: forwarded to the region helper functions.
        :return: list of selected instance indexes, or None when there are no
            unqueried candidates.
        :raises ValueError: if opts.n_explore < opts.num_query_batch.
        :raises RuntimeError: if opts.detector_type is not a forest detector.
        """
        # Disabled: when enabled, (a subsample of) the already-queried
        # instances labeled 1 are added to the instances used for computing
        # the regions.
        consider_queried = False

        ordered_indexes = kwargs.get("ordered_indexes")
        queried_items = kwargs.get("queried_items")
        remaining_budget = kwargs.get("remaining_budget")
        ensemble = kwargs.get("ensemble")
        feature_ranges = kwargs.get("feature_ranges")
        model = kwargs.get("model")

        if self.opts.n_explore < self.opts.num_query_batch:
            raise ValueError("Error: option n_explore (%d) less than n_batch (%d)" %
                             (self.opts.n_explore, self.opts.num_query_batch))

        items = get_first_vals_not_marked(ordered_indexes, queried_items, start=0,
                                          n=self.opts.n_explore)
        if len(items) == 0:
            return None

        if not is_forest_detector(self.opts.detector_type):
            raise RuntimeError("QueryTopDiverseSubspace supported only for forest-based models")

        # feature_ranges will be used to compute volumes
        if feature_ranges is None:
            feature_ranges = get_sample_feature_ranges(ensemble.samples)

        instance_ids = items
        queried_anom_indexes = None
        if consider_queried and queried_items is not None and len(queried_items) > 0:
            queried_items = np.array(queried_items, dtype=int)
            # Filter only the labeled anomalies for diversity computation
            queried_anom_indexes = np.where(ensemble.labels[queried_items] == 1)[0]
            if len(queried_anom_indexes) > 0:
                if len(queried_anom_indexes) > 50:
                    # The ILP to select compact regions could be expensive.
                    # Therefore, take a subsample.
                    np.random.shuffle(queried_anom_indexes)
                    queried_anom_indexes = queried_anom_indexes[0:50]
                instance_ids = append(instance_ids, queried_items[queried_anom_indexes])

        reg_idxs = get_regions_for_description(ensemble.samples, instance_indexes=instance_ids,
                                               model=model, n_top=self.opts.describe_n_top)
        volumes = get_region_volumes(model, reg_idxs, feature_ranges)
        compact_reg_idxs = get_compact_regions(ensemble.samples, instance_indexes=instance_ids,
                                               region_indexes=reg_idxs, model=model, volumes=volumes,
                                               p=self.opts.describe_volume_p)
        logger.debug("#reg_idxs:%d, #compact regions:%d, #queried_anoms: %d, p: %d, n_top: %d" %
                     (len(reg_idxs), len(compact_reg_idxs),
                      0 if queried_anom_indexes is None else len(queried_anom_indexes),
                      self.opts.describe_volume_p, self.opts.describe_n_top))

        instance_ids, region_memberships = get_region_memberships(ensemble.samples,
                                                                  instance_indexes=instance_ids,
                                                                  region_indexes=compact_reg_idxs,
                                                                  model=model)

        if self.order_by_euclidean_diversity:
            # arrange instance_ids by euclidean diversity first
            # NOTE(review): filter_by_diversity() pairs entry i of the ids it
            # receives with row i of region_memberships. If
            # filter_by_euclidean_distance() returns the ids in a different
            # order than instance_ids, the rows no longer line up -- confirm
            # against that helper and reorder region_memberships if needed.
            init_ordered_items = filter_by_euclidean_distance(ensemble.samples, instance_ids,
                                                              n_select=len(instance_ids),
                                                              dist_type=QUERY_EUCLIDEAN_DIST_MIN)
            logger.debug("\ninstance_ids:\n%s\ninit_ordered_items:\n%s" % (str(instance_ids), str(init_ordered_items)))
        else:
            init_ordered_items = instance_ids

        # A missing budget must not crash min(); fall back to the batch size.
        n_select = self.opts.num_query_batch
        if remaining_budget is not None:
            n_select = min(remaining_budget, n_select)

        filtered_items = self.filter_by_diversity(init_ordered_items, region_memberships,
                                                  queried=queried_items,
                                                  n_select=n_select)
        return filtered_items