Merge pull request #635 from QData/hard-label-attack

hard label classification
QData · Sep 11, 2023 · f848247 · f848247
2 parents cab4e0f + 87c4671
commit f848247
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 4 deletions.
diff --git a/textattack/attack_args.py b/textattack/attack_args.py
@@ -112,6 +112,7 @@
     #
     # Classification goal functions
     #
+    "hardlabel-classification": "textattack.goal_functions.classification.HardLabelClassification",
     "targeted-classification": "textattack.goal_functions.classification.TargetedClassification",
     "untargeted-classification": "textattack.goal_functions.classification.UntargetedClassification",
     "input-reduction": "textattack.goal_functions.classification.InputReduction",
@@ -127,15 +128,13 @@
 @dataclass
 class AttackArgs:
     """Attack arguments to be passed to :class:`~textattack.Attacker`.
-
     Args:
         num_examples (:obj:`int`, 'optional`, defaults to :obj:`10`):
             The number of examples to attack. :obj:`-1` for entire dataset.
         num_successful_examples (:obj:`int`, `optional`, defaults to :obj:`None`):
             The number of successful adversarial examples we want. This is different from :obj:`num_examples`
             as :obj:`num_examples` only cares about attacking `N` samples while :obj:`num_successful_examples` aims to keep attacking
             until we have `N` successful cases.
-
             .. note::
                 If set, this argument overrides `num_examples` argument.
         num_examples_offset (:obj: `int`, `optional`, defaults to :obj:`0`):
@@ -149,7 +148,6 @@ class AttackArgs:
         query_budget (:obj:`int`, `optional`, defaults to :obj:`None`):
             The maximum number of model queries allowed per example attacked.
             If not set, we use the query budget set in the :class:`~textattack.goal_functions.GoalFunction` object (which by default is :obj:`float("inf")`).
-
             .. note::
                 Setting this overwrites the query budget set in :class:`~textattack.goal_functions.GoalFunction` object.
         checkpoint_interval (:obj:`int`, `optional`, defaults to :obj:`None`):
@@ -468,7 +466,6 @@ def create_loggers_from_args(cls, args):
 class _CommandLineAttackArgs:
     """Attack args for command line execution. This requires more arguments to
     create ``Attack`` object as specified.
-
     Args:
         transformation (:obj:`str`, `optional`, defaults to :obj:`"word-swap-embedding"`):
             Name of transformation to use.

diff --git a/textattack/goal_functions/classification/hardlabel_classification.py b/textattack/goal_functions/classification/hardlabel_classification.py
@@ -0,0 +1,41 @@
+"""
+Determine if an attack has been successful in Hard Label Classficiation.
+----------------------------------------------------
+"""
+
+
+from .classification_goal_function import ClassificationGoalFunction
+
+
+class HardLabelClassification(ClassificationGoalFunction):
+    """An hard label attack on classification models which attempts to maximize
+    the semantic similarity of the label such that the target is outside of the
+    decision boundary.
+
+    Args:
+        target_max_score (float): If set, goal is to reduce model output to
+            below this score. Otherwise, goal is to change the overall predicted
+            class.
+    """
+
+    def __init__(self, *args, target_max_score=None, **kwargs):
+        self.target_max_score = target_max_score
+        super().__init__(*args, **kwargs)
+
+    def _is_goal_complete(self, model_output, _):
+        if self.target_max_score:
+            return model_output[self.ground_truth_output] < self.target_max_score
+        elif (model_output.numel() == 1) and isinstance(
+            self.ground_truth_output, float
+        ):
+            return abs(self.ground_truth_output - model_output.item()) >= 0.5
+        else:
+            return model_output.argmax() != self.ground_truth_output
+
+    def _get_score(self, model_output, _):
+        # If the model outputs a single number and the ground truth output is
+        # a float, we assume that this is a regression task.
+        if (model_output.numel() == 1) and isinstance(self.ground_truth_output, float):
+            return max(model_output.item(), self.ground_truth_output)
+        else:
+            return 1 - model_output[self.ground_truth_output]