diff --git a/docs/1start/multilingual-visualization.md b/docs/1start/multilingual-visualization.md
index e94003718..ef76c1109 100644
--- a/docs/1start/multilingual-visualization.md
+++ b/docs/1start/multilingual-visualization.md
@@ -19,6 +19,8 @@ TextAttack Extended Functions (Multilingual)
 
 - see example code for using our framework to attack French-BERT: [https://github.com/QData/TextAttack/blob/master/examples/attack/attack_camembert.py](https://github.com/QData/TextAttack/blob/master/examples/attack/attack_camembert.py) .
 
+- see the tutorial notebook for using our framework to attack a Chinese NLP model: [https://textattack.readthedocs.io/en/latest/2notebook/Example_6_Chinese_Attack.html](https://textattack.readthedocs.io/en/latest/2notebook/Example_6_Chinese_Attack.html)
+
 
 ## User defined custom inputs and models
 
diff --git a/docs/1start/references.md b/docs/1start/references.md
index 803d73f34..95534a18a 100644
--- a/docs/1start/references.md
+++ b/docs/1start/references.md
@@ -63,3 +63,51 @@ How to Cite TextAttack
 primaryClass={cs.CL}
 }
 ```
+
+
+## Our defense paper: "Towards Improving Adversarial Training of NLP Models"
+
+
+- Abstract: Adversarial training, a method for learning robust deep neural networks, constructs adversarial examples during training. However, recent methods for generating NLP adversarial examples involve combinatorial search and expensive sentence encoders for constraining the generated instances. As a result, it remains challenging to use vanilla adversarial training to improve NLP models' performance, and the benefits are mainly uninvestigated. This paper proposes a simple and improved vanilla adversarial training process for NLP models, which we name Attacking to Training (A2T). The core part of A2T is a new and cheaper word substitution attack optimized for vanilla adversarial training. We use A2T to train BERT and RoBERTa models on IMDB, Rotten Tomatoes, Yelp, and SNLI datasets. Our results empirically show that it is possible to train robust NLP models using a much cheaper adversary. We demonstrate that vanilla adversarial training with A2T can improve an NLP model's robustness to the attack it was originally trained with and also defend the model against other types of word substitution attacks. Furthermore, we show that A2T can improve NLP models' standard accuracy, cross-domain generalization, and interpretability.
+
+
+### Code is available
+
+We share all code for this defense analysis at [https://github.com/QData/Textattack-A2T](https://github.com/QData/Textattack-A2T).
+
+
+### Citations:
+```
+@misc{yoo2021improving,
+      title={Towards Improving Adversarial Training of NLP Models},
+      author={Jin Yong Yoo and Yanjun Qi},
+      year={2021},
+      eprint={2109.00544},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+
+## Our extended use case paper: "Expanding Scope: Adapting English Adversarial Attacks to Chinese"
+
+
+### Abstract:
+Recent studies have revealed that NLP predictive models are vulnerable to adversarial attacks. Most existing studies focused on designing attacks to evaluate the robustness of NLP models in the English language alone. Literature has seen an increasing need for NLP solutions for other languages. We, therefore, ask one natural question: whether state-of-the-art (SOTA) attack methods generalize to other languages. This paper investigates how to adapt SOTA adversarial attack algorithms in English to the Chinese language. Our experiments show that attack methods previously applied to English NLP can generate high-quality adversarial examples in Chinese when combined with proper text segmentation and linguistic constraints. In addition, we demonstrate that the generated adversarial examples can achieve high fluency and semantic consistency by focusing on the Chinese language's morphology and phonology, which in turn can be used to improve the adversarial robustness of Chinese NLP models.
+
+### Venue:
+TrustNLP: Third Workshop on Trustworthy Natural Language Processing, colocated with the Annual Conference of the Association for Computational Linguistics
+
+### Tutorial code:
+See the tutorial notebook for using our framework to attack a Chinese NLP model: [https://textattack.readthedocs.io/en/latest/2notebook/Example_6_Chinese_Attack.html](https://textattack.readthedocs.io/en/latest/2notebook/Example_6_Chinese_Attack.html)
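+
+A minimal sketch of the notebook's attack setup (the model checkpoint and parameters are the ones used in that notebook; dataset loading is omitted here):
+
+```python
+import transformers
+
+from textattack import Attack, Attacker, AttackArgs
+from textattack.constraints.pre_transformation import (
+    RepeatModification,
+    StopwordModification,
+)
+from textattack.goal_functions import UntargetedClassification
+from textattack.models.wrappers import HuggingFaceModelWrapper
+from textattack.search_methods import GreedyWordSwapWIR
+from textattack.transformations import ChineseHomophoneCharacterSwap
+
+# Wrap a fine-tuned Chinese BERT sentiment classifier, as in the notebook.
+name = "Raychanan/bert-base-chinese-FineTuned-Binary-Best"
+tokenizer = transformers.AutoTokenizer.from_pretrained(name)
+model = transformers.AutoModelForSequenceClassification.from_pretrained(name)
+model_wrapper = HuggingFaceModelWrapper(model, tokenizer)
+
+# Untargeted attack that swaps characters for homophones, applied greedily in
+# word-importance order; repeated words and stopwords are left unmodified.
+attack = Attack(
+    UntargetedClassification(model_wrapper, query_budget=10000),
+    [RepeatModification(), StopwordModification()],
+    ChineseHomophoneCharacterSwap(),
+    GreedyWordSwapWIR(),
+)
+
+# `dataset` would be a textattack.datasets.Dataset of (text, label) pairs:
+# Attacker(attack, dataset, AttackArgs(num_examples=10)).attack_dataset()
+```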
+
+
+### Citations:
+```
+@article{liu2023expanding,
+  title={Expanding Scope: Adapting English Adversarial Attacks to Chinese},
+  author={Liu, Hanyu and Cai, Chengyuan and Qi, Yanjun},
+  journal={arXiv preprint arXiv:2306.04874},
+  year={2023}
+}
+```
+
diff --git a/docs/2notebook/Example_6_Chinese Attack.ipynb b/docs/2notebook/Example_6_Chinese Attack.ipynb
deleted file mode 100644
index 6363dfb57..000000000
--- a/docs/2notebook/Example_6_Chinese Attack.ipynb
+++ /dev/null
@@ -1,590 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "xK7B3NnYaPR6"
-   },
-   "source": [
-    "# Attacking Chinese Models"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/Users/ccy/Documents/GitHub/TextAttackqdata/TextAttack\n"
-     ]
-    }
-   ],
-   "source": [
-    "cd ../.."
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing /Users/ccy/Documents/GitHub/TextAttackqdata/TextAttack\n", - "Requirement already satisfied: bert-score>=0.3.5 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (0.3.7)\n", - "Requirement already satisfied: editdistance in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (0.5.3)\n", - "Requirement already satisfied: flair in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (0.9)\n", - "Requirement already satisfied: filelock in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (3.0.12)\n", - "Requirement already satisfied: language_tool_python in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (2.4.7)\n", - "Requirement already satisfied: lemminflect in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (0.2.1)\n", - "Requirement already satisfied: lru-dict in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (1.1.6)\n", - "Requirement already satisfied: datasets in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (1.1.3)\n", - "Requirement already satisfied: nltk in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (3.5)\n", - "Requirement already satisfied: numpy<1.19.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (1.18.5)\n", - "Requirement already satisfied: pandas>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (1.2.0)\n", - "Requirement already satisfied: scipy==1.4.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (1.4.1)\n", - "Requirement already satisfied: torch!=1.8,>=1.7.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (1.9.0)\n", - "Requirement already satisfied: transformers>=3.3.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (4.1.1)\n", - "Requirement already satisfied: terminaltables in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (3.1.0)\n", - "Requirement already satisfied: tqdm<4.50.0,>=4.27 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (4.49.0)\n", - "Requirement already satisfied: word2number in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (1.1)\n", - "Requirement already satisfied: num2words in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (0.5.10)\n", - "Requirement already satisfied: more-itertools in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (8.8.0)\n", - "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (1.7.1)\n", - 
"Requirement already satisfied: pywordseg==0.1.4 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (0.1.4)\n", - "Requirement already satisfied: pinyin==0.4.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from textattack==0.3.0) (0.4.0)\n", - "Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from bert-score>=0.3.5->textattack==0.3.0) (2.25.1)\n", - "Requirement already satisfied: matplotlib in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from bert-score>=0.3.5->textattack==0.3.0) (3.3.3)\n", - "Requirement already satisfied: huggingface-hub in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (0.1.2)\n", - "Requirement already satisfied: regex in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (2020.11.13)\n", - "Requirement already satisfied: conllu>=4.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (4.4.1)\n", - "Requirement already satisfied: wikipedia-api in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (0.5.4)\n", - "Requirement already satisfied: gdown==3.12.2 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (3.12.2)\n", - "Requirement already satisfied: bpemb>=0.3.2 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (0.3.2)\n", - "Requirement already satisfied: hyperopt>=0.1.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (0.2.5)\n", - "Requirement already satisfied: sqlitedict>=1.6.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (1.7.0)\n", - "Requirement already satisfied: janome in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (0.4.1)\n", - "Requirement already satisfied: ftfy in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (5.8)\n", - "Requirement already satisfied: konoha<5.0.0,>=4.0.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (4.6.2)\n", - "Requirement already satisfied: deprecated>=1.2.4 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (1.2.10)\n", - "Requirement already satisfied: tabulate in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (0.8.7)\n", - "Requirement already satisfied: scikit-learn>=0.21.3 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (0.24.0)\n", - "Requirement already satisfied: python-dateutil>=2.6.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (2.8.2)\n", - "Requirement already satisfied: mpld3==0.3 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (0.3)\n", - "Requirement already satisfied: gensim<=3.8.3,>=3.4.0 in 
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (3.8.3)\n", - "Requirement already satisfied: lxml in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (4.6.2)\n", - "Requirement already satisfied: sentencepiece==0.1.95 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (0.1.95)\n", - "Requirement already satisfied: segtok>=1.5.7 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (1.5.10)\n", - "Requirement already satisfied: langdetect in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from flair->textattack==0.3.0) (1.0.8)\n", - "Requirement already satisfied: pyarrow>=0.17.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from datasets->textattack==0.3.0) (3.0.0)\n", - "Requirement already satisfied: dill in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from datasets->textattack==0.3.0) (0.3.3)\n", - "Requirement already satisfied: xxhash in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from datasets->textattack==0.3.0) (2.0.0)\n", - "Requirement already satisfied: multiprocess in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from datasets->textattack==0.3.0) (0.70.11.1)\n", - "Requirement already satisfied: click in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from nltk->textattack==0.3.0) (7.1.2)\n", - "Requirement already satisfied: joblib in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from nltk->textattack==0.3.0) (1.0.0)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: pytz>=2017.3 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from pandas>=1.0.1->textattack==0.3.0) (2020.5)\n", - "Requirement already satisfied: typing-extensions in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from torch!=1.8,>=1.7.0->textattack==0.3.0) (3.7.4.3)\n", - "Requirement already satisfied: sacremoses in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from transformers>=3.3.0->textattack==0.3.0) (0.0.43)\n", - "Requirement already satisfied: tokenizers==0.9.4 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from transformers>=3.3.0->textattack==0.3.0) (0.9.4)\n", - "Requirement already satisfied: packaging in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from transformers>=3.3.0->textattack==0.3.0) (21.2)\n", - "Requirement already satisfied: docopt>=0.6.2 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from num2words->textattack==0.3.0) (0.6.2)\n", - "Requirement already satisfied: overrides==1.9 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from pywordseg==0.1.4->textattack==0.3.0) (1.9)\n", - "Requirement already satisfied: h5py in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from pywordseg==0.1.4->textattack==0.3.0) (2.10.0)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from 
requests->bert-score>=0.3.5->textattack==0.3.0) (1.24.3)\n", - "Requirement already satisfied: idna<3,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from requests->bert-score>=0.3.5->textattack==0.3.0) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from requests->bert-score>=0.3.5->textattack==0.3.0) (2020.12.5)\n", - "Requirement already satisfied: chardet<5,>=3.0.2 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from requests->bert-score>=0.3.5->textattack==0.3.0) (4.0.0)\n", - "Requirement already satisfied: cycler>=0.10 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from matplotlib->bert-score>=0.3.5->textattack==0.3.0) (0.10.0)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from matplotlib->bert-score>=0.3.5->textattack==0.3.0) (2.4.7)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from matplotlib->bert-score>=0.3.5->textattack==0.3.0) (1.3.1)\n", - "Requirement already satisfied: pillow>=6.2.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from matplotlib->bert-score>=0.3.5->textattack==0.3.0) (8.0.1)\n", - "Requirement already satisfied: pyyaml in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from huggingface-hub->flair->textattack==0.3.0) (5.3.1)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from huggingface-hub->flair->textattack==0.3.0) (3.10.1)\n", - "Requirement already satisfied: six in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from gdown==3.12.2->flair->textattack==0.3.0) (1.15.0)\n", - "Requirement already satisfied: networkx>=2.2 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from hyperopt>=0.1.1->flair->textattack==0.3.0) (2.5)\n", - "Requirement already satisfied: cloudpickle in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from hyperopt>=0.1.1->flair->textattack==0.3.0) (1.6.0)\n", - "Requirement already satisfied: future in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from hyperopt>=0.1.1->flair->textattack==0.3.0) (0.18.2)\n", - "Requirement already satisfied: wcwidth in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from ftfy->flair->textattack==0.3.0) (0.2.5)\n", - "Requirement already satisfied: wrapt<2,>=1.10 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from deprecated>=1.2.4->flair->textattack==0.3.0) (1.12.1)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from scikit-learn>=0.21.3->flair->textattack==0.3.0) (2.1.0)\n", - "Requirement already satisfied: smart-open>=1.8.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from gensim<=3.8.3,>=3.4.0->flair->textattack==0.3.0) (4.1.0)\n", - "Requirement already satisfied: zipp>=0.5 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from importlib-metadata; python_version 
< \"3.8\"->huggingface-hub->flair->textattack==0.3.0) (3.4.0)\n", - "Requirement already satisfied: decorator>=4.3.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from networkx>=2.2->hyperopt>=0.1.1->flair->textattack==0.3.0) (4.4.2)\n", - "Building wheels for collected packages: textattack\n", - " Building wheel for textattack (setup.py) ... \u001b[?25ldone\n", - "\u001b[?25h Created wheel for textattack: filename=textattack-0.3.0-py3-none-any.whl size=361956 sha256=73a4428fde6a96cc8009965a00a7a6ef20abc06202e91eaaf4e03823368f4d9a\n", - " Stored in directory: /private/var/folders/fy/b8pxlc0d1hbbs54f6fy9wd8h0000gn/T/pip-ephem-wheel-cache-rijvyn7u/wheels/21/34/eb/f0c01bff3429818e44c0d5cd0d06a65a13cdc1a6ee894221ba\n", - "Successfully built textattack\n", - "Installing collected packages: textattack\n", - " Attempting uninstall: textattack\n", - " Found existing installation: textattack 0.3.0\n", - " Uninstalling textattack-0.3.0:\n", - " Successfully uninstalled textattack-0.3.0\n", - "Successfully installed textattack-0.3.0\n", - "\u001b[33mWARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.\n", - "You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.7/bin/python3.7 -m pip install --upgrade pip' command.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip3 install ." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from textattack.transformations import WordSwap\n", - "import transformers\n", - "import string" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "textattack: Unknown if model of class compatible with goal function .\n", - "Using custom data configuration default\n", - "Reusing dataset csv (/Users/ccy/.cache/huggingface/datasets/csv/default-1fe846e8bbc39aa4/0.0.0/2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2)\n" - ] - } - ], - "source": [ - "#attack example\n", - "import os\n", - "import pandas as pd\n", - "import datasets\n", - "import transformers\n", - "from textattack.models.wrappers import HuggingFaceModelWrapper\n", - "tokenizer = transformers.AutoTokenizer.from_pretrained(\"Raychanan/bert-base-chinese-FineTuned-Binary-Best\")\n", - "model = transformers.AutoModelForSequenceClassification.from_pretrained(\"Raychanan/bert-base-chinese-FineTuned-Binary-Best\")\n", - "\n", - "model_wrapper = HuggingFaceModelWrapper(model, tokenizer)\n", - "\n", - "from textattack.goal_functions import UntargetedClassification\n", - "goal_function = UntargetedClassification(model_wrapper, query_budget=10000)\n", - "\n", - "from textattack.datasets import HuggingFaceDataset\n", - "\n", - "#get demo dataset path\n", - "path = os.path.abspath('')\n", - "\n", - "path_list = path.split(os.sep)\n", - "path_list.append('examples/dataset/chinese_data_demo.tsv')\n", - "demo_data_path = os.path.join(\"/\", *path_list)\n", - "\n", - "dataset = datasets.load_dataset('csv', data_files=demo_data_path, delimiter=\"\\t\")[\"train\"]\n", - "\n", - "dataset = HuggingFaceDataset(\n", - " dataset,\n", - "# lang=\"zh\",\n", - " dataset_columns=([\"text\"], \"label\"),\n", - " label_names=[\"Negative\", \"Positive\"]\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "nSAHSoI_aPSO" - }, - "outputs": [], - "source": [ - "from textattack.search_methods import GreedyWordSwapWIR\n", - "from 
textattack.transformations import ChineseWordSwapHowNet\n", - "from textattack.transformations import ChineseHomophoneCharacterSwap\n", - "from textattack.constraints.pre_transformation import RepeatModification, StopwordModification\n", - "from textattack import Attack\n", - "\n", - "transformation = ChineseHomophoneCharacterSwap()\n", - "\n", - "stopwords = set(\n", - " [\"、\", \"。\", \"〈\", \"〉\", \"《\", \"》\", \"一\", \"一个\", \"一些\", \"一何\", \"一切\", \"一则\", \"一方面\", \"一旦\", \"一来\", \"一样\", \"一种\", \"一般\", \"一转眼\", \"七\", \"万一\", \"三\", \"上\", \"上下\", \"下\", \"不\", \"不仅\", \"不但\", \"不光\", \"不单\", \"不只\", \"不外乎\", \"不如\", \"不妨\", \"不尽\", \"不尽然\", \"不得\", \"不怕\", \"不惟\", \"不成\", \"不拘\", \"不料\", \"不是\", \"不比\", \"不然\", \"不特\", \"不独\", \"不管\", \"不至于\", \"不若\", \"不论\", \"不过\", \"不问\", \"与\", \"与其\", \"与其说\", \"与否\", \"与此同时\", \"且\", \"且不说\", \"且说\", \"两者\", \"个\", \"个别\", \"中\", \"临\", \"为\", \"为了\", \"为什么\", \"为何\", \"为止\", \"为此\", \"为着\", \"乃\", \"乃至\", \"乃至于\", \"么\", \"之\", \"之一\", \"之所以\", \"之类\", \"乌乎\", \"乎\", \"乘\", \"九\", \"也\", \"也好\", \"也罢\", \"了\", \"二\", \"二来\", \"于\", \"于是\", \"于是乎\", \"云云\", \"云尔\", \"五\", \"些\", \"亦\", \"人\", \"人们\", \"人家\", \"什\", \"什么\", \"什么样\", \"今\", \"介于\", \"仍\", \"仍旧\", \"从\", \"从此\", \"从而\", \"他\", \"他人\", \"他们\", \"他们们\", \"以\", \"以上\", \"以为\", \"以便\", \"以免\", \"以及\", \"以故\", \"以期\", \"以来\", \"以至\", \"以至于\", \"以致\", \"们\", \"任\", \"任何\", \"任凭\", \"会\", \"似的\", \"但\", \"但凡\", \"但是\", \"何\", \"何以\", \"何况\", \"何处\", \"何时\", \"余外\", \"作为\", \"你\", \"你们\", \"使\", \"使得\", \"例如\", \"依\", \"依据\", \"依照\", \"便于\", \"俺\", \"俺们\", \"倘\", \"倘使\", \"倘或\", \"倘然\", \"倘若\", \"借\", \"借傥然\", \"假使\", \"假如\", \"假若\", \"做\", \"像\", \"儿\", \"先不先\", \"光\", \"光是\", \"全体\", \"全部\", \"八\", \"六\", \"兮\", \"共\", \"关于\", \"关于具体地说\", \"其\", \"其一\", \"其中\", \"其二\", \"其他\", \"其余\", \"其它\", \"其次\", \"具体地说\", \"具体说来\", \"兼之\", \"内\", \"再\", \"再其次\", \"再则\", \"再有\", \"再者\", \"再者说\", \"再说\", \"冒\", \"冲\", \"况且\", \"几\", \"几时\", \"凡\", \"凡是\", \"凭\", \"凭借\", \"出于\", \"出来\", \"分\", \"分别\", \"则\", \"则甚\", \"别\", \"别人\", \"别处\", \"别是\", \"别的\", \"别管\", \"别说\", \"到\", \"前后\", \"前此\", \"前者\", \"加之\", \"加以\", \"区\", \"即\", \"即令\", \"即使\", \"即便\", \"即如\", \"即或\", \"即若\", \"却\", \"去\", \"又\", \"又及\", \"及\", \"及其\", \"及至\", \"反之\", \"反而\", \"反过来\", \"反过来说\", \"受到\", \"另\", \"另一方面\", \"另外\", \"另悉\", \"只\", \"只当\", \"只怕\", \"只是\", \"只有\", \"只消\", \"只要\", \"只限\", \"叫\", \"叮咚\", \"可\", \"可以\", \"可是\", \"可见\", \"各\", \"各个\", \"各位\", \"各种\", \"各自\", \"同\", \"同时\", \"后\", \"后者\", \"向\", \"向使\", \"向着\", \"吓\", \"吗\", \"否则\", \"吧\", \"吧哒\", \"含\", \"吱\", \"呀\", \"呃\", \"呕\", \"呗\", \"呜\", \"呜呼\", \"呢\", \"呵\", \"呵呵\", \"呸\", \"呼哧\", \"咋\", \"和\", \"咚\", \"咦\", \"咧\", \"咱\", \"咱们\", \"咳\", \"哇\", \"哈\", \"哈哈\", \"哉\", \"哎\", \"哎呀\", \"哎哟\", \"哗\", \"哟\", \"哦\", \"哩\", \"哪\", \"哪个\", \"哪些\", \"哪儿\", \"哪天\", \"哪年\", \"哪怕\", \"哪样\", \"哪边\", \"哪里\", \"哼\", \"哼唷\", \"唉\", \"唯有\", \"啊\", \"啐\", \"啥\", \"啦\", \"啪达\", \"啷当\", \"喂\", \"喏\", \"喔唷\", \"喽\", \"嗡\", \"嗡嗡\", \"嗬\", \"嗯\", \"嗳\", \"嘎\", \"嘎登\", \"嘘\", \"嘛\", \"嘻\", \"嘿\", \"嘿嘿\", \"四\", \"因\", \"因为\", \"因了\", \"因此\", \"因着\", \"因而\", \"固然\", \"在\", \"在下\", \"在于\", \"地\", \"基于\", \"处在\", \"多\", \"多么\", \"多少\", \"大\", \"大家\", \"她\", \"她们\", \"好\", \"如\", \"如上\", \"如上所述\", \"如下\", \"如何\", \"如其\", \"如同\", \"如是\", \"如果\", \"如此\", \"如若\", \"始而\", \"孰料\", \"孰知\", \"宁\", \"宁可\", \"宁愿\", \"宁肯\", \"它\", \"它们\", \"对\", \"对于\", \"对待\", \"对方\", \"对比\", \"将\", \"小\", \"尔\", \"尔后\", \"尔尔\", \"尚且\", \"就\", \"就是\", \"就是了\", \"就是说\", \"就算\", \"就要\", \"尽\", \"尽管\", 
\"尽管如此\", \"岂但\", \"己\", \"已\", \"已矣\", \"巴\", \"巴巴\", \"年\", \"并\", \"并且\", \"庶乎\", \"庶几\", \"开外\", \"开始\", \"归\", \"归齐\", \"当\", \"当地\", \"当然\", \"当着\", \"彼\", \"彼时\", \"彼此\", \"往\", \"待\", \"很\", \"得\", \"得了\", \"怎\", \"怎么\", \"怎么办\", \"怎么样\", \"怎奈\", \"怎样\", \"总之\", \"总的来看\", \"总的来说\", \"总的说来\", \"总而言之\", \"恰恰相反\", \"您\", \"惟其\", \"慢说\", \"我\", \"我们\", \"或\", \"或则\", \"或是\", \"或曰\", \"或者\", \"截至\", \"所\", \"所以\", \"所在\", \"所幸\", \"所有\", \"才\", \"才能\", \"打\", \"打从\", \"把\", \"抑或\", \"拿\", \"按\", \"按照\", \"换句话说\", \"换言之\", \"据\", \"据此\", \"接着\", \"故\", \"故此\", \"故而\", \"旁人\", \"无\", \"无宁\", \"无论\", \"既\", \"既往\", \"既是\", \"既然\", \"日\", \"时\", \"时候\", \"是\", \"是以\", \"是的\", \"更\", \"曾\", \"替\", \"替代\", \"最\", \"月\", \"有\", \"有些\", \"有关\", \"有及\", \"有时\", \"有的\", \"望\", \"朝\", \"朝着\", \"本\", \"本人\", \"本地\", \"本着\", \"本身\", \"来\", \"来着\", \"来自\", \"来说\", \"极了\", \"果然\", \"果真\", \"某\", \"某个\", \"某些\", \"某某\", \"根据\", \"欤\", \"正值\", \"正如\", \"正巧\", \"正是\", \"此\", \"此地\", \"此处\", \"此外\", \"此时\", \"此次\", \"此间\", \"毋宁\", \"每\", \"每当\", \"比\", \"比及\", \"比如\", \"比方\", \"没奈何\", \"沿\", \"沿着\", \"漫说\", \"点\", \"焉\", \"然则\", \"然后\", \"然而\", \"照\", \"照着\", \"犹且\", \"犹自\", \"甚且\", \"甚么\", \"甚或\", \"甚而\", \"甚至\", \"甚至于\", \"用\", \"用来\", \"由\", \"由于\", \"由是\", \"由此\", \"由此可见\", \"的\", \"的确\", \"的话\", \"直到\", \"相对而言\", \"省得\", \"看\", \"眨眼\", \"着\", \"着呢\", \"矣\", \"矣乎\", \"矣哉\", \"离\", \"秒\", \"称\", \"竟而\", \"第\", \"等\", \"等到\", \"等等\", \"简言之\", \"管\", \"类如\", \"紧接着\", \"纵\", \"纵令\", \"纵使\", \"纵然\", \"经\", \"经过\", \"结果\", \"给\", \"继之\", \"继后\", \"继而\", \"综上所述\", \"罢了\", \"者\", \"而\", \"而且\", \"而况\", \"而后\", \"而外\", \"而已\", \"而是\", \"而言\", \"能\", \"能否\", \"腾\", \"自\", \"自个儿\", \"自从\", \"自各儿\", \"自后\", \"自家\", \"自己\", \"自打\", \"自身\", \"至\", \"至于\", \"至今\", \"至若\", \"致\", \"般的\", \"若\", \"若夫\", \"若是\", \"若果\", \"若非\", \"莫不然\", \"莫如\", \"莫若\", \"虽\", \"虽则\", \"虽然\", \"虽说\", \"被\", \"要\", \"要不\", \"要不是\", \"要不然\", \"要么\", \"要是\", \"譬喻\", \"譬如\", \"让\", \"许多\", \"论\", \"设使\", \"设或\", \"设若\", \"诚如\", \"诚然\", \"该\", \"说\", \"说来\", \"请\", \"诸\", \"诸位\", \"诸如\", \"谁\", \"谁人\", \"谁料\", \"谁知\", \"贼死\", \"赖以\", \"赶\", \"起\", \"起见\", \"趁\", \"趁着\", \"越是\", \"距\", \"跟\", \"较\", \"较之\", \"边\", \"过\", \"还\", \"还是\", \"还有\", \"还要\", \"这\", \"这一来\", \"这个\", \"这么\", \"这么些\", \"这么样\", \"这么点儿\", \"这些\", \"这会儿\", \"这儿\", \"这就是说\", \"这时\", \"这样\", \"这次\", \"这般\", \"这边\", \"这里\", \"进而\", \"连\", \"连同\", \"逐步\", \"通过\", \"遵循\", \"遵照\", \"那\", \"那个\", \"那么\", \"那么些\", \"那么样\", \"那些\", \"那会儿\", \"那儿\", \"那时\", \"那样\", \"那般\", \"那边\", \"那里\", \"都\", \"鄙人\", \"鉴于\", \"针对\", \"阿\", \"除\", \"除了\", \"除外\", \"除开\", \"除此之外\", \"除非\", \"随\", \"随后\", \"随时\", \"随着\", \"难道说\", \"零\", \"非\", \"非但\", \"非徒\", \"非特\", \"非独\", \"靠\", \"顺\", \"顺着\", \"首先\", \"︿\", \"!\", \"#\", \"$\", \"%\", \"&\", \"(\", \")\", \"*\", \"+\", \",\", \"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \":\", \";\", \"<\", \">\", \"?\", \"@\", \"[\", \"]\", \"{\", \"|\", \"}\", \"~\", \"¥\"]\n", - " )\n", - "stopwords = stopwords.union(set(string.punctuation))\n", - "\n", - "constraints = [RepeatModification(),\n", - " StopwordModification(stopwords = stopwords)]\n", - "\n", - "search_method = GreedyWordSwapWIR()\n", - "\n", - "attack = Attack(goal_function, constraints, transformation, search_method)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LyokhnFtaPSQ", - "outputId": "d8a43c4f-1551-40c9-d031-a42b429ed33d", - "scrolled": true - }, - "outputs": [ - { 
- "name": "stderr", - "output_type": "stream", - "text": [ - "\r", - " 0%| | 0/10 [00:00 [[Positive (76%)]]\n", - "\n", - "一分都不想给,连个快递都不会送,第二次送到家,要是别人不告诉我几别人百块钱就白花了\n", - "\n", - "一分都步想给,练个快第都不灰松,第二次宋到家,要是别人不告诉我几别人白块钱就拜花了\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Succeeded / Failed / Skipped / Total] 2 / 0 / 0 / 2: 20%|▏| 2/10 [03:08<12:35," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------- Result 2 ---------------------------------------------\n", - "[[Positive (97%)]] --> [[Negative (63%)]]\n", - "\n", - "优点忒多了,不用多介绍了.\n", - "\n", - "有点忒多了,不用多介少了.\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Succeeded / Failed / Skipped / Total] 2 / 1 / 0 / 3: 30%|▎| 3/10 [05:39<13:13," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------- Result 3 ---------------------------------------------\n", - "[[Positive (99%)]] --> [[[FAILED]]]\n", - "\n", - "京东东西非常好,物流也非常给力,送货小哥服务很热情,希望京东越来越好,赞一个?!\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Succeeded / Failed / Skipped / Total] 3 / 1 / 0 / 4: 40%|▍| 4/10 [06:37<09:56," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------- Result 4 ---------------------------------------------\n", - "[[Negative (99%)]] --> [[Positive (56%)]]\n", - "\n", - "一半以上都有点小问题,有几个不能吃。\n", - "\n", - "一般以上都有点小文题,有及个部能池。\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Succeeded / Failed / Skipped / Total] 4 / 1 / 0 / 5: 50%|▌| 5/10 [07:17<07:17," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------- Result 5 ---------------------------------------------\n", - "[[Positive (92%)]] --> [[Negative (93%)]]\n", - "\n", - "性价比高,毕竟华为也是国内名牌。\n", - "\n", - "性假比搞,毕竟华为也是过内名牌。\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Succeeded / Failed / Skipped / Total] 4 / 2 / 0 / 6: 60%|▌| 6/10 [11:53<07:55," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------- Result 6 ---------------------------------------------\n", - "[[Positive (98%)]] --> [[[FAILED]]]\n", - "\n", - "物流超级快。快递大哥态度很好的哟,打开快递真的是没有失望,和我想象中的一样,男票穿的很显瘦!牛仔裤控!满意极了,裤子男票穿走了,没办法上图,总之很好评\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Succeeded / Failed / Skipped / Total] 5 / 2 / 0 / 7: 70%|▋| 7/10 [12:46<05:28," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------- Result 7 ---------------------------------------------\n", - "[[Negative (98%)]] --> [[Positive (80%)]]\n", - "\n", - "收到的苹果与图片不符,很小,并且一盒中有5个坏的。\n", - "\n", - "收到的苹过与图片不负,很小,并且一盒中有5个怀的。\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Succeeded / Failed / Skipped / Total] 5 / 2 / 1 / 8: 80%|▊| 8/10 [12:47<03:11," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------- Result 8 ---------------------------------------------\n", - "[[Positive (55%)]] --> [[[SKIPPED]]]\n", - "\n", - "发热量也太大了吧,刚开机没多久,仅上网,机器就很热了,gpu就没有下过50度,cp一直44度以上,不知道是正常的还是我的这台有问题,希望有人指教一下~\n", - "\n", - "\n" - ] - }, - { - 
"name": "stderr", - "output_type": "stream", - "text": [ - "[Succeeded / Failed / Skipped / Total] 6 / 2 / 1 / 9: 90%|▉| 9/10 [13:11<01:27," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------- Result 9 ---------------------------------------------\n", - "[[Negative (93%)]] --> [[Positive (85%)]]\n", - "\n", - "买了两条,这条裤子码数偏大了!\n", - "\n", - "买了两条,这条裤子码数篇大了!\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Succeeded / Failed / Skipped / Total] 7 / 2 / 1 / 10: 100%|█| 10/10 [14:06<00:0" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------- Result 10 ---------------------------------------------\n", - "[[Positive (86%)]] --> [[Negative (72%)]]\n", - "\n", - "手感冷冰冰的,除了小点好像没问题,蛮好的\n", - "\n", - "受感冷冰冰的,除了小店号像没文题,蛮好的\n", - "\n", - "\n", - "\n", - "+-------------------------------+--------+\n", - "| Attack Results | |\n", - "+-------------------------------+--------+\n", - "| Number of successful attacks: | 7 |\n", - "| Number of failed attacks: | 2 |\n", - "| Number of skipped attacks: | 1 |\n", - "| Original accuracy: | 90.0% |\n", - "| Accuracy under attack: | 20.0% |\n", - "| Attack success rate: | 77.78% |\n", - "| Average perturbed word %: | 43.91% |\n", - "| Average num. words per input: | 18.8 |\n", - "| Avg num queries: | 45.89 |\n", - "+-------------------------------+--------+\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "from tqdm import tqdm\n", - "from textattack.loggers import CSVLogger\n", - "from textattack.attack_results import SuccessfulAttackResult\n", - "from textattack import Attacker\n", - "from textattack import AttackArgs\n", - "from textattack.datasets import Dataset\n", - "\n", - "attack_args = AttackArgs(num_examples=10)\n", - "\n", - "attacker = Attacker(attack, dataset, attack_args)\n", - "\n", - "attack_results = attacker.attack_dataset()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['已分都步想给,练咯快递都不会送。']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#augmentation example\n", - "from textattack.transformations import WordSwapRandomCharacterDeletion\n", - "from textattack.transformations import WordSwapQWERTY\n", - "from textattack.transformations import CompositeTransformation\n", - "from textattack.transformations import ChineseWordSwapHowNet\n", - "from textattack.transformations import ChineseHomophoneCharacterSwap\n", - "\n", - "from textattack.constraints.pre_transformation import RepeatModification\n", - "from textattack.constraints.pre_transformation import StopwordModification\n", - "\n", - "from textattack.augmentation import Augmenter\n", - "\n", - "# Set up transformation using CompositeTransformation()\n", - "transformation = ChineseHomophoneCharacterSwap()\n", - "# Set up constraints\n", - "constraints = [RepeatModification(), StopwordModification()]\n", - "# Create augmenter with specified parameters\n", - "augmenter = Augmenter(transformation=transformation, pct_words_to_swap = 0.5, transformations_per_example=1)\n", - "s = '一分都不想给,连个快递都不会送。'\n", - "# s = '一分都不想给'\n", - "# Augment!\n", - "augmenter.augment(s)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "1_Introduction_and_Transformations.ipynb", - 
"provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/2notebook/Example_6_Chinese_Attack.ipynb b/docs/2notebook/Example_6_Chinese_Attack.ipynb new file mode 100644 index 000000000..b032306c7 --- /dev/null +++ b/docs/2notebook/Example_6_Chinese_Attack.ipynb @@ -0,0 +1,2258 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "4b423038915e40158f9da4c07d09aad3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3711cf0a18994cee8fc840d9a93cf5d3", + "IPY_MODEL_7f77bd7b8e5f45ae94cfc45f915c0c72", + "IPY_MODEL_fe0ca6138bc54b628c03e590c6e96aed" + ], + "layout": "IPY_MODEL_8b39363f69eb46009c5357263a65248c" + } + }, + "3711cf0a18994cee8fc840d9a93cf5d3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6b976fd913584da69456c1b6d53483cb", + "placeholder": "​", + "style": "IPY_MODEL_ea568ab2407f474da3b1f1b2540fa3a8", + "value": "Downloading: 100%" + } + }, + "7f77bd7b8e5f45ae94cfc45f915c0c72": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ff6b34a7e75b443593f3dca5d050cd52", + "max": 615, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4f31972fd2fd44bbac063bb4b5075e98", + "value": 615 + } + }, + "fe0ca6138bc54b628c03e590c6e96aed": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7de1551891ec447ab6d80ea1de145f16", + "placeholder": "​", + "style": "IPY_MODEL_e5e2c0507c834887b80f5717c1e6d5f3", + "value": " 615/615 [00:00<00:00, 33.8kB/s]" + } + }, + "8b39363f69eb46009c5357263a65248c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6b976fd913584da69456c1b6d53483cb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ea568ab2407f474da3b1f1b2540fa3a8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ff6b34a7e75b443593f3dca5d050cd52": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4f31972fd2fd44bbac063bb4b5075e98": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7de1551891ec447ab6d80ea1de145f16": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e5e2c0507c834887b80f5717c1e6d5f3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "588b1321a9274de6a8a9e86622d90be4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2436b07259a34ee18fe9c1007f7b615b", + "IPY_MODEL_98aac5a0baee4930bd461f2c5fd73f4a", + "IPY_MODEL_34607a8556794a5a86c18abe5bd7e5a5" + ], + "layout": "IPY_MODEL_f78f6701ce4f4b3b9ff0af925620f261" + } + }, + "2436b07259a34ee18fe9c1007f7b615b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a1e3fb5cceed4e95957a17192a641b69", + "placeholder": "​", + "style": "IPY_MODEL_83e9b14c4d354fdc80db4f8a881f19f3", + "value": "Downloading: 100%" + } + }, + "98aac5a0baee4930bd461f2c5fd73f4a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5f5457f292284dd8b914f45e26b2f749", + "max": 1115590446, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2bb72191846f49528663680a315d8b01", + "value": 1115590446 + } + }, + "34607a8556794a5a86c18abe5bd7e5a5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_83eff532314e4edcbfe648b321e9a310", + "placeholder": "​", + "style": "IPY_MODEL_3d30e700d32443fdb37b5ab934d2d70a", + "value": " 1.04G/1.04G [00:25<00:00, 45.4MB/s]" + } + }, + "f78f6701ce4f4b3b9ff0af925620f261": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a1e3fb5cceed4e95957a17192a641b69": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "83e9b14c4d354fdc80db4f8a881f19f3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5f5457f292284dd8b914f45e26b2f749": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2bb72191846f49528663680a315d8b01": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"StyleView", + "bar_color": null, + "description_width": "" + } + }, + "83eff532314e4edcbfe648b321e9a310": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3d30e700d32443fdb37b5ab934d2d70a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a132f09845a54cbe865cbe8159bb693e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0af0e1eaea2f48c5b0fec6e550bd1baa", + "IPY_MODEL_dd6b0a5d9db245338a8fdb2ef5b29bf9", + "IPY_MODEL_58fc309041b54e94ae265167fa20d8d7" + ], + "layout": "IPY_MODEL_89dfd3fdc41e417a870901bc79e47495" + } + }, + "0af0e1eaea2f48c5b0fec6e550bd1baa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_21472d1c4c8b494a8d3660b3320e9d4b", + "placeholder": "​", + "style": "IPY_MODEL_7511bb9ca5424674bb2350dff63c468a", + "value": "Downloading: 100%" + } + }, + "dd6b0a5d9db245338a8fdb2ef5b29bf9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f6dd2c2cb4e346fe9af7026b5d2162e9", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a34ad57624fc422aa4832db3963298e6", + "value": 5069051 + } + }, + "58fc309041b54e94ae265167fa20d8d7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5167daffe92e44d2acc2af2d9b9738df", + "placeholder": "​", + "style": "IPY_MODEL_acbfb34a353f41649675bd104069d14e", + "value": " 4.83M/4.83M [00:00<00:00, 12.1MB/s]" + } + }, + "89dfd3fdc41e417a870901bc79e47495": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "21472d1c4c8b494a8d3660b3320e9d4b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7511bb9ca5424674bb2350dff63c468a": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f6dd2c2cb4e346fe9af7026b5d2162e9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a34ad57624fc422aa4832db3963298e6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5167daffe92e44d2acc2af2d9b9738df": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "acbfb34a353f41649675bd104069d14e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { 
+ "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "be070cb4a1624b0bb8f9b594c6b951a5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2edb7130713d4e10a07bbf808abb9771", + "IPY_MODEL_5ae4c618f75d4ef9b65e5020fccb6d72", + "IPY_MODEL_138d8260e67f4bc58106b9b42f7abd12" + ], + "layout": "IPY_MODEL_d7621b5c619a4ce38ebe63924374cf78" + } + }, + "2edb7130713d4e10a07bbf808abb9771": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1b208b6df75f4a9e97faa4e3705a9442", + "placeholder": "​", + "style": "IPY_MODEL_a7871b8ec3ec40e7bbbe6a5f40b79f4a", + "value": "Downloading: 100%" + } + }, + "5ae4c618f75d4ef9b65e5020fccb6d72": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aeb7ee752d834b4cbaa189419fd75dd4", + "max": 9096718, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b47dfff73e73410aa89f65e3c5b0c366", + "value": 9096718 + } + }, + "138d8260e67f4bc58106b9b42f7abd12": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bdf3571e59ef4a688ab89d4badda27b1", + "placeholder": "​", + "style": "IPY_MODEL_d3bab427b92144d6b9ce96eac18ceb89", + "value": " 8.68M/8.68M [00:00<00:00, 16.8MB/s]" + } + }, + "d7621b5c619a4ce38ebe63924374cf78": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1b208b6df75f4a9e97faa4e3705a9442": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a7871b8ec3ec40e7bbbe6a5f40b79f4a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "aeb7ee752d834b4cbaa189419fd75dd4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b47dfff73e73410aa89f65e3c5b0c366": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bdf3571e59ef4a688ab89d4badda27b1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d3bab427b92144d6b9ce96eac18ceb89": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "m83IiqVREJ96" + }, + "source": [ + "# Chinese Attack" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6UZ0d84hEJ98" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QData/TextAttack/blob/master/docs/2notebook/Example_6_Chinese%20Attack.ipynb)\n", + "\n", + "\n", + "[![View Source on GitHub](https://img.shields.io/badge/github-view%20source-black.svg)](https://github.com/QData/TextAttack/blob/master/docs/2notebook/Example_6_Chinese%20Attack.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tjqc2c5_7YaX" + }, + "source": [ + " Please remember to run the following in your notebook enviroment before running the tutorial codes:\n", + "\n", + "```\n", + "pip3 install textattack[tensorflow]\n", + "```\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qZ5xnoevEJ99" + }, + "source": [ + "With a few additional modifications to the standard TextAttack commands, lanaguage models in Chinese can be attacked just as English models. 
Four transformations are available for either Chinese attack or augmentation:\n",
+    "\n",
+    "1. **ChineseHomophoneCharacterSwap**: transforms an input by replacing its words with substitutions that share a similar or identical pronunciation.\n",
+    "2. **ChineseMorphonymCharacterSwap**: transforms an input by replacing its characters with substitutions that share similar glyph structures.\n",
+    "3. **ChineseWordSwapHowNet**: transforms an input by replacing its words with synonyms provided by [OpenHowNet](http://nlp.csai.tsinghua.edu.cn/).\n",
+    "4. **ChineseWordSwapMaskedLM**: transforms an input with potential replacements using a masked language model."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "We begin with imports:"
+   ],
+   "metadata": {
+    "id": "2EP1DJylSfkD"
+   }
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "5AXyxiLD4X93"
+   },
+   "source": [
+    "# Import required packages\n",
+    "import transformers\n",
+    "import string\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "import datasets\n",
+    "\n",
+    "# Import classes required to build an Attacker\n",
+    "from textattack.models.wrappers import HuggingFaceModelWrapper\n",
+    "from textattack.search_methods import GreedyWordSwapWIR\n",
+    "from textattack.constraints.pre_transformation import RepeatModification, StopwordModification\n",
+    "from textattack.goal_functions import UntargetedClassification\n",
+    "\n",
+    "from textattack import Attack, Attacker, AttackArgs\n",
+    "from textattack.loggers import CSVLogger\n",
+    "from textattack.datasets import Dataset, HuggingFaceDataset\n",
+    "\n",
+    "# Import the optional MUSE constraint, which can be added to the constraints below for higher-quality examples\n",
+    "from textattack.constraints.semantics.sentence_encoders import MultilingualUniversalSentenceEncoder\n",
+    "muse = MultilingualUniversalSentenceEncoder(\n",
+    "    threshold=0.9,\n",
+    "    metric=\"cosine\",\n",
+    "    compare_against_original=True,\n",
+    "    window_size=15,\n",
+    "    skip_text_shorter_than_window=True,\n",
+    ")\n",
+    "\n",
+    "# Import the transformations\n",
+    "from textattack.transformations import CompositeTransformation\n",
+    "from textattack.transformations import ChineseWordSwapMaskedLM\n",
+    "from textattack.transformations import ChineseMorphonymCharacterSwap\n",
+    "from textattack.transformations import ChineseWordSwapHowNet\n",
+    "from textattack.transformations import ChineseHomophoneCharacterSwap"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "The model and dataset also need to be set up:"
+   ],
+   "metadata": {
+    "id": "1mSvCqhHSi0h"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# In this example, we will attack a pre-trained news-classification model from Hugging Face (https://huggingface.co/uer/roberta-base-finetuned-chinanews-chinese)\n",
+    "tokenizer = transformers.AutoTokenizer.from_pretrained('uer/roberta-base-finetuned-chinanews-chinese')\n",
+    "model = transformers.AutoModelForSequenceClassification.from_pretrained('uer/roberta-base-finetuned-chinanews-chinese')\n",
+    "model_wrapper = HuggingFaceModelWrapper(model, tokenizer)\n",
+    "\n",
+    "# Set the goal function\n",
+    "goal_function = UntargetedClassification(model_wrapper, query_budget=10000)\n",
+    "\n",
+    "# Set the dataset from which we will generate adversarial examples\n",
+    "path = os.path.abspath('')\n",
+    "path_list = path.split(os.sep)\n",
+    "temppath = os.path.normpath('examples/dataset/zh_sentiment/entailment_dataset.tsv')\n",
+    "dataset = datasets.load_dataset('csv', data_files=temppath, delimiter=\"\\t\")[\"train\"]\n",
+    "dataset = 
HuggingFaceDataset(\n", + " dataset,\n", + " dataset_columns=([\"text\"], \"label\"),\n", + " label_names=[\"Mainland China politics\", \"Hong Kong - Macau politics\", \"International news\", \"Financial news\", \"Culture\", \"Entertainment\", \"Sports\"]\n", + " )" + ], + "metadata": { + "id": "CfnC9qUFPq9h" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "If this is your first time running Hownet, run this code block" + ], + "metadata": { + "id": "XfJVzCdRSr3d" + } + }, + { + "cell_type": "code", + "source": [ + "import OpenHowNet\n", + "OpenHowNet.download()" + ], + "metadata": { + "id": "Hgal-PHeQwys" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "Now we are ready to attack! With goal function, transformation, constraints, search method, and goal function, we create the Attacker as any other TextAttack attacks\n" + ], + "metadata": { + "id": "SrtoxdrMSZ0X" + } + }, + { + "cell_type": "code", + "source": [ + "# transformation, using ChineseWordSwapMaskedLM transformation in this example\n", + "\n", + "transformation = ChineseWordSwapMaskedLM()\n", + "\n", + "# constraint\n", + "stopwords = set(\n", + " [\"、\", \"。\", \"〈\", \"〉\", \"《\", \"》\", \"一\", \"一个\", \"一些\", \"一何\", \"一切\", \"一则\", \"一方面\", \"一旦\", \"一来\", \"一样\", \"一种\", \"一般\", \"一转眼\", \"七\", \"万一\", \"三\", \"上\", \"上下\", \"下\", \"不\", \"不仅\", \"不但\", \"不光\", \"不单\", \"不只\", \"不外乎\", \"不如\", \"不妨\", \"不尽\", \"不尽然\", \"不得\", \"不怕\", \"不惟\", \"不成\", \"不拘\", \"不料\", \"不是\", \"不比\", \"不然\", \"不特\", \"不独\", \"不管\", \"不至于\", \"不若\", \"不论\", \"不过\", \"不问\", \"与\", \"与其\", \"与其说\", \"与否\", \"与此同时\", \"且\", \"且不说\", \"且说\", \"两者\", \"个\", \"个别\", \"中\", \"临\", \"为\", \"为了\", \"为什么\", \"为何\", \"为止\", \"为此\", \"为着\", \"乃\", \"乃至\", \"乃至于\", \"么\", \"之\", \"之一\", \"之所以\", \"之类\", \"乌乎\", \"乎\", \"乘\", \"九\", \"也\", \"也好\", \"也罢\", \"了\", \"二\", \"二来\", \"于\", \"于是\", \"于是乎\", \"云云\", \"云尔\", \"五\", \"些\", \"亦\", \"人\", \"人们\", \"人家\", \"什\", \"什么\", \"什么样\", \"今\", \"介于\", \"仍\", \"仍旧\", \"从\", \"从此\", \"从而\", \"他\", \"他人\", \"他们\", \"他们们\", \"以\", \"以上\", \"以为\", \"以便\", \"以免\", \"以及\", \"以故\", \"以期\", \"以来\", \"以至\", \"以至于\", \"以致\", \"们\", \"任\", \"任何\", \"任凭\", \"会\", \"似的\", \"但\", \"但凡\", \"但是\", \"何\", \"何以\", \"何况\", \"何处\", \"何时\", \"余外\", \"作为\", \"你\", \"你们\", \"使\", \"使得\", \"例如\", \"依\", \"依据\", \"依照\", \"便于\", \"俺\", \"俺们\", \"倘\", \"倘使\", \"倘或\", \"倘然\", \"倘若\", \"借\", \"借傥然\", \"假使\", \"假如\", \"假若\", \"做\", \"像\", \"儿\", \"先不先\", \"光\", \"光是\", \"全体\", \"全部\", \"八\", \"六\", \"兮\", \"共\", \"关于\", \"关于具体地说\", \"其\", \"其一\", \"其中\", \"其二\", \"其他\", \"其余\", \"其它\", \"其次\", \"具体地说\", \"具体说来\", \"兼之\", \"内\", \"再\", \"再其次\", \"再则\", \"再有\", \"再者\", \"再者说\", \"再说\", \"冒\", \"冲\", \"况且\", \"几\", \"几时\", \"凡\", \"凡是\", \"凭\", \"凭借\", \"出于\", \"出来\", \"分\", \"分别\", \"则\", \"则甚\", \"别\", \"别人\", \"别处\", \"别是\", \"别的\", \"别管\", \"别说\", \"到\", \"前后\", \"前此\", \"前者\", \"加之\", \"加以\", \"区\", \"即\", \"即令\", \"即使\", \"即便\", \"即如\", \"即或\", \"即若\", \"却\", \"去\", \"又\", \"又及\", \"及\", \"及其\", \"及至\", \"反之\", \"反而\", \"反过来\", \"反过来说\", \"受到\", \"另\", \"另一方面\", \"另外\", \"另悉\", \"只\", \"只当\", \"只怕\", \"只是\", \"只有\", \"只消\", \"只要\", \"只限\", \"叫\", \"叮咚\", \"可\", \"可以\", \"可是\", \"可见\", \"各\", \"各个\", \"各位\", \"各种\", \"各自\", \"同\", \"同时\", \"后\", \"后者\", \"向\", \"向使\", \"向着\", \"吓\", \"吗\", \"否则\", \"吧\", \"吧哒\", \"含\", \"吱\", \"呀\", \"呃\", \"呕\", \"呗\", \"呜\", \"呜呼\", \"呢\", \"呵\", \"呵呵\", \"呸\", \"呼哧\", \"咋\", \"和\", 
\"咚\", \"咦\", \"咧\", \"咱\", \"咱们\", \"咳\", \"哇\", \"哈\", \"哈哈\", \"哉\", \"哎\", \"哎呀\", \"哎哟\", \"哗\", \"哟\", \"哦\", \"哩\", \"哪\", \"哪个\", \"哪些\", \"哪儿\", \"哪天\", \"哪年\", \"哪怕\", \"哪样\", \"哪边\", \"哪里\", \"哼\", \"哼唷\", \"唉\", \"唯有\", \"啊\", \"啐\", \"啥\", \"啦\", \"啪达\", \"啷当\", \"喂\", \"喏\", \"喔唷\", \"喽\", \"嗡\", \"嗡嗡\", \"嗬\", \"嗯\", \"嗳\", \"嘎\", \"嘎登\", \"嘘\", \"嘛\", \"嘻\", \"嘿\", \"嘿嘿\", \"四\", \"因\", \"因为\", \"因了\", \"因此\", \"因着\", \"因而\", \"固然\", \"在\", \"在下\", \"在于\", \"地\", \"基于\", \"处在\", \"多\", \"多么\", \"多少\", \"大\", \"大家\", \"她\", \"她们\", \"好\", \"如\", \"如上\", \"如上所述\", \"如下\", \"如何\", \"如其\", \"如同\", \"如是\", \"如果\", \"如此\", \"如若\", \"始而\", \"孰料\", \"孰知\", \"宁\", \"宁可\", \"宁愿\", \"宁肯\", \"它\", \"它们\", \"对\", \"对于\", \"对待\", \"对方\", \"对比\", \"将\", \"小\", \"尔\", \"尔后\", \"尔尔\", \"尚且\", \"就\", \"就是\", \"就是了\", \"就是说\", \"就算\", \"就要\", \"尽\", \"尽管\", \"尽管如此\", \"岂但\", \"己\", \"已\", \"已矣\", \"巴\", \"巴巴\", \"年\", \"并\", \"并且\", \"庶乎\", \"庶几\", \"开外\", \"开始\", \"归\", \"归齐\", \"当\", \"当地\", \"当然\", \"当着\", \"彼\", \"彼时\", \"彼此\", \"往\", \"待\", \"很\", \"得\", \"得了\", \"怎\", \"怎么\", \"怎么办\", \"怎么样\", \"怎奈\", \"怎样\", \"总之\", \"总的来看\", \"总的来说\", \"总的说来\", \"总而言之\", \"恰恰相反\", \"您\", \"惟其\", \"慢说\", \"我\", \"我们\", \"或\", \"或则\", \"或是\", \"或曰\", \"或者\", \"截至\", \"所\", \"所以\", \"所在\", \"所幸\", \"所有\", \"才\", \"才能\", \"打\", \"打从\", \"把\", \"抑或\", \"拿\", \"按\", \"按照\", \"换句话说\", \"换言之\", \"据\", \"据此\", \"接着\", \"故\", \"故此\", \"故而\", \"旁人\", \"无\", \"无宁\", \"无论\", \"既\", \"既往\", \"既是\", \"既然\", \"日\", \"时\", \"时候\", \"是\", \"是以\", \"是的\", \"更\", \"曾\", \"替\", \"替代\", \"最\", \"月\", \"有\", \"有些\", \"有关\", \"有及\", \"有时\", \"有的\", \"望\", \"朝\", \"朝着\", \"本\", \"本人\", \"本地\", \"本着\", \"本身\", \"来\", \"来着\", \"来自\", \"来说\", \"极了\", \"果然\", \"果真\", \"某\", \"某个\", \"某些\", \"某某\", \"根据\", \"欤\", \"正值\", \"正如\", \"正巧\", \"正是\", \"此\", \"此地\", \"此处\", \"此外\", \"此时\", \"此次\", \"此间\", \"毋宁\", \"每\", \"每当\", \"比\", \"比及\", \"比如\", \"比方\", \"没奈何\", \"沿\", \"沿着\", \"漫说\", \"点\", \"焉\", \"然则\", \"然后\", \"然而\", \"照\", \"照着\", \"犹且\", \"犹自\", \"甚且\", \"甚么\", \"甚或\", \"甚而\", \"甚至\", \"甚至于\", \"用\", \"用来\", \"由\", \"由于\", \"由是\", \"由此\", \"由此可见\", \"的\", \"的确\", \"的话\", \"直到\", \"相对而言\", \"省得\", \"看\", \"眨眼\", \"着\", \"着呢\", \"矣\", \"矣乎\", \"矣哉\", \"离\", \"秒\", \"称\", \"竟而\", \"第\", \"等\", \"等到\", \"等等\", \"简言之\", \"管\", \"类如\", \"紧接着\", \"纵\", \"纵令\", \"纵使\", \"纵然\", \"经\", \"经过\", \"结果\", \"给\", \"继之\", \"继后\", \"继而\", \"综上所述\", \"罢了\", \"者\", \"而\", \"而且\", \"而况\", \"而后\", \"而外\", \"而已\", \"而是\", \"而言\", \"能\", \"能否\", \"腾\", \"自\", \"自个儿\", \"自从\", \"自各儿\", \"自后\", \"自家\", \"自己\", \"自打\", \"自身\", \"至\", \"至于\", \"至今\", \"至若\", \"致\", \"般的\", \"若\", \"若夫\", \"若是\", \"若果\", \"若非\", \"莫不然\", \"莫如\", \"莫若\", \"虽\", \"虽则\", \"虽然\", \"虽说\", \"被\", \"要\", \"要不\", \"要不是\", \"要不然\", \"要么\", \"要是\", \"譬喻\", \"譬如\", \"让\", \"许多\", \"论\", \"设使\", \"设或\", \"设若\", \"诚如\", \"诚然\", \"该\", \"说\", \"说来\", \"请\", \"诸\", \"诸位\", \"诸如\", \"谁\", \"谁人\", \"谁料\", \"谁知\", \"贼死\", \"赖以\", \"赶\", \"起\", \"起见\", \"趁\", \"趁着\", \"越是\", \"距\", \"跟\", \"较\", \"较之\", \"边\", \"过\", \"还\", \"还是\", \"还有\", \"还要\", \"这\", \"这一来\", \"这个\", \"这么\", \"这么些\", \"这么样\", \"这么点儿\", \"这些\", \"这会儿\", \"这儿\", \"这就是说\", \"这时\", \"这样\", \"这次\", \"这般\", \"这边\", \"这里\", \"进而\", \"连\", \"连同\", \"逐步\", \"通过\", \"遵循\", \"遵照\", \"那\", \"那个\", \"那么\", \"那么些\", \"那么样\", \"那些\", \"那会儿\", \"那儿\", \"那时\", \"那样\", \"那般\", \"那边\", \"那里\", \"都\", \"鄙人\", \"鉴于\", \"针对\", \"阿\", \"除\", \"除了\", \"除外\", \"除开\", \"除此之外\", \"除非\", \"随\", \"随后\", \"随时\", \"随着\", \"难道说\", \"零\", \"非\", 
\"非但\", \"非徒\", \"非特\", \"非独\", \"靠\", \"顺\", \"顺着\", \"首先\", \"︿\", \"!\", \"#\", \"$\", \"%\", \"&\", \"(\", \")\", \"*\", \"+\", \",\", \"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \":\", \";\", \"<\", \">\", \"?\", \"@\", \"[\", \"]\", \"{\", \"|\", \"}\", \"~\", \"¥\"]\n", + " )\n", + "stopwords = stopwords.union(set(string.punctuation))\n", + "constraints = [RepeatModification(),\n", + " StopwordModification(stopwords = stopwords)]\n", + "\n", + "# search method\n", + "search_method = GreedyWordSwapWIR(wir_method=\"weighted-saliency\")\n", + "\n", + "# attack!\n", + "attack = Attack(goal_function, constraints, transformation, search_method)\n", + "attack_args = AttackArgs(num_examples=20)\n", + "attacker = Attacker(attack, dataset, attack_args)\n", + "attack_results = attacker.attack_dataset()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "4b423038915e40158f9da4c07d09aad3", + "3711cf0a18994cee8fc840d9a93cf5d3", + "7f77bd7b8e5f45ae94cfc45f915c0c72", + "fe0ca6138bc54b628c03e590c6e96aed", + "8b39363f69eb46009c5357263a65248c", + "6b976fd913584da69456c1b6d53483cb", + "ea568ab2407f474da3b1f1b2540fa3a8", + "ff6b34a7e75b443593f3dca5d050cd52", + "4f31972fd2fd44bbac063bb4b5075e98", + "7de1551891ec447ab6d80ea1de145f16", + "e5e2c0507c834887b80f5717c1e6d5f3", + "588b1321a9274de6a8a9e86622d90be4", + "2436b07259a34ee18fe9c1007f7b615b", + "98aac5a0baee4930bd461f2c5fd73f4a", + "34607a8556794a5a86c18abe5bd7e5a5", + "f78f6701ce4f4b3b9ff0af925620f261", + "a1e3fb5cceed4e95957a17192a641b69", + "83e9b14c4d354fdc80db4f8a881f19f3", + "5f5457f292284dd8b914f45e26b2f749", + "2bb72191846f49528663680a315d8b01", + "83eff532314e4edcbfe648b321e9a310", + "3d30e700d32443fdb37b5ab934d2d70a", + "a132f09845a54cbe865cbe8159bb693e", + "0af0e1eaea2f48c5b0fec6e550bd1baa", + "dd6b0a5d9db245338a8fdb2ef5b29bf9", + "58fc309041b54e94ae265167fa20d8d7", + "89dfd3fdc41e417a870901bc79e47495", + "21472d1c4c8b494a8d3660b3320e9d4b", + "7511bb9ca5424674bb2350dff63c468a", + "f6dd2c2cb4e346fe9af7026b5d2162e9", + "a34ad57624fc422aa4832db3963298e6", + "5167daffe92e44d2acc2af2d9b9738df", + "acbfb34a353f41649675bd104069d14e", + "be070cb4a1624b0bb8f9b594c6b951a5", + "2edb7130713d4e10a07bbf808abb9771", + "5ae4c618f75d4ef9b65e5020fccb6d72", + "138d8260e67f4bc58106b9b42f7abd12", + "d7621b5c619a4ce38ebe63924374cf78", + "1b208b6df75f4a9e97faa4e3705a9442", + "a7871b8ec3ec40e7bbbe6a5f40b79f4a", + "aeb7ee752d834b4cbaa189419fd75dd4", + "b47dfff73e73410aa89f65e3c5b0c366", + "bdf3571e59ef4a688ab89d4badda27b1", + "d3bab427b92144d6b9ce96eac18ceb89" + ] + }, + "id": "C_0Z8njnRblT", + "outputId": "3890d784-de7f-4b70-f984-cbc9e0c7f700" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/615 [00:00 [[[FAILED]]]\n", + "\n", + "林书豪新秀赛上甘心\"跑龙套\" 自称仍是底薪球员\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 0 / 1 / 0 / 1: 10%|█ | 2/20 [06:55<1:02:18, 207.69s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 0 / 2 / 0 / 2: 10%|█ | 2/20 [06:55<1:02:18, 207.70s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 2 ---------------------------------------------\n", + "[[Culture (100%)]] --> [[[FAILED]]]\n", + "\n", + "成都现“真人图书馆”:无书“借人”给你读\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + 
"name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 0 / 2 / 0 / 2: 15%|█▌ | 3/20 [07:01<39:50, 140.61s/it] \u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 0 / 2 / 1 / 3: 15%|█▌ | 3/20 [07:01<39:50, 140.61s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 3 ---------------------------------------------\n", + "[[Mainland china politics (57%)]] --> [[[SKIPPED]]]\n", + "\n", + "中国经济走向更趋稳健务实\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 0 / 2 / 1 / 3: 20%|██ | 4/20 [11:33<46:12, 173.28s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 0 / 3 / 1 / 4: 20%|██ | 4/20 [11:33<46:12, 173.28s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 4 ---------------------------------------------\n", + "[[Sports (100%)]] --> [[[FAILED]]]\n", + "\n", + "国际田联世界挑战赛 罗伯斯迎来赛季第三冠\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 0 / 3 / 1 / 4: 25%|██▌ | 5/20 [14:52<44:36, 178.44s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 5 ---------------------------------------------\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 1 / 5: 25%|██▌ | 5/20 [14:53<44:39, 178.62s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[[International news (66%)]] --> [[Entertainment (68%)]]\n", + "\n", + "德国一电视台合成“默克尔头巾照”惹争议\n", + "\n", + "德国一电视台合成“性感头巾照”惹争议\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 1 / 5: 30%|███ | 6/20 [14:57<34:55, 149.65s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 2 / 6: 30%|███ | 6/20 [14:57<34:55, 149.65s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 6 ---------------------------------------------\n", + "[[Mainland china politics (80%)]] --> [[[SKIPPED]]]\n", + "\n", + "朴槿惠今访华 韩媒称访西安可能为增进与习近平友谊\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 2 / 6: 35%|███▌ | 7/20 [15:04<27:59, 129.16s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 3 / 7: 35%|███▌ | 7/20 [15:04<27:59, 129.16s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 7 ---------------------------------------------\n", + "[[Mainland china politics (59%)]] --> [[[SKIPPED]]]\n", + "\n", + "中国驻休斯敦总领馆举办春节招待会向华裔拜年\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 3 / 7: 40%|████ | 8/20 [15:08<22:43, 113.60s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 4 / 8: 40%|████ | 8/20 [15:08<22:43, 113.61s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 8 ---------------------------------------------\n", + "[[Culture 
(93%)]] --> [[[SKIPPED]]]\n", + "\n", + "NASA发现“地球兄弟” 具备生命存活条件\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 4 / 8: 45%|████▌ | 9/20 [15:13<18:36, 101.52s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 5 / 9: 45%|████▌ | 9/20 [15:13<18:36, 101.52s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 9 ---------------------------------------------\n", + "[[Culture (53%)]] --> [[[SKIPPED]]]\n", + "\n", + "儿子去世后社交网站账号停用 父亲请求保留记忆\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 1 / 3 / 5 / 9: 50%|█████ | 10/20 [18:20<18:20, 110.06s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 2 / 3 / 5 / 10: 50%|█████ | 10/20 [18:20<18:20, 110.06s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 10 ---------------------------------------------\n", + "[[Culture (100%)]] --> [[Entertainment (72%)]]\n", + "\n", + "第六届鲁迅文学奖颁发 格非等35位获奖者领奖\n", + "\n", + "第六届决赛颁发 格非等35位获奖者领奖\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 2 / 3 / 5 / 10: 55%|█████▌ | 11/20 [22:44<18:36, 124.02s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 3 / 3 / 5 / 11: 55%|█████▌ | 11/20 [22:44<18:36, 124.02s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 11 ---------------------------------------------\n", + "[[Hong kong - macau politics (96%)]] --> [[Culture (79%)]]\n", + "\n", + "东莞台商欲借“台博会”搭建内销平台\n", + "\n", + "东莞讯欲借“艺博会”搭建内销平台\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 3 / 3 / 5 / 11: 60%|██████ | 12/20 [22:48<15:12, 114.07s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 3 / 3 / 6 / 12: 60%|██████ | 12/20 [22:48<15:12, 114.07s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 12 ---------------------------------------------\n", + "[[Financial news (56%)]] --> [[[SKIPPED]]]\n", + "\n", + "日本网友买扇贝当下酒菜 发现内有真正珍珠(图)\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 3 / 3 / 6 / 12: 65%|██████▌ | 13/20 [28:59<15:36, 133.78s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 3 / 4 / 6 / 13: 65%|██████▌ | 13/20 [28:59<15:36, 133.78s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 13 ---------------------------------------------\n", + "[[Sports (100%)]] --> [[[FAILED]]]\n", + "\n", + "篮球热潮席卷张江 NBA中投王与拉拉队鼎力加盟\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 3 / 4 / 6 / 13: 70%|███████ | 14/20 [33:40<14:26, 144.34s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 3 / 5 / 6 / 14: 70%|███████ | 14/20 [33:40<14:26, 144.34s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 14 
---------------------------------------------\n", + "[[Sports (100%)]] --> [[[FAILED]]]\n", + "\n", + "UFC终极格斗冠军赛开打 \"草原狼\"遭遇三连败\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 3 / 5 / 6 / 14: 75%|███████▌ | 15/20 [33:45<11:15, 135.04s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 3 / 5 / 7 / 15: 75%|███████▌ | 15/20 [33:45<11:15, 135.04s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 15 ---------------------------------------------\n", + "[[Culture (92%)]] --> [[[SKIPPED]]]\n", + "\n", + "水果style:心形水果惹人爱 骰子西瓜乐趣多(图)\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 3 / 5 / 7 / 15: 80%|████████ | 16/20 [40:09<10:02, 150.60s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 3 / 6 / 7 / 16: 80%|████████ | 16/20 [40:09<10:02, 150.60s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 16 ---------------------------------------------\n", + "[[Sports (100%)]] --> [[[FAILED]]]\n", + "\n", + "同里杯中国天元赛前瞻:芈昱廷李钦诚争挑战权\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 3 / 6 / 7 / 16: 85%|████████▌ | 17/20 [43:32<07:41, 153.67s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 4 / 6 / 7 / 17: 85%|████████▌ | 17/20 [43:32<07:41, 153.67s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 17 ---------------------------------------------\n", + "[[Entertainment (100%)]] --> [[Financial news (99%)]]\n", + "\n", + "桂纶镁为戏体验生活 东北洗衣店当店员\n", + "\n", + "桂纶品牌为首体验生活 东北洗衣店当家\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 4 / 6 / 7 / 17: 90%|█████████ | 18/20 [44:01<04:53, 146.75s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 4 / 7 / 7 / 18: 90%|█████████ | 18/20 [44:01<04:53, 146.75s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 18 ---------------------------------------------\n", + "[[Culture (95%)]] --> [[[FAILED]]]\n", + "\n", + "河南羲皇故都朝祖会流传6000年 一天游客80万人\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 4 / 7 / 7 / 18: 95%|█████████▌| 19/20 [44:07<02:19, 139.35s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 4 / 7 / 8 / 19: 95%|█████████▌| 19/20 [44:07<02:19, 139.35s/it]\u001b[A" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------- Result 19 ---------------------------------------------\n", + "[[Culture (92%)]] --> [[[SKIPPED]]]\n", + "\n", + "辛柏青谈追求妻子:用1袋洗衣粉、2块肥皂打动她的\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "[Succeeded / Failed / Skipped / Total] 4 / 7 / 8 / 19: 100%|██████████| 20/20 [49:19<00:00, 147.96s/it]\u001b[A\n", + "[Succeeded / Failed / Skipped / Total] 5 / 7 / 8 / 20: 100%|██████████| 20/20 [49:19<00:00, 147.96s/it]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + 
"--------------------------------------------- Result 20 ---------------------------------------------\n", + "[[International news (100%)]] --> [[Mainland china politics (66%)]]\n", + "\n", + "朝鲜谴责韩国前方部队打出反朝口号\n", + "\n", + "中国谴责日本前方部队打出侵略口号\n", + "\n", + "\n", + "\n", + "+-------------------------------+--------+\n", + "| Attack Results | |\n", + "+-------------------------------+--------+\n", + "| Number of successful attacks: | 5 |\n", + "| Number of failed attacks: | 7 |\n", + "| Number of skipped attacks: | 8 |\n", + "| Original accuracy: | 60.0% |\n", + "| Accuracy under attack: | 35.0% |\n", + "| Attack success rate: | 41.67% |\n", + "| Average perturbed word %: | 36.39% |\n", + "| Average num. words per input: | 9.3 |\n", + "| Avg num queries: | 45.5 |\n", + "+-------------------------------+--------+\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "As aforementioned, we can also augment Chinese sentences with the provided transformation. A quick examples is shown below:" + ], + "metadata": { + "id": "3e_tQiHWS-Pb" + } + }, + { + "cell_type": "code", + "source": [ + "from textattack.constraints.pre_transformation import RepeatModification\n", + "from textattack.constraints.pre_transformation import StopwordModification\n", + "from textattack.augmentation import Augmenter\n", + "\n", + "# transformation\n", + "transformation = ChineseMorphonymCharacterSwap()\n", + "\n", + "# constraints\n", + "constraints = [RepeatModification(), StopwordModification()]\n", + "\n", + "# Create augmenter with specified parameters\n", + "augmenter = Augmenter(transformation=transformation, pct_words_to_swap = 0.1, transformations_per_example=2)\n", + "s = '听见树林的呢喃,发现溪流中的知识。'\n", + "\n", + "# Augment!\n", + "augmenter.augment(s)" + ], + "metadata": { + "id": "43MCRE0pqVM0", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2ad12bf5-3bd8-4c8d-913c-949fcae787d3" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Building prefix dict from the default dictionary ...\n", + "DEBUG:jieba:Building prefix dict from the default dictionary ...\n", + "Dumping model to file cache /tmp/jieba.cache\n", + "DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache\n", + "Loading model cost 0.888 seconds.\n", + "DEBUG:jieba:Loading model cost 0.888 seconds.\n", + "Prefix dict has been built successfully.\n", + "DEBUG:jieba:Prefix dict has been built successfully.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['听见树林的呢喃,发现溪流中的知织。', '听见树林的呢喃,发视溪流中的知识。']" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + } + ] +} \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index c36ad5992..5f1934a4a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -51,6 +51,7 @@ TextAttack Documentation Tutorial 8: Attacking Keras models <2notebook/Example_3_Keras.ipynb> Tutorial 9: Attacking multilingual models <2notebook/Example_4_CamemBERT.ipynb> Tutorial10: Explaining Attacking BERT model using Captum <2notebook/Example_5_Explain_BERT.ipynb> + Tutorial11: Attacking multilingual - Chinese NLP model using Textattack <2notebook/Example_6_Chinese_Attack.ipynb> .. 
diff --git a/docs/index.rst b/docs/index.rst
index c36ad5992..5f1934a4a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -51,6 +51,7 @@ TextAttack Documentation
    Tutorial 8: Attacking Keras models <2notebook/Example_3_Keras.ipynb>
    Tutorial 9: Attacking multilingual models <2notebook/Example_4_CamemBERT.ipynb>
    Tutorial10: Explaining Attacking BERT model using Captum <2notebook/Example_5_Explain_BERT.ipynb>
+   Tutorial11: Attacking a multilingual (Chinese) NLP model using TextAttack <2notebook/Example_6_Chinese_Attack.ipynb>
 
 .. toctree::
    :maxdepth: 6
 
diff --git a/examples/attack/attack_keras_parallel.py b/examples/attack/attack_keras_parallel.py
index f05fcc2a5..617e08422 100644
--- a/examples/attack/attack_keras_parallel.py
+++ b/examples/attack/attack_keras_parallel.py
@@ -70,7 +70,6 @@ def __init__(self, model):
         self.model = model
 
     def __call__(self, text_input_list):
-
         x_transform = []
         for i, review in enumerate(text_input_list):
             tokens = [x.strip(",") for x in review.split()]
diff --git a/requirements.txt b/requirements.txt
index 80f582ec5..02f3be15a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,7 +11,7 @@ numpy>=1.21.0
 pandas>=1.0.1
 scipy>=1.4.1
 torch>=1.7.0,!=1.8
-transformers>=4.21.0
+transformers==4.30.0
 terminaltables
 tqdm
 word2number
@@ -23,4 +23,5 @@ jieba
 OpenHowNet
 pycld2
 click<8.1.0
-pyabsa>=2.0.6
+pinyin
+pyabsa>=2.0.6
\ No newline at end of file
diff --git a/tests/sample_outputs/run_attack_flair_pos_tagger_bert_score.txt b/tests/sample_outputs/run_attack_flair_pos_tagger_bert_score.txt
index 42e1d6a06..0f2b96851 100644
--- a/tests/sample_outputs/run_attack_flair_pos_tagger_bert_score.txt
+++ b/tests/sample_outputs/run_attack_flair_pos_tagger_bert_score.txt
@@ -26,11 +26,9 @@
 )
 
 --------------------------------------------- Result 1 ---------------------------------------------
-[[Positive (100%)]] --> [[Negative (98%)]]
+[[Positive (100%)]] --> [[[FAILED]]]
 
-exposing the ways we fool ourselves is one [[hour]] photo's real [[strength]] .
-
-exposing the ways we fool ourselves is one [[stopwatch]] photo's real [[kraft]] .
+exposing the ways we fool ourselves is one hour photo's real strength .
 
 
 --------------------------------------------- Result 2 ---------------------------------------------
@@ -42,32 +40,32 @@ it's up to you to decide whether to admire these people's dedication to their ca
 
 --------------------------------------------- Result 3 ---------------------------------------------
-[[Positive (100%)]] --> [[Negative (96%)]]
+[[Positive (100%)]] --> [[Negative (71%)]]
 
 mostly , [goldbacher] just lets her complicated characters be [[unruly]] , confusing and , through it all , [[human]] .
 
-mostly , [goldbacher] just lets her complicated characters be [[haphazard]] , confusing and , through it all , [[humanistic]] .
+mostly , [goldbacher] just lets her complicated characters be [[disorderly]] , confusing and , through it all , [[humans]] .
 
 
 --------------------------------------------- Result 4 ---------------------------------------------
 [[Positive (99%)]] --> [[Negative (90%)]]
 
-. . . [[quite]] good at [[providing]] some good old fashioned [[spooks]] .
+. . . [[quite]] good at [[providing]] some good old [[fashioned]] [[spooks]] .
 
-. . . [[rather]] good at [[provision]] some good old fashioned [[bugging]] .
+. . . [[fairly]] good at [[deliver]] some good old [[sculpted]] [[bugging]] .
 
 
 +-------------------------------+--------+
 | Attack Results                |        |
 +-------------------------------+--------+
-| Number of successful attacks: | 4      |
-| Number of failed attacks:     | 0      |
+| Number of successful attacks: | 3      |
+| Number of failed attacks:     | 1      |
 | Number of skipped attacks:    | 0      |
 | Original accuracy:            | 100.0% |
-| Accuracy under attack:        | 0.0%   |
-| Attack success rate:          | 100.0% |
-| Average perturbed word %:     | 17.56% |
+| Accuracy under attack:        | 25.0%  |
+| Attack success rate:          | 75.0%  |
+| Average perturbed word %:     | 21.56% |
 | Average num. 
words per input: | 16.25  |
-| Avg num queries:              | 38.5   |
+| Avg num queries:              | 33.0   |
 +-------------------------------+--------+
diff --git a/tests/test_transformations.py b/tests/test_transformations.py
index 589cc5b6c..506d267a6 100644
--- a/tests/test_transformations.py
+++ b/tests/test_transformations.py
@@ -57,3 +57,54 @@ def test_word_swap_change_name():
     for entity in augmented_text.get_spans("ner"):
         entity_augmented.append(entity.tag)
     assert entity_original == entity_augmented
+
+
+def test_chinese_morphonym_character_swap():
+    from textattack.augmentation import Augmenter
+    from textattack.transformations.word_swaps.chn_transformations import (
+        ChineseMorphonymCharacterSwap,
+    )
+
+    augmenter = Augmenter(
+        transformation=ChineseMorphonymCharacterSwap(),
+        pct_words_to_swap=0.1,
+        transformations_per_example=5,
+    )
+    s = "自然语言处理。"
+    augmented_text_list = augmenter.augment(s)
+    augmented_s = "自然语言处埋。"
+    assert augmented_s in augmented_text_list or s in augmented_text_list
+
+
+def test_chinese_word_swap_hownet():
+    from textattack.augmentation import Augmenter
+    from textattack.transformations.word_swaps.chn_transformations import (
+        ChineseWordSwapHowNet,
+    )
+
+    augmenter = Augmenter(
+        transformation=ChineseWordSwapHowNet(),
+        pct_words_to_swap=0.1,
+        transformations_per_example=5,
+    )
+    s = "自然语言。"
+    augmented_text_list = augmenter.augment(s)
+    augmented_s = "中间语言。"
+    assert augmented_s in augmented_text_list or s in augmented_text_list
+
+
+def test_chinese_word_swap_masked():
+    from textattack.augmentation import Augmenter
+    from textattack.transformations.word_swaps.chn_transformations import (
+        ChineseWordSwapMaskedLM,
+    )
+
+    augmenter = Augmenter(
+        transformation=ChineseWordSwapMaskedLM(),
+        pct_words_to_swap=0.1,
+        transformations_per_example=5,
+    )
+    s = "自然语言处理。"
+    augmented_text_list = augmenter.augment(s)
+    augmented_s = "自然语言文字。"
+    assert augmented_s in augmented_text_list or s in augmented_text_list
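The three tests above cover the HowNet, masked-LM, and morphonym transformations; the homophone swap added by this PR is left untested. A test in the same pattern might look like the following sketch, assuming `ChineseHomophoneCharacterSwap` lives in the same `chn_transformations` module as its siblings; since the swap is table-driven and exact outputs are not verified here, only a weak assertion is made:

```python
def test_chinese_homophone_character_swap():
    from textattack.augmentation import Augmenter
    from textattack.transformations.word_swaps.chn_transformations import (
        ChineseHomophoneCharacterSwap,
    )

    augmenter = Augmenter(
        transformation=ChineseHomophoneCharacterSwap(),
        pct_words_to_swap=0.1,
        transformations_per_example=5,
    )
    s = "自然语言处理。"
    augmented_text_list = augmenter.augment(s)
    # The homophone table decides the exact candidates, so assert only
    # that augmentation returned at least one sentence.
    assert len(augmented_text_list) > 0
```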
diff --git a/textattack/attack.py b/textattack/attack.py
index 629463919..3369a3149 100644
--- a/textattack/attack.py
+++ b/textattack/attack.py
@@ -57,18 +57,20 @@ class Attack:
         >>> # Construct our four components for `Attack`
         >>> from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
         >>> from textattack.constraints.semantics import WordEmbeddingDistance
+        >>> from textattack.transformations import WordSwapEmbedding
+        >>> from textattack.search_methods import GreedyWordSwapWIR
         >>> goal_function = textattack.goal_functions.UntargetedClassification(model_wrapper)
         >>> constraints = [
         ...     RepeatModification(),
-        ...     StopwordModification()
+        ...     StopwordModification(),
         ...     WordEmbeddingDistance(min_cos_sim=0.9)
         ... ]
         >>> transformation = WordSwapEmbedding(max_candidates=50)
         >>> search_method = GreedyWordSwapWIR(wir_method="delete")
         >>> # Construct the actual attack
-        >>> attack = Attack(goal_function, constraints, transformation, search_method)
+        >>> attack = textattack.Attack(goal_function, constraints, transformation, search_method)
         >>> input_text = "I really enjoyed the new movie that came out last month."
         >>> label = 1 #Positive
diff --git a/textattack/attack_args.py b/textattack/attack_args.py
index c33cc26b2..3521ecc8c 100644
--- a/textattack/attack_args.py
+++ b/textattack/attack_args.py
@@ -708,6 +708,7 @@ def _create_attack_from_args(cls, args, model_wrapper):
         if args.query_budget:
             recipe.goal_function.query_budget = args.query_budget
         recipe.goal_function.model_cache_size = args.model_cache_size
+        recipe.goal_function.batch_size = args.model_batch_size
         recipe.constraint_cache_size = args.constraint_cache_size
         return recipe
     elif args.attack_from_file:
diff --git a/textattack/attack_recipes/__init__.py b/textattack/attack_recipes/__init__.py
index 1a903fee6..6e865ddee 100644
--- a/textattack/attack_recipes/__init__.py
+++ b/textattack/attack_recipes/__init__.py
@@ -41,3 +41,4 @@
 from .clare_li_2020 import CLARE2020
 from .french_recipe import FrenchRecipe
 from .spanish_recipe import SpanishRecipe
+from .chinese_recipe import ChineseRecipe
diff --git a/textattack/attack_recipes/chinese_recipe.py b/textattack/attack_recipes/chinese_recipe.py
new file mode 100644
index 000000000..f72be2a31
--- /dev/null
+++ b/textattack/attack_recipes/chinese_recipe.py
@@ -0,0 +1,52 @@
+import string
+
+from textattack import Attack
+from textattack.constraints.pre_transformation import (
+    RepeatModification,
+    StopwordModification,
+)
+from textattack.goal_functions import UntargetedClassification
+from textattack.search_methods import GreedyWordSwapWIR
+from textattack.shared.data import CHN_STOPWORD
+from textattack.transformations import (
+    ChineseHomophoneCharacterSwap,
+    ChineseMorphonymCharacterSwap,
+    ChineseWordSwapHowNet,
+    ChineseWordSwapMaskedLM,
+    CompositeTransformation,
+)
+
+from .attack_recipe import AttackRecipe
+
+
+class ChineseRecipe(AttackRecipe):
+    """A Chinese adversarial attack recipe, as described in "Expanding Scope:
+    Adapting English Adversarial Attacks to Chinese", Liu et al., 2023.
+
+    It combines HowNet synonym swaps, masked-LM word swaps, and
+    morphonym/homophone character swaps under an untargeted goal function
+    and a greedy weighted-saliency search.
+
+    https://arxiv.org/abs/2306.04874
+    """
+
+    @staticmethod
+    def build(model_wrapper):
+        transformation = CompositeTransformation(
+            [
+                ChineseWordSwapHowNet(),
+                ChineseWordSwapMaskedLM(),
+                ChineseMorphonymCharacterSwap(),
+                ChineseHomophoneCharacterSwap(),
+            ]
+        )
+
+        stopwords = CHN_STOPWORD.union(set(string.punctuation))
+
+        # Keep words from being modified more than once, and leave Chinese
+        # stopwords and punctuation unperturbed.
+        constraints = [RepeatModification(), StopwordModification(stopwords=stopwords)]
+
+        # Untargeted attack & greedy search with weighted saliency
+        goal_function = UntargetedClassification(model_wrapper)
+        search_method = GreedyWordSwapWIR(wir_method="weighted-saliency")
+
+        return Attack(goal_function, constraints, transformation, search_method)
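To make the new recipe concrete, here is a minimal usage sketch, not part of the patch; the model is the one used in the tutorial notebook, while the dataset choice is purely illustrative and any Chinese sequence-classification dataset would do:

```python
import transformers

from textattack import AttackArgs, Attacker
from textattack.attack_recipes import ChineseRecipe
from textattack.datasets import HuggingFaceDataset
from textattack.models.wrappers import HuggingFaceModelWrapper

# Wrap the Chinese news-classification model used in the tutorial notebook.
name = "uer/roberta-base-finetuned-chinanews-chinese"
model = transformers.AutoModelForSequenceClassification.from_pretrained(name)
tokenizer = transformers.AutoTokenizer.from_pretrained(name)
model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# Build the composite attack (HowNet, masked-LM, morphonym, and homophone
# swaps) and run it on a handful of examples.
attack = ChineseRecipe.build(model_wrapper)
dataset = HuggingFaceDataset("amazon_reviews_multi", "zh", split="test")  # illustrative
attacker = Attacker(attack, dataset, AttackArgs(num_examples=5))
attacker.attack_dataset()
```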
diff --git a/textattack/attack_recipes/morpheus_tan_2020.py b/textattack/attack_recipes/morpheus_tan_2020.py
index edf8ae790..b98360a53 100644
--- a/textattack/attack_recipes/morpheus_tan_2020.py
+++ b/textattack/attack_recipes/morpheus_tan_2020.py
@@ -27,7 +27,6 @@ class MorpheusTan2020(AttackRecipe):
 
     @staticmethod
     def build(model_wrapper):
-
         #
         # Goal is to minimize BLEU score between the model output given for the
         # perturbed input sequence and the reference translation
diff --git a/textattack/attack_recipes/seq2sick_cheng_2018_blackbox.py b/textattack/attack_recipes/seq2sick_cheng_2018_blackbox.py
index de800c522..86b79aa23 100644
--- a/textattack/attack_recipes/seq2sick_cheng_2018_blackbox.py
+++ b/textattack/attack_recipes/seq2sick_cheng_2018_blackbox.py
@@ -31,7 +31,6 @@ class Seq2SickCheng2018BlackBox(AttackRecipe):
 
     @staticmethod
     def build(model_wrapper, goal_function="non_overlapping"):
-
# diff --git a/textattack/commands/augment_command.py b/textattack/commands/augment_command.py index 118fe0150..2883ded76 100644 --- a/textattack/commands/augment_command.py +++ b/textattack/commands/augment_command.py @@ -32,7 +32,6 @@ def run(self, args): args = textattack.AugmenterArgs(**vars(args)) if args.interactive: - print("\nRunning in interactive mode...\n") augmenter = eval(AUGMENTATION_RECIPE_NAMES[args.recipe])( pct_words_to_swap=args.pct_words_to_swap, diff --git a/textattack/commands/eval_model_command.py b/textattack/commands/eval_model_command.py index 16cbfd2fa..7957fbfee 100644 --- a/textattack/commands/eval_model_command.py +++ b/textattack/commands/eval_model_command.py @@ -56,7 +56,7 @@ def test_model_on_dataset(self, args): while i < min(args.num_examples, len(dataset)): dataset_batch = dataset[i : min(args.num_examples, i + args.batch_size)] batch_inputs = [] - for (text_input, ground_truth_output) in dataset_batch: + for text_input, ground_truth_output in dataset_batch: attacked_text = textattack.shared.AttackedText(text_input) batch_inputs.append(attacked_text.tokenizer_input) ground_truth_outputs.append(ground_truth_output) diff --git a/textattack/constraints/overlap/max_words_perturbed.py b/textattack/constraints/overlap/max_words_perturbed.py index b919978c9..8d09a4108 100644 --- a/textattack/constraints/overlap/max_words_perturbed.py +++ b/textattack/constraints/overlap/max_words_perturbed.py @@ -38,7 +38,6 @@ def __init__( self.max_percent = max_percent def _check_constraint(self, transformed_text, reference_text): - num_words_diff = len(transformed_text.all_words_diff(reference_text)) if self.max_percent: min_num_words = min(len(transformed_text.words), len(reference_text.words)) diff --git a/textattack/datasets/dataset.py b/textattack/datasets/dataset.py index c56931adc..53c924733 100644 --- a/textattack/datasets/dataset.py +++ b/textattack/datasets/dataset.py @@ -125,7 +125,7 @@ def filter_by_labels_(self, labels_to_keep): """ if not isinstance(labels_to_keep, set): labels_to_keep = set(labels_to_keep) - self._dataset = filter(lambda x: x[1] in labels_to_keep, self._dataset) + self._dataset = list(filter(lambda x: x[1] in labels_to_keep, self._dataset)) def __getitem__(self, i): """Return i-th sample.""" diff --git a/textattack/goal_function_results/classification_goal_function_result.py b/textattack/goal_function_results/classification_goal_function_result.py index 3a70ded8e..1b9aaf532 100644 --- a/textattack/goal_function_results/classification_goal_function_result.py +++ b/textattack/goal_function_results/classification_goal_function_result.py @@ -26,7 +26,6 @@ def __init__( num_queries, ground_truth_output, ): - super().__init__( attacked_text, raw_output, diff --git a/textattack/goal_function_results/text_to_text_goal_function_result.py b/textattack/goal_function_results/text_to_text_goal_function_result.py index eae8d91e5..c50e2c11f 100644 --- a/textattack/goal_function_results/text_to_text_goal_function_result.py +++ b/textattack/goal_function_results/text_to_text_goal_function_result.py @@ -23,7 +23,6 @@ def __init__( num_queries, ground_truth_output, ): - super().__init__( attacked_text, raw_output, diff --git a/textattack/goal_functions/goal_function.py b/textattack/goal_functions/goal_function.py index 16f498301..78693f670 100644 --- a/textattack/goal_functions/goal_function.py +++ b/textattack/goal_functions/goal_function.py @@ -176,13 +176,15 @@ def _call_model_uncached(self, attacked_text_list): if isinstance(batch_preds, list): 
outputs.extend(batch_preds) elif isinstance(batch_preds, np.ndarray): - outputs.append(torch.tensor(batch_preds)) + outputs.append(batch_preds) else: outputs.append(batch_preds) i += self.batch_size if isinstance(outputs[0], torch.Tensor): outputs = torch.cat(outputs, dim=0) + elif isinstance(outputs[0], np.ndarray): + outputs = np.concatenate(outputs).ravel() assert len(inputs) == len( outputs diff --git a/textattack/loggers/weights_and_biases_logger.py b/textattack/loggers/weights_and_biases_logger.py index 6a8303117..7b9990421 100644 --- a/textattack/loggers/weights_and_biases_logger.py +++ b/textattack/loggers/weights_and_biases_logger.py @@ -13,7 +13,6 @@ class WeightsAndBiasesLogger(Logger): """Logs attack results to Weights & Biases.""" def __init__(self, **kwargs): - global wandb wandb = LazyLoader("wandb", globals(), "wandb") diff --git a/textattack/metrics/quality_metrics/perplexity.py b/textattack/metrics/quality_metrics/perplexity.py index e22175219..f1572591f 100644 --- a/textattack/metrics/quality_metrics/perplexity.py +++ b/textattack/metrics/quality_metrics/perplexity.py @@ -94,7 +94,6 @@ def calculate(self, results): return self.all_metrics def calc_ppl(self, texts): - with torch.no_grad(): text = " ".join(texts) eval_loss = [] diff --git a/textattack/search_methods/greedy_word_swap_wir.py b/textattack/search_methods/greedy_word_swap_wir.py index ac17fbf30..5721ce6b6 100644 --- a/textattack/search_methods/greedy_word_swap_wir.py +++ b/textattack/search_methods/greedy_word_swap_wir.py @@ -65,7 +65,6 @@ def _get_index_order(self, initial_text): # compute the largest change in score we can find by swapping each word delta_ps = [] for idx in indices_to_order: - # Exit Loop when search_over is True - but we need to make sure delta_ps # is the same size as softmax_saliency_scores if search_over: diff --git a/textattack/shared/attacked_text.py b/textattack/shared/attacked_text.py index 11d27bfb2..4616b467e 100644 --- a/textattack/shared/attacked_text.py +++ b/textattack/shared/attacked_text.py @@ -259,6 +259,7 @@ def ith_word_diff(self, other_attacked_text: AttackedText, i: int) -> bool: def words_diff_num(self, other_attacked_text: AttackedText) -> int: """The number of words different between two AttackedText objects.""" + # using edit distance to calculate words diff num def generate_tokens(words): result = {} diff --git a/textattack/shared/data.py b/textattack/shared/data.py index 9675fa960..37594f57e 100644 --- a/textattack/shared/data.py +++ b/textattack/shared/data.py @@ -9333,3 +9333,1307 @@ EXTENSION_MAP = {"ain't": "isn't", "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "could've": 'could have', "couldn't": 'could not', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'd've": 'he would have', "he'll": 'he will', "he's": 'he is', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'll": 'I will', "I'm": 'I am', "I've": 'I have', "i'd": 'i would', "i'll": 'i will', "i'm": 'i am', "i've": 'i have', "isn't": 'is not', "it'd": 'it would', "it'll": 'it will', "it's": 'it is', "ma'am": 'madam', "might've": 'might have', "mightn't": 'might not', "must've": 'must have', "mustn't": 'must not', "needn't": 'need not', "oughtn't": 'ought not', "shan't": 'shall not', "she'd": 'she would', "she'll": 'she will', "she's": 'she is', "should've": 'should have', "shouldn't": 'should not', "that'd": 'that would', 
"that's": 'that is', "there'd": 'there would', "there's": 'there is', "they'd": 'they would', "they'll": 'they will', "they're": 'they are', "they've": 'they have', "wasn't": 'was not', "we'd": 'we would', "we'll": 'we will', "we're": 'we are', "we've": 'we have', "weren't": 'were not', "what're": 'what are', "what's": 'what is', "when's": 'when is', "where'd": 'where did', "where's": 'where is', "where've": 'where have', "who'll": 'who will', "who's": 'who is', "who've": 'who have', "why's": 'why is', "won't": 'will not', "would've": 'would have', "wouldn't": 'would not', "you'd": 'you would', "you'd've": 'you would have', "you'll": 'you will', "you're": 'you are', "you've": 'you have'} # fmt: on + +MORPHONYM_LS = [ + ["延", "诞", "蜒"], + ["彦", "颜", "谚"], + ["扬", "杨", "汤", "场", "肠"], + ["夭", "袄", "沃", "跃", "妖", ""], + ["遥", "摇", "瑶", "谣"], + ["也", "弛", "驰", "施"], + ["亦", "迹", "峦", "恋", "变", "弈", "奕", "蛮"], + ["易", "惕", "踢", "剔", "锡", "赐"], + ["甬", "通", "痛", "桶", "诵", "捅", "俑", "涌", "用", "拥", "佣", ""], + ["由", "迪", "笛", "油", "邮", "抽", "袖", "柚", "庙"], + ["又", "权", "杈", ""], + ["于", "宇", "吁", "迂"], + ["鱼", "鳅", "鲜", "鳍", "鲸", "鲇"], + ["羽", "翔", "翩", "翘", "翻", "翅", "翱", "翠"], + ["聿", "律", "津"], + ["员", "陨", "损"], + ["援", "暖", "缓"], + ["月", "朋", "膊", "脯", "育", "肓", "脊", "背"], + ["匀", "均", "钧"], + ["则", "测", "侧", "铡", ""], + ["乍", "作", "昨", "诈", "炸", ""], + ["斩", "渐", "崭", "暂"], + ["占", "沾", "粘", "站", "战", "黏", "帖", "贴", "玷"], + ["召", "招", "照", "沼"], + ["者", "都", "煮", "暑", "署", "躇", "诸", "绪", "赌", "睹", "堵"], + ["诊", "珍", "趁"], + ["之", "乏", "芝", "泛"], + ["直", "植", "值", "殖", "置"], + ["只", "识", "织", "职", "枳", "帜"], + ["舟", "航", "舰"], + ["主", "注", "往", "柱", "驻", "住"], + ["状", "壮", "妆"], + ["兹", "慈", "滋", "磁", ""], + ["走", "趣", "趋", "越", "起", "趟", "超", "陡", "徒"], + ["坐", "座", "挫"], + ["半", "伴", "拌", "绊", "叛", "判"], + ["孚", "俘", "浮"], + ["秋", "愁", "揪", "鳅"], + ["屈", "掘", "倔"], + ["容", "蓉", "熔", "溶", "榕"], + ["尚", "躺", "淌", "趟"], + ["少", "妙", "纱", "抄", "沙", ""], + ["身", "射", "躲"], + ["生", "性", "姓", "星"], + ["氏", "纸", "低", "底", "抵"], + ["市", "闹", "柿"], + ["式", "拭", "试"], + ["寿", "涛", "祷", "踌", "筹", "铸", "畴", "俦"], + ["叔", "淑", "椒", "督"], + ["寺", "持", "等", "待", "诗", "侍", "特", "恃", "峙"], + ["廷", "挺", "庭", "霆", "艇", "蜓"], + ["宛", "碗", "婉", "腕", "蜿", "惋"], + ["王", "斑", "班", "狂", "枉", "琴", "瑟"], + ["韦", "伟", "苇", "纬"], + ["我", "峨", "娥", "鹅"], + ["昔", "猎", "借", "错", "蜡", "惜", "腊", "鹊", "措"], + ["咸", "减", "喊"], + ["相", "箱", "霜", "湘"], + ["肖", "消", "梢", "销", "捎", "悄", "哨", "稍", "硝"], + ["秀", "锈", "绣", "诱", "透"], + ["玄", "弦", "舷", "眩"], + ["寻", "灵", "雪", "扫"], + ["兰", "拦", "栏", "烂"], + ["劳", "涝", "捞"], + ["里", "童", "埋", "理", "狸", "暑", "著"], + ["历", "厉", "历", "励", "沥"], + ["利", "俐", "犁", "梨"], + ["良", "娘", "狼", "酿"], + ["列", "例", "烈", "裂", "冽", "咧"], + ["临", "监", "鉴", "篮", "蓝"], + ["令", "怜", "伶", "邻", "冷", "领", "龄", "铃", "岭", "玲", "拎"], + ["龙", "拢", "笼", "庞", "宠", "茏", "垄"], + ["录", "碌", "绿", "逮", "剥"], + ["率", "摔", "蟀"], + ["罗", "萝", "箩"], + ["马", "驼", "驮", "驱", "驰", "妈", "吗", "骂"], + ["卖", "续", "读"], + ["毛", "毡", "毯", "毫"], + ["门", "闲", "闷", "闭", "闯", "阔", "闪"], + ["免", "挽", "勉", "冕", "晚", "娩", "搀", "馋", "逸"], + ["苗", "描", "猫", "瞄"], + ["莫", "墓", "暮", "幕", "慕", "模", "摸", "摹", "漠"], + ["木", "林", "材", "村", "柄", "栖", "柩", "框", "沐"], + ["那", "哪", "挪", "娜"], + ["疒", "瘦", "病", "疗", "疼", "痒"], + ["宁", "狞", "拧"], + ["奴", "努", "怒", "恕"], + ["旁", "榜", "膀", "傍", "磅"], + ["票", "飘", "漂", "膘"], + ["其", "斯", "期", "欺", "旗"], + ["契", "楔", "揳"], + ["千", "纤", "迁"], + ["欠", "炊", "吹", "欢", "饮", "坎"], + ["切", "彻", "砌", "沏"], + ["高", 
"稿", "搞"], + ["鬲", "隔", "融", "嗝"], + ["亘", "恒", "宣", "喧", "楦", "渲", "桓", "垣", "晅", "萱", "暄", "喧", "瑄", "烜", "楦"], + ["更", "硬", "便", "梗", "更"], + ["勾", "构", "钩", "沟"], + ["谷", "俗", "裕", "豁", "浴"], + ["瓜", "孤", "狐", "抓"], + ["贯", "惯", "贯", "掼"], + ["圭", "蛙", "娃", "洼", "桂", "挂", "佳", "涯", "崖", "封", "畦"], + ["贵", "溃", "遗", ""], + ["果", "棵", "颗", "课", "稞"], + ["合", "哈", "拾", "答", "给", "塔", "搭", "恰"], + ["黑", "默", "墨", "黝"], + ["虎", "虚", "虑", "虔"], + ["奂", "焕", "涣", "换", "焕"], + ["灰", "诙", "恢", "碳", "炭"], + ["及", "级", "极", "汲", "吸", "圾"], + ["急", "稳", "隐", "瘾"], + ["己", "记", "纪", "妃"], + ["加", "驾", "架"], + ["家", "稼", "嫁"], + ["监", "滥", "槛"], + ["建", "健", "键"], + ["键", "健"], + ["奖", "桨", "浆", "酱"], + ["皆", "楷", "谐"], + ["介", "价", "阶"], + ["斤", "折", "拆", "析", "近", "浙", "哲", "晰"], + ["京", "凉", "谅", "晾", "景", "惊", "掠"], + ["径", "经", "泾"], + ["敬", "警", "擎", "儆"], + ["句", "苟", "句"], + ["具", "惧", "俱"], + ["诀", "决", "快", "块", "缺"], + ["军", "浑", "挥", "晕", "晖", "辉"], + ["峻", "俊", "骏", "竣", "浚", "悛", "逡", "唆", "梭", "焌"], + ["亢", "坑", "炕", "抗", "吭"], + ["白", "怕", "帕", "伯", "拍", "泊", "柏", "陌", "珀"], + ["办", "苏", "协", "胁"], + ["包", "跑", "炮", "泡", "抱", "袍", "饱", "苞", "刨", "咆"], + ["卑", "脾", "牌", "碑"], + ["贲", "喷", "愤"], + ["必", "密", "蜜", "秘"], + ["辟", "避", "癖", "劈", "壁", "璧"], + ["并", "拼", "饼", "迸"], + ["搏", "博", "傅", "薄", "礴", "缚"], + ["不", "坏", "环"], + ["才", "财", "材"], + ["参", "掺", "惨", "渗"], + ["曹", "糟", "嘈", "遭", "槽"], + ["涨", "胀", "张"], + ["澈", "撤", "辙"], + ["成", "城", "诚", "盛"], + ["丑", "扭", "钮", "纽"], + ["刍", "皱", "煞", "邹"], + ["喘", "揣", "端", "湍", "瑞", "惴"], + ["垂", "陲", "睡", "锤", "棰", "捶"], + ["次", "资", "咨", "姿"], + ["崔", "摧", "催"], + ["旦", "胆", "但", "担", "坦"], + ["登", "凳", "橙", "蹬", "澄"], + ["甸", "句"], + ["东", "冻", "栋"], + ["段", "断"], + ["多", "侈", "移", "够", "哆"], + ["耳", "耻", "职", "联", "聘", "饵", "茸", "耸", "娉", "俜", "骋"], + ["反", "版", "板", "饭", "返"], + ["非", "菲", "霏", "排", "悲", "匪", "辈", "徘"], + ["风", "讽", "枫", "飘", "飚", "飒", "疯"], + ["奉", "棒", "捧"], + ["弗", "沸", "拂", "佛"], + ["甫", "捕", "辅", "哺", "铺", "搏", "脯", "膊", "蒲", "敷"], + ["复", "履", "覆"], + ["甘", "钳", "甜", "柑"], + ["婵", "蝉", "箪", "殚", "掸", "惮", "禅"], + ["颁", "颔", "颌", "颀", "硕", "颐"], + ["妲", "怛", "袒"], + ["秕", "妣", "庇", "毖", "纰", "砒", "毗", "枇", "蚍"], + ["睢", "雎", "哺", "捕", "脯", "铺", "匍", "匐", "圃"], + ["烩", "荟", "桧", "侩", "刽"], + ["牺", "栖", "洒", "晒", "哂"], + ["龚", "龛", "詟", "垄", "陇"], + ["谬", "缪", "缪", "戮", "戳"], + ["揩", "楷", "锴", "谐", "偕", "喈"], + ["戢", "缉", "楫", "辑"], + ["犄", "犄", "掎", "犄", "畸", "崎", "绮", "漪", "旖", "倚"], + ["劼", "桔", "桔", "诘", "拮", "枯"], + ["龌", "龊", "龃", "龉"], + ["怠", "殆", "骀", "饴", "怡", "贻", "贻"], + ["囊", "壤", "攘", "镶", "嚷", "瓤"], + ["麻", "磨", "蘑", "摩", "靡", "魔", "麾"], + ["疆", "僵"], + ["赞", "攒"], + ["辟", "避", "璧", "譬", "僻", "臂", "壁", "劈"], + ["复", "腹", "覆", "馥", "蝮", "履"], + ["焦", "蕉", "礁", "瞧", "憔", "樵"], + ["付", "附", "咐", "驸", "府", "俯", "腐"], + ["攀", "拳", "掌", "撑"], + ["箱", "相", "湘", "厢", "想"], + ["铺", "捕", "哺", "埔", "甫", "辅", "圃", "匍", "蒲"], + ["景", "影"], + ["尚", "淌", "倘", "躺", "趟"], + ["朋", "棚", "鹏"], + ["替", "潜"], + ["鬼", "槐", "愧", "魂", "魄", "魔"], + ["央", "奂", "涣", "唤", "换", "焕", "映", "英"], + ["昆", "混", "棍"], + ["曼", "漫", "慢", "蔓", "谩", "幔", "馒"], + ["莫", "漠", "寞", "摸", "模", "膜"], + ["象", "像", "橡"], + ["告", "浩", "皓", "靠", "诰", "梏", "鹄"], + ["漆", "膝"], + ["繁", "敏"], + ["亭", "停", "婷"], + ["班", "斑"], + ["具", "俱", "惧", "飓"], + ["正", "证", "症", "政", "征"], + ["留", "溜", "榴", "榴"], + ["旦", "担", "坦"], + ["非", "韭", "徘", "辈", "悲", "斐", "裴", "靠", "扉", "霏", "菲", "匪", "蜚", "排"], + ["旬", "询", "殉"], + ["刑", "型"], + ["弟", "第", "递", "梯", 
"剃", "涕"], + ["兆", "跳", "眺", "挑", "桃", "逃", "佻"], + ["京", "惊", "凉", "晾", "谅", "掠"], + ["巨", "拒", "炬", "距", "矩", "柜"], + ["参", "惨", "渗"], + ["居", "剧", "据", "倨", "锯", "踞"], + ["夸", "挎", "垮", "胯", "跨"], + ["萄", "淘", "陶", "掏"], + ["丰", "峰", "锋", "烽", "蜂", "逢", "缝", "蓬"], + ["扁", "匾", "偏", "翩", "篇", "遍", "骗", "编", "蝙"], + ["争", "筝", "铮", "峥", "挣", "诤", "狰", "净", "静"], + ["者", "诸", "猪", "储", "赌", "睹", "堵", "都", "煮"], + ["旁", "滂", "螃", "榜", "膀", "傍", "谤", "磅", "镑"], + ["黑", "墨", "默", "黩", "黯", "黔"], + ["召", "诏", "招", "昭", "沼"], + ["蹈", "稻", "滔", "韬"], + ["干", "杆", "竿", "汗"], + ["高", "篙", "稿", "搞", "缟"], + ["建", "健", "毽", "腱", "键"], + ["史", "驶", "使"], + ["仰", "昂", "迎", "抑"], + ["烧", "浇", "挠"], + ["台", "抬", "胎", "苔", "怡", "治", "冶", "始"], + ["占", "钻", "贴", "粘"], + ["皮", "披", "波", "菠", "坡", "彼"], + ["挂", "桂", "洼", "封", "卦", "娃", "蛙", "佳", "哇"], + ["古", "枯", "估", "故", "做"], + ["帝", "啼", "谛", "缔", "蒂", "蹄"], + ["容", "溶", "榕"], + ["汛", "迅", "讯"], + ["肖", "消", "悄", "稍", "捎", "霄", "哨"], + ["包", "饱", "泡", "抱", "炮", "袍"], + ["不", "丕", "歪", "否", "坏", "怀", "环", "环"], + ["今", "令", "邻", "领", "翎", "冷", "拎", "玲", "铃", "伶", "怜"], + ["上", "止", "址", "让", "企", "扯", "肯"], + ["至", "到", "倒", "侄", "致"], + ["青", "清", "晴", "情", "晴", "静", "睛", "精", "猜", "靓", "靛", "倩", "靓"], + ["白", "怕", "拍", "伯", "泊", "柏"], + ["欠", "次", "软", "低", "吹", "砍", "欣", "欢"], + ["式", "试", "拭", "轼"], + ["十", "什", "计", "针", "叶", "汁"], + ["弓", "引", "弯", "湾"], + ["勺", "匀", "勾", "钓", "均", "钩", "沟"], + ["斥", "诉", "拆"], + ["西", "洒", "晒", "酒"], + ["登", "凳", "橙", "噔", "蹬", "瞪"], + ["昔", "惜", "措", "错", "腊", "蜡"], + ["傲", "熬", "赘"], + ["偶", "遇", "寓", "藕", "隅"], + ["比", "此", "些"], + ["童", "撞", "幢"], + ["仓", "苍", "沧", "抢", "枪", "疮", "呛", "炝"], + ["部", "剖", "陪", "培", "倍", "赔"], + ["八", "扒", "趴", "穴"], + ["咸", "减", "喊", "感"], + ["力", "历", "沥", "枥", "厉", "励", "砺"], + ["状", "壮"], + ["袄", "妖"], + ["仗", "杖"], + ["废", "疲"], + ["促", "捉"], + ["灾", "灭"], + ["并", "开"], + ["创", "枪"], + ["委", "萎"], + ["品", "晶"], + ["坚", "竖"], + ["国", "固"], + ["拾", "给"], + ["熟", "热"], + ["刮", "乱"], + ["室", "宝"], + ["兽", "曾"], + ["嬴", "蠃", "羸", "赢"], + ["椽", "喙", "蠡", "掾", "缘"], + ["忻", "沂", "坎", "斫", "昕"], + ["戍", "戎", "戊", "戌"], + ["圩", "盱", "纡", "吁"], + ["婺", "骛", "鹜"], + ["柝", "坼", "祗", "诋", "邸", "柢", "砥", "抵", "抵", "泜", "胝"], + ["醇", "淳", "谆", "敦"], + ["肄", "肆"], + ["苘", "茼"], + ["祛", "怯"], + ["厮", "撕"], + ["宵", "霄"], + ["粟", "栗"], + ["敝", "弊", "蔽"], + ["澄", "橙"], + ["蓝", "篮"], + ["妨", "彷"], + ["晤", "悟"], + ["嬉", "禧"], + ["谡", "稷"], + ["崇", "祟"], + ["蛰", "蜇"], + ["掣", "擎"], + ["箫", "萧"], + ["称", "你"], + ["糖", "塘"], + ["掩", "淹"], + ["因", "困"], + ["努", "怒"], + ["调", "凋"], + ["奋", "备"], + ["取", "职"], + ["约", "钓"], + ["怕", "帕"], + ["摘", "滴"], + ["庆", "厌"], + ["雀", "省"], + ["左", "在"], + ["票", "栗"], + ["塔", "搭"], + ["帅", "师"], + ["尊", "奠"], + ["区", "匹", ""], + ["伐", "代", ""], + ["豪", "毫", ""], + ["右", "石"], + ["屋", "层"], + ["伯", "柏"], + ["影", "景"], + ["管", "馆"], + ["茵", "菌"], + ["思", "恩"], + ["类", "粪"], + ["考", "老"], + ["尤", "龙"], + ["暑", "署"], + ["脏", "桩"], + ["苟", "苞"], + ["汗", "汁"], + ["内", "肉"], + ["找", "戏"], + ["埋", "理"], + ["绳", "蝇"], + ["度", "席"], + ["厉", "历"], + ["甩", "用"], + ["辨", "辩", "瓣"], + ["喂", "偎", "畏"], + ["传", "转", "砖"], + ["讯", "迅", "汛"], + ["挣", "净", "睁"], + ["炉", "庐", "护"], + ["瓜", "爪", "弧"], + ["掉", "卓", "桌"], + ["盒", "盘", "盆"], + ["堂", "党", "赏"], + ["参", "惨", "渗"], + ["艰", "银", "很", "恨", "狠", "跟"], + ["样", "洋", "鲜", "祥", "详"], + ["湖", "糊", "蝴", "瑚", "葫"], + ["枯", "姑", "估"], + ["榆", "愉", "喻"], + ["顽", "烦", "顿"], + ["格", "骆", "络"], + ["洒", "晒", "酒"], + ["忙", "芒", "茫"], + 
["待", "诗", "特"], + ["肚", "吐", "杜"], + ["乖", "乘", "剩"], + ["飘", "漂", "瞟"], + ["织", "识", "职"], + ["快", "块", "夸"], + ["爱", "受", "援"], + ["愿", "源", "原"], + ["痛", "疼", "病"], + ["池", "地", "驰"], + ["闻", "问", "闷"], + ["视", "砚", "现"], + ["坏", "怀", "环", "还"], + ["洗", "宪", "冼", "选"], + ["彩", "踩", "菜", "睬"], + ["掏", "淘", "陶", "萄"], + ["冷", "领", "铃", "怜"], + ["杨", "汤", "场", "扬"], + ["义", "议", "仪", "蚁"], + ["眨", "泛", "乏", "之"], + ["份", "粉", "纷", "分"], + ["凉", "谅", "晾", "惊"], + ["板", "饭", "返", "扳", "贩"], + ["防", "访", "纺", "仿", "妨"], + ["彼", "披", "破", "坡", "波"], + ["缝", "逢", "峰", "烽", "蜂"], + ["贴", "帖", "粘", "站"], + ["订", "盯", "钉", "叮"], + ["油", "宙", "笛", "邮"], + ["籍", "藉", "误", "娱"], + ["渴", "竭", "碣", "谒"], + ["将", "奖", "浆", "蒋"], + ["熬", "傲", "遨", "鏖"], + ["稿", "篙", "嵩", "蒿"], + ["驿", "泽", "择", "译"], + ["蓝", "篮", "监", "临"], + ["悲", "辈", "菲", "翡"], + ["框", "筐", "眶", "狂"], + ["息", "熄"], + ["哀", "衰", "蓑", "猿"], + ["堂", "棠", "裳", "赏"], + ["抚", "芜", "拴", "栓"], + ["府", "付", "附", "附"], + ["货", "袋", "贷", "代"], + ["参", "惨", "渗", "掺"], + ["姆", "母", "拇"], + ["镶", "壤", "攘", "嚷"], + ["旺", "汪", "茁", "拙"], + ["慕", "幕", "墓", "暮"], + ["梯", "弟", "涕", "递", "挨", "埃", "唉"], + ["磁", "滋", "糍", "慈"], + ["烂", "栏", "拦", "兰"], + ["撕", "嘶", "期", "其"], + ["申", "审", "伸", "呻"], + ["宠", "庞", "笼", "拢"], + ["忖", "村", "讨", "对"], + ["橙", "澄", "凳", "登"], + ["瑞", "端", "揣", "喘"], + ["据", "剧", "居", "踞"], + ["输", "暖", "载", "栽"], + ["耐", "惴", "阅", "悦"], + ["熟", "塾"], + ["浩", "结", "洁", "吉"], + ["刑", "型", "荆", "形"], + ["婉", "晚", "豌", "惋"], + ["怯", "劫", "讪", "仙"], + ["航", "杭", "抗", "炕"], + ["沟", "钩", "钓", "钧"], + ["朗", "郎", "踉", "粮"], + ["疆", "僵", "蜷", "倦"], + ["陨", "损", "协", "胁"], + ["谨", "勤", "幻", "幼"], + ["跨", "垮", "挎", "胯"], + ["碍", "得", "泣", "拉"], + ["吹", "炊", "饮", "欢"], + ["般", "没", "投", "役"], + ["耽", "眈", "忱", "枕"], + ["编", "遍", "扁", "蝙"], + ["拔", "拨", "托", "拖"], + ["奋", "愤", "锁", "销"], + ["遗", "匮", "馈", "遣"], + ["稍", "梢", "哨", "捎"], + ["徘", "排"], + ["湛", "勘", "斟", "堪"], + ["票", "飘", "漂", "瞟"], + ["即", "既", "颇", "须", "榜", "傍", "磅", "膀"], + ["概", "慨", "溉", "既"], + ["恰", "洽"], + ["探", "深"], + ["杨", "惕", "赐", "踢"], + ["央", "秧", "殃", "泱"], + ["验", "检", "捡", "俭"], + ["州", "洲", "渊"], + ["瑰", "鬼"], + ["冠", "寇"], + ["崖", "涯"], + ["喂", "偎"], + ["培", "赔", "陪", "倍"], + ["涡", "蜗"], + ["粘", "沾"], + ["诞", "蜒", "碗", "婉"], + ["惩", "征"], + ["铭", "名", "茗", "酩"], + ["蛮", "峦", "恋", "奕"], + ["谋", "媒", "煤", "某"], + ["控", "腔"], + ["貌", "藐"], + ["俘", "浮"], + ["锦", "棉", "绵", "帛"], + ["忙", "茫", "芒", "氓"], + ["秋", "愁"], + ["祥", "详", "翔", "样"], + ["粮", "酿", "浪", "良"], + ["卒", "率", "翠", "碎"], + ["沸", "佛", "拂"], + ["腮", "思", "崽", "筛"], + ["调", "雕", "凋", "碉", ""], + ["撤", "撒", "籍", "霜"], + ["嫌", "谦", "歉", "廉"], + ["殊", "铢"], + ["翎", "翔", "翘", "翩"], + ["丞", "承"], + ["遐", "瑕", "暇", "假"], + ["魏", "巍", "翼", "冀"], + ["锋", "蜂", "峰", "缝"], + ["楼", "搂", "缕"], + ["挪", "娜", "那", "哪"], + ["逝", "浙"], +] + +CHN_STOPWORD = { + "、", + "。", + "〈", + "〉", + "《", + "》", + "一", + "一个", + "一些", + "一何", + "一切", + "一则", + "一方面", + "一旦", + "一来", + "一样", + "一种", + "一般", + "一转眼", + "七", + "万一", + "三", + "上", + "上下", + "下", + "不", + "不仅", + "不但", + "不光", + "不单", + "不只", + "不外乎", + "不如", + "不妨", + "不尽", + "不尽然", + "不得", + "不怕", + "不惟", + "不成", + "不拘", + "不料", + "不是", + "不比", + "不然", + "不特", + "不独", + "不管", + "不至于", + "不若", + "不论", + "不过", + "不问", + "与", + "与其", + "与其说", + "与否", + "与此同时", + "且", + "且不说", + "且说", + "两者", + "个", + "个别", + "中", + "临", + "为", + "为了", + "为什么", + "为何", + "为止", + "为此", + "为着", + "乃", + "乃至", + "乃至于", + "么", + "之", + "之一", + "之所以", + "之类", + "乌乎", + "乎", + "乘", + "九", + 
"也", + "也好", + "也罢", + "了", + "二", + "二来", + "于", + "于是", + "于是乎", + "云云", + "云尔", + "五", + "些", + "亦", + "人", + "人们", + "人家", + "什", + "什么", + "什么样", + "今", + "介于", + "仍", + "仍旧", + "从", + "从此", + "从而", + "他", + "他人", + "他们", + "他们们", + "以", + "以上", + "以为", + "以便", + "以免", + "以及", + "以故", + "以期", + "以来", + "以至", + "以至于", + "以致", + "们", + "任", + "任何", + "任凭", + "会", + "似的", + "但", + "但凡", + "但是", + "何", + "何以", + "何况", + "何处", + "何时", + "余外", + "作为", + "你", + "你们", + "使", + "使得", + "例如", + "依", + "依据", + "依照", + "便于", + "俺", + "俺们", + "倘", + "倘使", + "倘或", + "倘然", + "倘若", + "借", + "借傥然", + "假使", + "假如", + "假若", + "做", + "像", + "儿", + "先不先", + "光", + "光是", + "全体", + "全部", + "八", + "六", + "兮", + "共", + "关于", + "关于具体地说", + "其", + "其一", + "其中", + "其二", + "其他", + "其余", + "其它", + "其次", + "具体地说", + "具体说来", + "兼之", + "内", + "再", + "再其次", + "再则", + "再有", + "再者", + "再者说", + "再说", + "冒", + "冲", + "况且", + "几", + "几时", + "凡", + "凡是", + "凭", + "凭借", + "出于", + "出来", + "分", + "分别", + "则", + "则甚", + "别", + "别人", + "别处", + "别是", + "别的", + "别管", + "别说", + "到", + "前后", + "前此", + "前者", + "加之", + "加以", + "区", + "即", + "即令", + "即使", + "即便", + "即如", + "即或", + "即若", + "却", + "去", + "又", + "又及", + "及", + "及其", + "及至", + "反之", + "反而", + "反过来", + "反过来说", + "受到", + "另", + "另一方面", + "另外", + "另悉", + "只", + "只当", + "只怕", + "只是", + "只有", + "只消", + "只要", + "只限", + "叫", + "叮咚", + "可", + "可以", + "可是", + "可见", + "各", + "各个", + "各位", + "各种", + "各自", + "同", + "同时", + "后", + "后者", + "向", + "向使", + "向着", + "吓", + "吗", + "否则", + "吧", + "吧哒", + "含", + "吱", + "呀", + "呃", + "呕", + "呗", + "呜", + "呜呼", + "呢", + "呵", + "呵呵", + "呸", + "呼哧", + "咋", + "和", + "咚", + "咦", + "咧", + "咱", + "咱们", + "咳", + "哇", + "哈", + "哈哈", + "哉", + "哎", + "哎呀", + "哎哟", + "哗", + "哟", + "哦", + "哩", + "哪", + "哪个", + "哪些", + "哪儿", + "哪天", + "哪年", + "哪怕", + "哪样", + "哪边", + "哪里", + "哼", + "哼唷", + "唉", + "唯有", + "啊", + "啐", + "啥", + "啦", + "啪达", + "啷当", + "喂", + "喏", + "喔唷", + "喽", + "嗡", + "嗡嗡", + "嗬", + "嗯", + "嗳", + "嘎", + "嘎登", + "嘘", + "嘛", + "嘻", + "嘿", + "嘿嘿", + "四", + "因", + "因为", + "因了", + "因此", + "因着", + "因而", + "固然", + "在", + "在下", + "在于", + "地", + "基于", + "处在", + "多", + "多么", + "多少", + "大", + "大家", + "她", + "她们", + "好", + "如", + "如上", + "如上所述", + "如下", + "如何", + "如其", + "如同", + "如是", + "如果", + "如此", + "如若", + "始而", + "孰料", + "孰知", + "宁", + "宁可", + "宁愿", + "宁肯", + "它", + "它们", + "对", + "对于", + "对待", + "对方", + "对比", + "将", + "小", + "尔", + "尔后", + "尔尔", + "尚且", + "就", + "就是", + "就是了", + "就是说", + "就算", + "就要", + "尽", + "尽管", + "尽管如此", + "岂但", + "己", + "已", + "已矣", + "巴", + "巴巴", + "年", + "并", + "并且", + "庶乎", + "庶几", + "开外", + "开始", + "归", + "归齐", + "当", + "当地", + "当然", + "当着", + "彼", + "彼时", + "彼此", + "往", + "待", + "很", + "得", + "得了", + "怎", + "怎么", + "怎么办", + "怎么样", + "怎奈", + "怎样", + "总之", + "总的来看", + "总的来说", + "总的说来", + "总而言之", + "恰恰相反", + "您", + "惟其", + "慢说", + "我", + "我们", + "或", + "或则", + "或是", + "或曰", + "或者", + "截至", + "所", + "所以", + "所在", + "所幸", + "所有", + "才", + "才能", + "打", + "打从", + "把", + "抑或", + "拿", + "按", + "按照", + "换句话说", + "换言之", + "据", + "据此", + "接着", + "故", + "故此", + "故而", + "旁人", + "无", + "无宁", + "无论", + "既", + "既往", + "既是", + "既然", + "日", + "时", + "时候", + "是", + "是以", + "是的", + "更", + "曾", + "替", + "替代", + "最", + "月", + "有", + "有些", + "有关", + "有及", + "有时", + "有的", + "望", + "朝", + "朝着", + "本", + "本人", + "本地", + "本着", + "本身", + "来", + "来着", + "来自", + "来说", + "极了", + "果然", + "果真", + "某", + "某个", + "某些", + "某某", + "根据", + "欤", + "正值", + "正如", + "正巧", + "正是", + "此", + "此地", + "此处", + "此外", + "此时", + "此次", + "此间", + "毋宁", + "每", + "每当", + "比", + 
"比及", + "比如", + "比方", + "没奈何", + "沿", + "沿着", + "漫说", + "点", + "焉", + "然则", + "然后", + "然而", + "照", + "照着", + "犹且", + "犹自", + "甚且", + "甚么", + "甚或", + "甚而", + "甚至", + "甚至于", + "用", + "用来", + "由", + "由于", + "由是", + "由此", + "由此可见", + "的", + "的确", + "的话", + "直到", + "相对而言", + "省得", + "看", + "眨眼", + "着", + "着呢", + "矣", + "矣乎", + "矣哉", + "离", + "秒", + "称", + "竟而", + "第", + "等", + "等到", + "等等", + "简言之", + "管", + "类如", + "紧接着", + "纵", + "纵令", + "纵使", + "纵然", + "经", + "经过", + "结果", + "给", + "继之", + "继后", + "继而", + "综上所述", + "罢了", + "者", + "而", + "而且", + "而况", + "而后", + "而外", + "而已", + "而是", + "而言", + "能", + "能否", + "腾", + "自", + "自个儿", + "自从", + "自各儿", + "自后", + "自家", + "自己", + "自打", + "自身", + "至", + "至于", + "至今", + "至若", + "致", + "般的", + "若", + "若夫", + "若是", + "若果", + "若非", + "莫不然", + "莫如", + "莫若", + "虽", + "虽则", + "虽然", + "虽说", + "被", + "要", + "要不", + "要不是", + "要不然", + "要么", + "要是", + "譬喻", + "譬如", + "让", + "许多", + "论", + "设使", + "设或", + "设若", + "诚如", + "诚然", + "该", + "说", + "说来", + "请", + "诸", + "诸位", + "诸如", + "谁", + "谁人", + "谁料", + "谁知", + "贼死", + "赖以", + "赶", + "起", + "起见", + "趁", + "趁着", + "越是", + "距", + "跟", + "较", + "较之", + "边", + "过", + "还", + "还是", + "还有", + "还要", + "这", + "这一来", + "这个", + "这么", + "这么些", + "这么样", + "这么点儿", + "这些", + "这会儿", + "这儿", + "这就是说", + "这时", + "这样", + "这次", + "这般", + "这边", + "这里", + "进而", + "连", + "连同", + "逐步", + "通过", + "遵循", + "遵照", + "那", + "那个", + "那么", + "那么些", + "那么样", + "那些", + "那会儿", + "那儿", + "那时", + "那样", + "那般", + "那边", + "那里", + "都", + "鄙人", + "鉴于", + "针对", + "阿", + "除", + "除了", + "除外", + "除开", + "除此之外", + "除非", + "随", + "随后", + "随时", + "随着", + "难道说", + "零", + "非", + "非但", + "非徒", + "非特", + "非独", + "靠", + "顺", + "顺着", + "首先", + "︿", + "!", + "#", + "$", + "%", + "&", + "(", + ")", + "*", + "+", + ",", + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + ":", + ";", + "<", + ">", + "?", + "@", + "[", + "]", + "{", + "|", + "}", + "~", + "¥", +} diff --git a/textattack/shared/utils/strings.py b/textattack/shared/utils/strings.py index b22d44610..7b137d174 100644 --- a/textattack/shared/utils/strings.py +++ b/textattack/shared/utils/strings.py @@ -2,6 +2,7 @@ import string import flair +import jieba from .importing import LazyLoader @@ -30,7 +31,14 @@ def add_indent(s_, numSpaces): def words_from_text(s, words_to_ignore=[]): """Lowercases a string, removes all non-alphanumeric characters, and splits into words.""" - s = " ".join(s.split()) + try: + if re.search("[\u4e00-\u9FFF]", s): + seg_list = jieba.cut(s, cut_all=False) + s = " ".join(seg_list) + else: + s = " ".join(s.split()) + except Exception: + s = " ".join(s.split()) homos = """˗৭Ȣ𝟕бƼᏎƷᒿlO`ɑЬϲԁе𝚏ɡհіϳ𝒌ⅼmոорԛⲅѕ𝚝սѵԝ×уᴢ""" exceptions = """'-_*@""" @@ -234,7 +242,7 @@ def zip_flair_result(pred, tag_type="upos-fast"): for token in tokens: word_list.append(token.text) if "pos" in tag_type: - pos_list.append(token.annotation_layers["pos"][0]._value) + pos_list.append(token.annotation_layers["upos"][0]._value) elif tag_type == "ner": pos_list.append(token.get_label("ner")) diff --git a/textattack/shared/validators.py b/textattack/shared/validators.py index 4d9611d5a..fcf08e150 100644 --- a/textattack/shared/validators.py +++ b/textattack/shared/validators.py @@ -24,7 +24,10 @@ r"^textattack.models.helpers.word_cnn_for_classification.*", r"^transformers.modeling_\w*\.\w*ForSequenceClassification$", ], - (NonOverlappingOutput, MinimizeBleu,): [ + ( + NonOverlappingOutput, + MinimizeBleu, + ): [ r"^textattack.models.helpers.t5_for_text_to_text.*", ], } diff --git a/textattack/trainer.py 
b/textattack/trainer.py index 9c3198ae3..26d72d315 100644 --- a/textattack/trainer.py +++ b/textattack/trainer.py @@ -407,7 +407,6 @@ def collate_fn(data): is_adv_sample = [] for item in data: if "_example_type" in item[0].keys(): - # Get example type value from OrderedDict and remove it adv = item[0].pop("_example_type") diff --git a/textattack/training_args.py b/textattack/training_args.py index 6c5aa034d..c6e02c171 100644 --- a/textattack/training_args.py +++ b/textattack/training_args.py @@ -547,7 +547,6 @@ def _create_dataset_from_args(cls, args): train_dataset.output_column == "label" and eval_dataset.output_column == "label" ): - train_dataset_labels = train_dataset._dataset["label"] eval_dataset_labels = eval_dataset._dataset["label"] diff --git a/textattack/transformations/word_swaps/__init__.py b/textattack/transformations/word_swaps/__init__.py index 1d2aa9f52..431e0e345 100644 --- a/textattack/transformations/word_swaps/__init__.py +++ b/textattack/transformations/word_swaps/__init__.py @@ -8,6 +8,7 @@ from .word_swap import WordSwap # Black box transformations +from .chn_transformations import * from .word_swap_embedding import WordSwapEmbedding from .word_swap_hownet import WordSwapHowNet from .word_swap_homoglyph_swap import WordSwapHomoglyphSwap @@ -24,8 +25,6 @@ from .word_swap_change_number import WordSwapChangeNumber from .word_swap_change_location import WordSwapChangeLocation from .word_swap_change_name import WordSwapChangeName -from .chinese_word_swap_hownet import ChineseWordSwapHowNet -from .chinese_homophone_character_swap import ChineseHomophoneCharacterSwap # White box transformation from .word_swap_gradient_based import WordSwapGradientBased diff --git a/textattack/transformations/word_swaps/chinese_word_swap_hownet.py b/textattack/transformations/word_swaps/chinese_word_swap_hownet.py deleted file mode 100644 index c977a3c92..000000000 --- a/textattack/transformations/word_swaps/chinese_word_swap_hownet.py +++ /dev/null @@ -1,24 +0,0 @@ -import OpenHowNet - -from .word_swap import WordSwap - - -class ChineseWordSwapHowNet(WordSwap): - """Transforms an input by replacing its words with synonyms provided by - WordNet.""" - - def __init__(self): - self.hownet_dict = OpenHowNet.HowNetDict(use_sim=True) - self.topk = 10 - - def _get_replacement_words(self, word): - """Returns a list containing all possible words with N characters - replaced by a homoglyph.""" - if self.hownet_dict.get(word): - results = self.hownet_dict.get_nearest_words_via_sememes(word, self.topk) - synonyms = [ - w["word"] for r in results for w in r["synset"] if w["word"] != word - ] - return synonyms - else: - return [] diff --git a/textattack/transformations/word_swaps/chn_transformations/__init__.py b/textattack/transformations/word_swaps/chn_transformations/__init__.py new file mode 100644 index 000000000..2e8918fb3 --- /dev/null +++ b/textattack/transformations/word_swaps/chn_transformations/__init__.py @@ -0,0 +1,11 @@ +""" +chinese_transformations package +----------------------------------- + +""" + +from textattack.transformations.word_swaps.word_swap import WordSwap +from .chinese_homophone_character_swap import ChineseHomophoneCharacterSwap +from .chinese_morphonym_character_swap import ChineseMorphonymCharacterSwap +from .chinese_word_swap_masked import ChineseWordSwapMaskedLM +from .chinese_word_swap_hownet import ChineseWordSwapHowNet diff --git a/textattack/transformations/word_swaps/chinese_homophone_character_swap.py 
b/textattack/transformations/word_swaps/chn_transformations/chinese_homophone_character_swap.py
similarity index 98%
rename from textattack/transformations/word_swaps/chinese_homophone_character_swap.py
rename to textattack/transformations/word_swaps/chn_transformations/chinese_homophone_character_swap.py
index 1aa9e00b0..0573f7267 100644
--- a/textattack/transformations/word_swaps/chinese_homophone_character_swap.py
+++ b/textattack/transformations/word_swaps/chn_transformations/chinese_homophone_character_swap.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import pinyin

-from .word_swap import WordSwap
+from . import WordSwap


 class ChineseHomophoneCharacterSwap(WordSwap):
@@ -17,11 +17,8 @@ def __init__(self):
         path_list = path_list[:-2]
         path_list.append("shared/chinese_homophone_char.txt")
         homophone_dict_path = os.path.join("/", *path_list)
-        homophone_dict = pd.read_csv(homophone_dict_path, header=None, sep="\n")
-        homophone_dict = homophone_dict[0].str.split("\t", expand=True)
-        self.homophone_dict = homophone_dict

     def _get_replacement_words(self, word):
diff --git a/textattack/transformations/word_swaps/chn_transformations/chinese_morphonym_character_swap.py b/textattack/transformations/word_swaps/chn_transformations/chinese_morphonym_character_swap.py
new file mode 100644
index 000000000..82692f352
--- /dev/null
+++ b/textattack/transformations/word_swaps/chn_transformations/chinese_morphonym_character_swap.py
@@ -0,0 +1,27 @@
+from textattack.shared.data import MORPHONYM_LS
+
+from . import WordSwap
+
+
+class ChineseMorphonymCharacterSwap(WordSwap):
+    """Transforms an input by replacing characters with visually similar
+    characters (morphonyms) from a morphonym dictionary."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def _get_replacement_words(self, word):
+        """Returns a list containing all possible words with 1 character
+        replaced by a morphonym."""
+        word = list(word)
+        candidate_words = set()
+        for i in range(len(word)):
+            character = word[i]
+            for char_morpho_ls in MORPHONYM_LS:
+                if character in char_morpho_ls:
+                    for new_char in char_morpho_ls:
+                        # copy the character list so earlier swaps don't leak into later candidates
+                        temp_word = word.copy()
+                        temp_word[i] = new_char
+                        candidate_words.add("".join(temp_word))
+        return list(candidate_words)
diff --git a/textattack/transformations/word_swaps/chn_transformations/chinese_word_swap_hownet.py b/textattack/transformations/word_swaps/chn_transformations/chinese_word_swap_hownet.py
new file mode 100644
index 000000000..2743ae4b6
--- /dev/null
+++ b/textattack/transformations/word_swaps/chn_transformations/chinese_word_swap_hownet.py
@@ -0,0 +1,25 @@
+import OpenHowNet
+
+from . import WordSwap
+
+
+class ChineseWordSwapHowNet(WordSwap):
+    """Transforms an input by replacing its words with synonyms provided by
+    OpenHowNet http://nlp.csai.tsinghua.edu.cn/."""
+
+    def __init__(self, topk=5):
+        self.hownet_dict = OpenHowNet.HowNetDict(init_sim=True)
+        self.topk = topk
+
+    def _get_replacement_words(self, word):
+        """Returns the top-k words nearest to `word` according to OpenHowNet's
+        sememe-based similarity."""
+        results = self.hownet_dict.get_nearest_words(word, language="zh", K=self.topk)
+        synonyms = []
+        if results:
+            for key, value in results.items():
+                for w in value:
+                    synonyms.append(w)
+            return synonyms
+        else:
+            return []
diff --git a/textattack/transformations/word_swaps/chn_transformations/chinese_word_swap_masked.py b/textattack/transformations/word_swaps/chn_transformations/chinese_word_swap_masked.py
new file mode 100644
index 000000000..6973e3117
--- /dev/null
+++ b/textattack/transformations/word_swaps/chn_transformations/chinese_word_swap_masked.py
@@ -0,0 +1,78 @@
+"""
+Word Swap by BERT-Masked LM.
+-------------------------------
+"""
+
+from transformers import pipeline
+
+from . import WordSwap
+
+
+class ChineseWordSwapMaskedLM(WordSwap):
+    """Generate potential replacements for a word using a masked language
+    model."""
+
+    def __init__(self, task="fill-mask", model="xlm-roberta-base", **kwargs):
+        self.unmasker = pipeline(task, model)
+        super().__init__(**kwargs)
+
+    def get_replacement_words(self, current_text, indice_to_modify):
+        # mask the target word so the fill-mask pipeline can suggest substitutes
+        masked_text = current_text.replace_word_at_index(indice_to_modify, "<mask>")
+        outputs = self.unmasker(masked_text.text)
+        words = []
+        for output in outputs:
+            take = True
+            for char in output["token_str"]:
+                # accept only Chinese characters for potential substitutions
+                if not is_cjk(char):
+                    take = False
+            if take:
+                words.append(output["token_str"])
+
+        return words
+
+    def _get_transformations(self, current_text, indices_to_modify):
+        words = current_text.words
+        transformed_texts = []
+
+        for i in indices_to_modify:
+            word_to_replace = words[i]
+            replacement_words = self.get_replacement_words(current_text, i)
+            transformed_texts_idx = []
+            for r in replacement_words:
+                if r == word_to_replace:
+                    continue
+                transformed_texts_idx.append(current_text.replace_word_at_index(i, r))
+            transformed_texts.extend(transformed_texts_idx)
+
+        return transformed_texts
+
+
+def is_cjk(char):
+    char = ord(char)
+    for bottom, top in cjk_ranges:
+        if bottom <= char <= top:
+            return True
+    return False
+
+
+cjk_ranges = [
+    (0x4E00, 0x62FF),
+    (0x6300, 0x77FF),
+    (0x7800, 0x8CFF),
+    (0x8D00, 0x9FCC),
+    (0x3400, 0x4DB5),
+    (0x20000, 0x215FF),
+    (0x21600, 0x230FF),
+    (0x23100, 0x245FF),
+    (0x24600, 0x260FF),
+    (0x26100, 0x275FF),
+    (0x27600, 0x290FF),
+    (0x29100, 0x2A6DF),
+    (0x2A700, 0x2B734),
+    (0x2B740, 0x2B81D),
+    (0x2B820, 0x2CEAF),
+    (0x2CEB0, 0x2EBEF),
+    (0x2F800, 0x2FA1F),
+]
diff --git a/textattack/transformations/word_swaps/word_swap_change_name.py b/textattack/transformations/word_swaps/word_swap_change_name.py
index d54b755a5..c4feeff48 100644
--- a/textattack/transformations/word_swaps/word_swap_change_name.py
+++ b/textattack/transformations/word_swaps/word_swap_change_name.py
@@ -64,7 +64,6 @@ def _get_transformations(self, current_text, indices_to_modify):
         return transformed_texts

     def _get_replacement_words(self, word, word_part_of_speech):
-
         replacement_words = []
         tag = word_part_of_speech
         if (
diff --git a/textattack/transformations/word_swaps/word_swap_change_number.py
b/textattack/transformations/word_swaps/word_swap_change_number.py index 1ced0f84d..b885b6fa4 100644 --- a/textattack/transformations/word_swaps/word_swap_change_number.py +++ b/textattack/transformations/word_swaps/word_swap_change_number.py @@ -70,7 +70,7 @@ def _get_transformations(self, current_text, indices_to_modify): # replace original numbers with new numbers transformed_texts = [] - for (idx, word) in num_words: + for idx, word in num_words: replacement_words = self._get_new_number(word) for r in replacement_words: if r == word:
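+                    # skip replacement candidates identical to the original number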