Merge pull request #41 from jalammar/v0.0.15-release

V0.0.15 release
jalammar · Aug 2, 2021 · 8670f7b · 8670f7b
2 parents d9e4e37 + efc5950
commit 8670f7b
Show file tree

Hide file tree

Showing 20 changed files with 1,428 additions and 22 deletions.
diff --git a/docs/api/analysis.md b/docs/api/analysis.md
@@ -0,0 +1,2 @@
+
+::: ecco.analysis
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -29,6 +29,7 @@ nav:
       - Language Model: api/language-model.md
       - Output: api/output.md
       - NMF: api/nmf.md
+      - Analysis: api/analysis.md
 
 markdown_extensions:
   - pymdownx.highlight

diff --git a/notebooks/Ecco_CCA_Similarity.ipynb b/notebooks/Ecco_CCA_Similarity.ipynb
@@ -0,0 +1,337 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "3d3e8316-6769-44e1-b522-cb4b35fc4541",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/jalammar/ecco/blob/main/notebooks/Ecco_CCA_Similarity.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2aad7c95-bc94-4813-9b9f-dc47fdc442ee",
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: ecco in c:\\users\\msx\\pycharmprojects\\ecco\\src (0.0.14)\n",
+      "Requirement already satisfied: transformers~=4.2 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from ecco) (4.6.1)\n",
+      "Requirement already satisfied: seaborn~=0.11 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from ecco) (0.11.1)\n",
+      "Requirement already satisfied: scikit-learn~=0.23 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from ecco) (0.24.2)\n",
+      "Requirement already satisfied: PyYAML~=5.4 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from ecco) (5.4.1)\n",
+      "Requirement already satisfied: numpy>=1.13.3 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from scikit-learn~=0.23->ecco) (1.19.5)\n",
+      "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from scikit-learn~=0.23->ecco) (2.1.0)\n",
+      "Requirement already satisfied: joblib>=0.11 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from scikit-learn~=0.23->ecco) (1.0.1)\n",
+      "Requirement already satisfied: scipy>=0.19.1 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from scikit-learn~=0.23->ecco) (1.6.3)\n",
+      "Requirement already satisfied: pandas>=0.23 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from seaborn~=0.11->ecco) (1.2.4)\n",
+      "Requirement already satisfied: matplotlib>=2.2 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from seaborn~=0.11->ecco) (3.3.4)\n",
+      "Requirement already satisfied: cycler>=0.10 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from matplotlib>=2.2->seaborn~=0.11->ecco) (0.10.0)\n",
+      "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from matplotlib>=2.2->seaborn~=0.11->ecco) (8.2.0)\n",
+      "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\msx\\appdata\\roaming\\python\\python39\\site-packages (from matplotlib>=2.2->seaborn~=0.11->ecco) (2.4.7)\n",
+      "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from matplotlib>=2.2->seaborn~=0.11->ecco) (2.8.1)\n",
+      "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from matplotlib>=2.2->seaborn~=0.11->ecco) (1.3.1)\n",
+      "Requirement already satisfied: six in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from cycler>=0.10->matplotlib>=2.2->seaborn~=0.11->ecco) (1.16.0)\n",
+      "Requirement already satisfied: pytz>=2017.3 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from pandas>=0.23->seaborn~=0.11->ecco) (2021.1)\n",
+      "Requirement already satisfied: packaging in c:\\users\\msx\\appdata\\roaming\\python\\python39\\site-packages (from transformers~=4.2->ecco) (20.9)\n",
+      "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from transformers~=4.2->ecco) (0.10.3)\n",
+      "Requirement already satisfied: filelock in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from transformers~=4.2->ecco) (3.0.12)\n",
+      "Requirement already satisfied: sacremoses in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from transformers~=4.2->ecco) (0.0.45)\n",
+      "Requirement already satisfied: requests in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from transformers~=4.2->ecco) (2.25.1)\n",
+      "Requirement already satisfied: tqdm>=4.27 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from transformers~=4.2->ecco) (4.61.0)\n",
+      "Requirement already satisfied: huggingface-hub==0.0.8 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from transformers~=4.2->ecco) (0.0.8)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from transformers~=4.2->ecco) (2021.4.4)\n",
+      "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from requests->transformers~=4.2->ecco) (2.10)\n",
+      "Requirement already satisfied: chardet<5,>=3.0.2 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from requests->transformers~=4.2->ecco) (4.0.0)\n",
+      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from requests->transformers~=4.2->ecco) (1.26.6)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from requests->transformers~=4.2->ecco) (2021.5.30)\n",
+      "Requirement already satisfied: click in c:\\users\\msx\\miniconda3\\envs\\ecco\\lib\\site-packages (from sacremoses->transformers~=4.2->ecco) (8.0.1)\n",
+      "Requirement already satisfied: colorama in c:\\users\\msx\\appdata\\roaming\\python\\python39\\site-packages (from click->sacremoses->transformers~=4.2->ecco) (0.4.4)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install ecco"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "55290353-1c0f-4778-abd8-bb4c09969e1a",
+   "metadata": {},
+   "source": [
+    "Load Ecco and DistilBERT (a smaller, distilled version of BERT)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "228061f6-2cfc-47ea-9357-6789c81745d1",
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']\n",
+      "- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    }
+   ],
+   "source": [
+    "import ecco\n",
+    "lm = ecco.from_pretrained('distilbert-base-uncased', gpu=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "27d1f126-e938-42f6-8e8c-fb97f79a2b74",
+   "metadata": {},
+   "source": [
+    "Let's give BERT a passage of text to process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b1abd436-50bf-4722-b113-73eaa795020d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = '''Now I ask you: what can be expected of man since he is a being endowed with strange qualities? Shower upon him every earthly blessing, drown him in a sea of happiness, so that nothing but bubbles of bliss can be seen on the surface; give him economic prosperity, such that he should have nothing else to do but sleep, eat cakes and busy himself with the continuation of his species, and even then out of sheer ingratitude, sheer spite, man would play you some nasty trick. He would even risk his cakes and would deliberately desire the most fatal rubbish, the most uneconomical absurdity, simply to introduce into all this positive good sense his fatal fantastic element. It is just his fantastic dreams, his vulgar folly that he will desire to retain, simply in order to prove to himself--as though that were so necessary-- that men still are men and not the keys of a piano, which the laws of nature threaten to control so completely that soon one will be able to desire nothing but by the calendar. And that is not all: even if man really were nothing but a piano-key, even if this were proved to him by natural science and mathematics, even then he would not become reasonable, but would purposely do something perverse out of simple ingratitude, simply to gain his point. And if he does not find means he will contrive destruction and chaos, will contrive sufferings of all sorts, only to gain his point! He will launch a curse upon the world, and as only man can curse (it is his privilege, the primary distinction between him and other animals), may be by his curse alone he will attain his object--that is, convince himself that he is a man and not a piano-key!\n",
+    "'''\n",
+    "\n",
+    "inputs = lm.tokenizer([text], return_tensors=\"pt\")\n",
+    "output = lm(inputs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "470b6fa8-1f45-42e6-afa8-ba6fbe576bb4",
+   "metadata": {},
+   "source": [
+    "The `output` variable now contains the result of BERT processing the passage of text. The property `output.hidden_states` contains the hidden states after each layer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "1c97a987-c239-4f51-8f39-f4f00dfa9464",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "((768, 363), (768, 363), 6)"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "embed = output.hidden_states[0].detach().numpy()[0,:,:].T\n",
+    "hidden_state_layer = [layer.detach().numpy()[0,:,:].T for layer in output.hidden_states[1:]]\n",
+    "embed.shape, hidden_state_layer[0].shape, len(hidden_state_layer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "41ee4085-a697-4753-98b8-f930b4abcad0",
+   "metadata": {},
+   "source": [
+    "`embed` now contains the embeddings of the inputs. Its dimensions are (embed_dim, number of tokens). \n",
+    "`hidden_state_layer` has the outputs of each of the model's 6 layers. The output of each layer is (embed_dim, number of tokens).\n",
+    "\n",
+    "This is how to calculate the CKA similarity score between the embeddings layer and the output of the first layer:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "9fe8fa2b-39b7-4a04-bf4e-c62cffe3f2ff",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9042735809843326"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from ecco import analysis\n",
+    "analysis.cka(embed, hidden_state_layer[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "831da973-63bc-4a53-8022-8017ee82af57",
+   "metadata": {},
+   "source": [
+    "When we compare the embeddings with the output of the second layer, we see less similarity"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "d428f519-fb48-402d-8bea-962783fe36de",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.7774273750068427"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "analysis.cka(embed, hidden_state_layer[1])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "823e071c-ccc2-458e-b792-8d451bd48e41",
+   "metadata": {},
+   "source": [
+    "And so on"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "8621e099-a768-4e17-86e6-6281c4fe4a0f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.6922863613160068"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "analysis.cka(embed, hidden_state_layer[2])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7c2de6f2-fe10-4672-ae51-cfdfe2382be2",
+   "metadata": {},
+   "source": [
+    "We can try with `cca`, `svcca` and `pwcca`. But we need to choose a subset of the neurons because these methods require more tokens than neurons (and advise 10x as many tokens as neurons to get a proper similarity score). \n",
+    "\n",
+    "Let's compare the similarities of the first 50 neurons."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "fc3f50a6-eaa0-498e-896a-11d333d7fb5f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CCA - Embed vs. layer 0: 0.8518187635570224\n",
+      "CCA - Embed vs. layer 1: 0.7220358141619774\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"CCA - Embed vs. layer 0:\", analysis.cca(embed[:50,:], hidden_state_layer[0][:50,:]))\n",
+    "print(\"CCA - Embed vs. layer 1:\", analysis.cca(embed[:50,:], hidden_state_layer[1][:50,:]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "ebb5d691-af03-4d0c-9282-c5338671df12",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SVCCA - Embed vs. layer 0: 0.7830642647708996\n",
+      "SVCCA - Embed vs. layer 1: 0.6833412957583129\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"SVCCA - Embed vs. layer 0:\", analysis.svcca(embed[:50,:], hidden_state_layer[0][:50,:]))\n",
+    "print(\"SVCCA - Embed vs. layer 1:\", analysis.svcca(embed[:50,:], hidden_state_layer[1][:50,:]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "5b3e47b3-e5a6-47ce-8964-29eb086e29ad",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PWCCA - Embed vs. layer 0: 0.8695735246868949\n",
+      "PWCCA - Embed vs. layer 1: 0.746195889153883\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"PWCCA - Embed vs. layer 0:\", analysis.pwcca(embed[:50,:], hidden_state_layer[0][:50,:]))\n",
+    "print(\"PWCCA - Embed vs. layer 1:\", analysis.pwcca(embed[:50,:], hidden_state_layer[1][:50,:]))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/requirements.txt b/requirements.txt
@@ -1,13 +1,11 @@
--f https://download.pytorch.org/whl/torch_stable.html
 matplotlib~=3.3.1
 numpy~=1.19.1
 ipython~=7.16.1
-scikit-learn~=0.23.2
+scikit-learn~=0.24.2
 seaborn~=0.11.0
-transformers~=4.2.2
+transformers~=4.6.1
 pytest~=6.1.2
 setuptools~=49.6.0
-torch~=1.6.0
-torchvision~=0.7.0
+torch~=1.9.0
 PyYAML==5.4.1
 
diff --git a/setup.py b/setup.py
@@ -51,6 +51,7 @@ def read(*names, **kwargs):
         'Programming Language :: Python',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: Implementation :: CPython',
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Utilities',

diff --git a/src/ecco/__init__.py b/src/ecco/__init__.py
@@ -16,6 +16,7 @@
 from ecco.lm import LM
 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
 from typing import Optional, List
+from ecco.util import load_config
 
 
 def from_pretrained(hf_model_id: str,
@@ -48,8 +49,13 @@ def from_pretrained(hf_model_id: str,
 """
     # TODO: Should specify task/head in a cleaner way. Allow masked LM. T5 generation.
     # Likely use model-config. Have a default. Allow user to specify head?
-    if 'gpt2' not in hf_model_id:
-        tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
+
+    model_config = load_config(hf_model_id)
+    use_causal_lm = model_config.get('use_causal_lm',False)
+
+    tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
+
+    if not use_causal_lm:
         model = AutoModel.from_pretrained(hf_model_id,
                                                      output_hidden_states=hidden_states,
                                                      output_attentions=attention)