Add a script for customizing pipelines #147

Closed
wants to merge 9 commits
26 changes: 26 additions & 0 deletions tutorials/pipeline_customize/README.md
@@ -0,0 +1,26 @@
<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->

# 🪐 spaCy Project: Customizing Pipelines

This project includes a script to help customize your pipelines. It allows you to swap a tok2vec for a transformer component (or the reverse), or to merge two pipelines.


## 📋 project.yml

The [`project.yml`](project.yml) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[spaCy projects documentation](https://spacy.io/usage/projects).

### ⏯ Commands

The following commands are defined by the project. They
can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run).
Commands are only re-run if their inputs have changed.

| Command | Description |
| --- | --- |
| `merge` | Merge the two pipelines into one pipeline. |
| `use-transformer` | Use a transformer feature source in a pipeline, keeping listeners updated. Output config. |
| `use-tok2vec` | Use a CNN tok2vec feature source in a pipeline, keeping listeners updated. Output config. |

<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->
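For orientation, the commands in the table above would typically be invoked from the project directory like this (a usage fragment, not part of the generated docs; it assumes spaCy and the project's dependencies are installed):

```
spacy project run use-transformer
spacy project run use-tok2vec
spacy project run merge
```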
47 changes: 47 additions & 0 deletions tutorials/pipeline_customize/project.yml
@@ -0,0 +1,47 @@
title: "Customizing Pipelines"
description: >
  This project includes a script to help customize your pipelines. It allows
  you to swap a tok2vec for a transformer component (or the reverse), or to
  merge two pipelines.

vars:
version: "0.0.0"
# change these to paths to your pipelines, or override on the CLI
base_pipeline: pipeline_a
# this one is just for merging
added_pipeline: pipeline_b
# name of the output pipeline, for merging
output_pipeline: merged_pipeline
# name of the base transformer
base_transformer: roberta-base

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: [dist, configs]

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
- name: merge
help: Merge the two pipelines into one pipeline.
script:
- "python -m scripts.merge ${vars.base_pipeline} ${vars.added_pipeline} dist/${vars.output_pipeline}"
# input pipelines are not deps because they might be module names
outputs:
- "dist/${vars.output_pipeline}"

- name: use-transformer
help: Use a transformer feature source in a pipeline, keeping listeners updated. Output config.
script:
- "python -m scripts.configure_tok2vec use-transformer --transformer-name ${vars.base_transformer} ${vars.base_pipeline} configs/transformer.cfg"
outputs:
- "configs/transformer.cfg"

- name: use-tok2vec
help: Use a CNN tok2vec feature source in a pipeline, keeping listeners updated. Output config.
script:
- "python -m scripts.configure_tok2vec use-tok2vec ${vars.base_pipeline} configs/cnn.cfg"
outputs:
- "configs/cnn.cfg"
80 changes: 80 additions & 0 deletions tutorials/pipeline_customize/scripts/configure_tok2vec.py
@@ -0,0 +1,80 @@
from pathlib import Path

import spacy
import typer
from util import check_tok2vecs, get_tok2vecs, get_listeners

def use_transformer(
base_model: str, output_path: Path, transformer_name: str = "roberta-base"
):
"""Replace pipeline tok2vec with transformer, update listeners, output config."""
# 1. identify tok2vec
# 2. replace tok2vec
# 3. replace listeners
nlp = spacy.load(base_model)
check_tok2vecs(base_model, nlp.config)

tok2vecs = get_tok2vecs(nlp.config)
assert len(tok2vecs) > 0, "Must have tok2vec to replace!"

nlp.remove_pipe(tok2vecs[0])
# the rest can be default values
trf_config = {
"model": {
"name": transformer_name,
}
}
trf = nlp.add_pipe("transformer", config=trf_config, first=True)

# TODO maybe remove vectors?

# now update the listeners
listeners = get_listeners(nlp)
for listener in listeners:
listener_config = {
"@architectures": "spacy-transformers.TransformerListener.v1",
"grad_factor": 1.0,
"upstream": "transformer",
"pooling": {"@layers": "reduce_mean.v1"},
}
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config

# that's it!
nlp.config.to_disk(output_path)


def use_tok2vec(base_model: str, output_path: Path):
"""Replace pipeline tok2vec with CNN tok2vec, update listeners, output config."""
nlp = spacy.load(base_model)
check_tok2vecs(base_model, nlp.config)

tok2vecs = get_tok2vecs(nlp.config)
assert len(tok2vecs) > 0, "Must have tok2vec to replace!"

nlp.remove_pipe(tok2vecs[0])

tok2vec = nlp.add_pipe("tok2vec", first=True)
width = "${components.tok2vec.model.encode:width}"

listeners = get_listeners(nlp)
for listener in listeners:
listener_config = {
"@architectures": "spacy.Tok2VecListener.v1",
"width": width,
"upstream": "tok2vec",
}
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config

nlp.config.to_disk(output_path)


if __name__ == "__main__":

help_msg = """
This script will help you swap out a tok2vec in your pipeline for a
Transformer or vice-versa.
"""
app = typer.Typer(name="tok2vec Swapper", help=help_msg, no_args_is_help=True)
app.command("use-transformer")(use_transformer)
app.command("use-tok2vec")(use_tok2vec)
app()
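The listener rewrite in `use_transformer` above can be illustrated without spaCy at all: the pipeline config is a nested dict, and each listener component's inner `tok2vec` model is pointed at the new upstream source. A minimal sketch using plain dicts (the `"ner"` component name is a hypothetical example):

```python
# Sketch of the listener-rewrite pattern from use_transformer, using plain
# dicts instead of a real spaCy config. Each listening component's inner
# "tok2vec" entry is replaced so it listens to the new transformer.
config = {
    "components": {
        "ner": {
            "model": {"tok2vec": {"@architectures": "spacy.Tok2VecListener.v1"}}
        },
    }
}

listener_config = {
    "@architectures": "spacy-transformers.TransformerListener.v1",
    "grad_factor": 1.0,
    "upstream": "transformer",
    "pooling": {"@layers": "reduce_mean.v1"},
}

# In the real script, the listener names come from get_listeners(nlp).
for listener in ["ner"]:
    config["components"][listener]["model"]["tok2vec"] = listener_config

print(config["components"]["ner"]["model"]["tok2vec"]["upstream"])  # transformer
```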
106 changes: 106 additions & 0 deletions tutorials/pipeline_customize/scripts/merge.py
@@ -0,0 +1,106 @@
from pathlib import Path

from util import get_tok2vecs, check_tok2vecs, has_listener
from util import check_pipeline_names, get_listeners

import typer
import spacy
from spacy.language import Language
from wasabi import msg


def inner_merge(nlp, nlp2, replace_listeners=False) -> Language:
"""Actually do the merge.

nlp: Base pipeline to add components to.
nlp2: Pipeline to add components from.
replace_listeners (bool): Whether to replace listeners. Usually only true
if there's one listener.
returns: assembled pipeline.
"""

# we checked earlier, so there's definitely just one
tok2vec_name = get_tok2vecs(nlp2.config)[0]
rename = check_pipeline_names(nlp, nlp2)

if len(get_listeners(nlp2)) > 1:
if replace_listeners:
msg.warn(
"""
Replacing listeners for multiple components. Note this can make
your pipeline large and slow. Consider chaining pipelines (like
nlp2(nlp(text))) instead.
"""
)
else:
# TODO provide a guide for what to do here
msg.warn(
"""
The result of this merge will have two feature sources
(tok2vecs) and multiple listeners. This will work for
inference, but will probably not work when training without
extra adjustment. If you continue to train the pipelines
separately this is not a problem.
"""
)

for comp in nlp2.pipe_names:
if replace_listeners and comp == tok2vec_name:
# the tok2vec should not be copied over
continue
if replace_listeners and has_listener(nlp2, comp):
# TODO does "model.tok2vec" work for everything?
nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"])
nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp))
if comp in rename:
msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...")
return nlp


def merge_pipelines(base_model: str, added_model: str, output_path: Path):
"""Combine components from multiple pipelines into a single new one."""
nlp = spacy.load(base_model)
nlp2 = spacy.load(added_model)

# to merge models:
# - lang must be the same
# - vectors must be the same
# - vocabs must be the same (how to check?)
# - tokenizer must be the same (only partially checkable)
if nlp.lang != nlp2.lang:
msg.fail("Can't merge - languages don't match", exits=1)

# check vector equality
if (
nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
or nlp.vocab.vectors.to_bytes(exclude=["strings"])
!= nlp2.vocab.vectors.to_bytes(exclude=["strings"])
):
msg.fail("Can't merge - vectors don't match", exits=1)

if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
msg.fail("Can't merge - tokenizers don't match", exits=1)

# Check that each pipeline only has one feature source
check_tok2vecs(base_model, nlp.config)
check_tok2vecs(added_model, nlp2.config)

# Check how many listeners there are and replace based on that
# TODO: option to recognize frozen tok2vecs
# TODO: take list of pipe names to copy
listeners = get_listeners(nlp2)
replace_listeners = len(listeners) == 1
    msg.info(f"Found {len(listeners)} listener(s); replace_listeners={replace_listeners}")
nlp_out = inner_merge(nlp, nlp2, replace_listeners=replace_listeners)

# write the final pipeline
    nlp_out.to_disk(output_path)
msg.info(f"Saved pipeline to: {output_path}")


if __name__ == "__main__":

app = typer.Typer(name="Pipeline Merge Helper")
app.command("merge", no_args_is_help=True)(merge_pipelines)
app()
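The `rename` map returned by `check_pipeline_names` drives the collision handling in `inner_merge` above. That helper lives in `scripts/util.py`, which is not part of this diff, so the following is only a plausible sketch of the renaming it performs; the `_2` suffix is an assumption for illustration:

```python
# Hypothetical sketch of collision renaming: components from the added
# pipeline whose names already exist in the base pipeline get a new name.
# (The real logic is in scripts/util.py; the "_2" suffix is an assumption.)
def make_rename_map(base_names, added_names):
    rename = {}
    for name in added_names:
        if name in base_names:
            rename[name] = f"{name}_2"
    return rename

print(make_rename_map(["tok2vec", "ner"], ["tok2vec", "textcat"]))
# {'tok2vec': 'tok2vec_2'}
```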
30 changes: 30 additions & 0 deletions tutorials/pipeline_customize/scripts/resume_training.py
@@ -0,0 +1,30 @@
from pathlib import Path

from util import get_tok2vecs, check_tok2vecs, has_listener
from util import check_pipeline_names, get_listeners

import typer
import spacy
from spacy.language import Language
from wasabi import msg

def create_resume_config(base_model: str, output_path: Path):
"""Given an input pipeline, produce a config for resuming training.

A config for resuming training is the same as the input config, but with
all components sourced.
"""

nlp = spacy.load(base_model)
conf = nlp.config

for comp in nlp.pipe_names:
conf["components"][comp] = {"source": base_model}

conf.to_disk(output_path)

if __name__ == "__main__":

app = typer.Typer(name="Resume Config Creator")
app.command("resume_config", no_args_is_help=True)(create_resume_config)
app()
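To make the docstring concrete: for a hypothetical base pipeline named `en_core_web_sm` with `tagger` and `ner` components, the emitted config would contain sourced component sections along these lines (a sketch of the output fragment, not the full config):

```
[components.tagger]
source = "en_core_web_sm"

[components.ner]
source = "en_core_web_sm"
```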