Completion only fine-tuning of instruction models with collections of HF datasets #1103

Open

wants to merge 45 commits into base: main

Changes from 43 commits

Commits (45)
59e937c
Add input_masked loss calculation and batching w/ padding
chimezie Jun 7, 2024
8c1d33d
Merge branch 'ml-explore:main' into completion_only
chimezie Jun 16, 2024
0a3ec90
Merge branch 'ml-explore:main' into completion_only
chimezie Jun 24, 2024
1929f53
Merge branch 'ml-explore:main' into completion_only
chimezie Jun 28, 2024
9df7bbb
Generalize HF datasets to a collection of HF datasets via `datasets`…
chimezie Nov 4, 2024
1f6c370
Updates to LoRA documentation
chimezie Nov 4, 2024
c721220
Fixes to config format in documentation
chimezie Nov 4, 2024
04cf93d
Fixes to references to hf_datasets
chimezie Nov 4, 2024
e477060
Fix keyword argument invocation
chimezie Nov 4, 2024
24f40c3
Fix iteration over HF dataset collection
chimezie Nov 4, 2024
78b24a2
Fix index calculation
chimezie Nov 4, 2024
95fb224
Merge branch 'ml-explore:main' into completion_only
chimezie Nov 4, 2024
a1fbc52
Merge branch 'ml-explore:main' into completion_only
chimezie Nov 5, 2024
b7b3332
Replace iterate_input_masked_batches with iterate_delineated_batches,…
chimezie Nov 5, 2024
603dab5
Merge branch 'ml-explore:main' into completion_only
chimezie Nov 5, 2024
5579b48
Minor documentation update
chimezie Nov 5, 2024
e0d66f5
Merge remote-tracking branch 'origin/completion_only' into completion…
chimezie Nov 5, 2024
4b88c33
Updates CL lora tuner with input masking that uses default_loss (and …
chimezie Nov 6, 2024
3c76a25
Fix variable reference
chimezie Nov 6, 2024
e45ce38
Add ability to fetch raw prompt and completion text from completion d…
chimezie Nov 6, 2024
90e2da8
Minor fix
chimezie Nov 6, 2024
960ed79
Update sublist search and calculation of input id length
chimezie Nov 6, 2024
bfa6c29
Fix
chimezie Nov 7, 2024
7f89ace
Merge branch 'ml-explore:main' into completion_only
chimezie Nov 8, 2024
3080102
Merge branch 'ml-explore:main' into completion_only
chimezie Nov 9, 2024
01e330d
Add input masking for fine-tuning in documentation
chimezie Nov 10, 2024
791727f
Merge remote-tracking branch 'origin/completion_only' into completion…
chimezie Nov 10, 2024
0a42079
Merge branch 'refs/heads/fix_bos_dupe' into completion_only_fix_bos_dupe
chimezie Nov 10, 2024
cb73b95
Don't dupe BOS
chimezie Nov 10, 2024
4ddbb98
Update documentation
chimezie Nov 10, 2024
8cd0586
Merge branch 'completion_only' into completion_only_fix_bos_dupe
chimezie Nov 10, 2024
c5f37ac
Merge branch 'ml-explore:main' into completion_only_fix_bos_dupe
chimezie Nov 16, 2024
d89dce1
Merge branch 'ml-explore:main' into completion_only_fix_bos_dupe
chimezie Nov 21, 2024
7076c8f
Merge branch 'ml-explore:main' into completion_only_fix_bos_dupe
chimezie Nov 28, 2024
b308733
Merge branch 'ml-explore:main' into completion_only_fix_bos_dupe
chimezie Dec 3, 2024
2c41f15
Default for hf_datasets configuration
chimezie Dec 6, 2024
5d57e80
Merge remote-tracking branch 'origin/completion_only_fix_bos_dupe' in…
chimezie Dec 6, 2024
c65b69f
Merge branch 'ml-explore:main' into completion_only_fix_bos_dupe
chimezie Dec 6, 2024
6b0bbfd
Synch use of special tokens with iterate_batches
chimezie Dec 6, 2024
4349397
Merge remote-tracking branch 'origin/completion_only_fix_bos_dupe' in…
chimezie Dec 6, 2024
9a39f3b
Add response template (or token) argument
chimezie Dec 8, 2024
1ed63e9
Incorporate use of response template for completion masking
chimezie Dec 8, 2024
1981b13
Move response template to LoRA configuration
chimezie Dec 8, 2024
55339e7
Generalize the get_item method to all CompletionDatasets
chimezie Dec 9, 2024
85723f4
Merge branch 'ml-explore:main' into completion_only_fix_bos_dupe
chimezie Dec 16, 2024
43 changes: 41 additions & 2 deletions llms/mlx_lm/LORA.md
@@ -76,6 +76,27 @@ You can specify the output location with `--adapter-path`.
You can resume fine-tuning with an existing adapter with
`--resume-adapter-file <path_to_adapters.safetensors>`.

### Input Masking

There are custom functions for masking the sequence of tokens associated with the `prompt` in a completion dataset
during the loss calculation, so that the model is not penalized for failing to reproduce the prompt. To fine-tune
with masked input sequences, use the `--mask-inputs` argument.

This functionality expects a `response_template` parameter in the configuration: either a
[string that indicates the start of the model's response](https://huggingface.co/docs/transformers/en/chat_templating#what-are-generation-prompts)
or its corresponding token ids. It is used to build the mask that excludes the prompt portion of the sequence
from the loss calculation. For example (ChatML):

```yaml
response_template: "<|im_start|>assistant"
```

or (for the corresponding tokens of Gemma's response template)

```yaml
response_template: [106, 2516]
```
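
To make the mechanics concrete, here is a minimal sketch of how a response template can be turned into a loss mask. The `response_start` helper and the token ids below are hypothetical and purely illustrative (this is not the trainer's implementation); for a string template, the tuner derives the ids with `tokenizer.encode(response_template, add_special_tokens=False)`.

```python
# Illustrative sketch of completion-only masking; `response_start` is a
# hypothetical helper, not the trainer's actual code.
from typing import List


def response_start(tokens: List[int], template: List[int]) -> int:
    """Index just past the first occurrence of `template` in `tokens` (0 if absent)."""
    n = len(template)
    for i in range(len(tokens) - n + 1):
        if tokens[i : i + n] == template:
            return i + n
    return 0


template = [106, 2516]  # the Gemma example from above
# Made-up token ids standing in for a tokenized prompt followed by a response:
sequence = [2, 9, 11, 13] + template + [21, 23, 1]

start = response_start(sequence, template)
# Positions before `start` (here, the prompt and the template itself) contribute
# nothing to the loss; only the response tokens are scored.
loss_mask = [0] * start + [1] * (len(sequence) - start)
assert loss_mask == [0, 0, 0, 0, 0, 0, 1, 1, 1]
```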


### Evaluate

To compute test set perplexity use:
@@ -267,7 +288,7 @@ it on the command line. For example, pass `--data mlx-community/wikisql` to
train on the pre-formatted WikiSQL data.

Otherwise, provide a mapping of keys in the dataset to the features MLX LM
expects. Use a YAML config to specify the Hugging Face dataset arguments. For
expects. Use a YAML config to specify the Hugging Face (HF) dataset arguments. For
example:

```
@@ -279,11 +300,29 @@ hf_dataset:

- Use `prompt_feature` and `completion_feature` to specify keys for a
`completions` dataset. Use `text_feature` to specify the key for a `text`
dataset.
dataset. Use `chat_feature` to specify the key for a chat dataset.

- To specify the train, valid, or test splits, set the corresponding
`{train,valid,test}_split` argument.

You can specify a list of HF datasets using the `hf_datasets` (plural) configuration, which is a list of records,
each with the same structure as above (a loading sketch is shown after this list). For example:

```yaml
hf_datasets:
- hf_dataset:
name: "Open-Orca/OpenOrca"
train_split: "train[:90%]"
valid_split: "train[-10%:]"
prompt_feature: "question"
completion_feature: "response"
- hf_dataset:
name: "trl-lib/ultrafeedback_binarized"
train_split: "train[:90%]"
valid_split: "train[-10%:]"
chat_feature: "chosen"
```

- Arguments specified in `config` will be passed as keyword arguments to
[`datasets.load_dataset`](https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset).
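
For reference, below is a rough sketch of how such a collection could be loaded, assuming each record's `hf_dataset` block is passed to `datasets.load_dataset`. The helper is hypothetical; the tuner's own loader also applies the `prompt_feature`/`completion_feature`/`chat_feature` mapping per dataset, which is omitted here.

```python
# Hypothetical sketch: load every HF dataset named in an `hf_datasets` list.
from datasets import load_dataset


def load_hf_dataset_collection(hf_datasets, split_key="train_split"):
    collection = []
    for record in hf_datasets:
        cfg = dict(record["hf_dataset"])
        extra = cfg.get("config", {})  # forwarded to load_dataset as keyword arguments
        collection.append(load_dataset(cfg["name"], split=cfg.get(split_key), **extra))
    return collection


# Example mirroring the YAML above (downloads the dataset from the Hugging Face Hub):
train_sets = load_hf_dataset_collection(
    [
        {
            "hf_dataset": {
                "name": "Open-Orca/OpenOrca",
                "train_split": "train[:90%]",
                "prompt_feature": "question",
                "completion_feature": "response",
            }
        }
    ]
)
```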

42 changes: 41 additions & 1 deletion llms/mlx_lm/lora.py
@@ -11,7 +11,7 @@
import numpy as np
import yaml

from .tokenizer_utils import TokenizerWrapper
from .tokenizer_utils import TokenizerWrapper, no_bos_or_eos
from .tuner.datasets import load_dataset
from .tuner.trainer import TrainingArgs, TrainingCallback, evaluate, train
from .tuner.utils import (
@@ -58,7 +58,9 @@
"test_batches": 500,
"max_seq_length": 2048,
"lr_schedule": None,
"hf_datasets": None,
"lora_parameters": {"rank": 8, "alpha": 16, "dropout": 0.0, "scale": 10.0},
"response_template": None,
}


@@ -91,6 +93,15 @@ def build_parser():
default="lora",
help="Type of fine-tuning to perform: lora, dora, or full.",
)

parser.add_argument(
"--mask-inputs",
dest="mask_inputs",
action="store_true",
help="Whether to mask the inputs when training. Default is False.",
default=False,
)

parser.add_argument(
"--num-layers",
type=int,
@@ -169,6 +180,13 @@ def train_model(
valid_set,
training_callback: TrainingCallback = None,
):
from .tuner.trainer import (
default_loss,
input_masked_loss,
iterate_batches,
iterate_completion_batches,
)

model.freeze()
if args.fine_tune_type == "full":
for l in model.layers[-min(args.num_layers, 0) :]:
@@ -197,6 +215,17 @@
adapter_file = adapter_path / "adapters.safetensors"
save_config(vars(args), adapter_path / "adapter_config.json")

if isinstance(args.response_template, str):
response_generation_tokens = tokenizer.encode(
args.response_template, add_special_tokens=False
)
else:
if not all(isinstance(item, int) for item in args.response_template):
raise ValueError(
"Response template must be a list of integers if it is not a string."
)
response_generation_tokens = args.response_template

# init training args
training_args = TrainingArgs(
batch_size=args.batch_size,
@@ -208,6 +237,9 @@
adapter_file=adapter_file,
max_seq_length=args.max_seq_length,
grad_checkpoint=args.grad_checkpoint,
response_generation_tokens=no_bos_or_eos(
response_generation_tokens, tokenizer.bos_token_id, tokenizer.eos_token_id
),
)

model.train()
@@ -216,6 +248,10 @@
build_schedule(args.lr_schedule) if args.lr_schedule else args.learning_rate
)
)

if args.mask_inputs:
print("Masking inputs..")

# Train model
train(
model=model,
@@ -225,6 +261,10 @@
train_dataset=train_set,
val_dataset=valid_set,
training_callback=training_callback,
iterate_batches=(
iterate_completion_batches if args.mask_inputs else iterate_batches
),
loss=input_masked_loss if args.mask_inputs else default_loss,
)


6 changes: 6 additions & 0 deletions llms/mlx_lm/tokenizer_utils.py
@@ -1,5 +1,6 @@
import json
from functools import partial
from typing import List

from transformers import AutoTokenizer

@@ -340,3 +341,8 @@ def load_tokenizer(model_path, tokenizer_config_extra={}):
AutoTokenizer.from_pretrained(model_path, **tokenizer_config_extra),
detokenizer_class,
)


def no_bos_or_eos(sequence: List, bos: int, eos: int) -> List:
# Drop a leading BOS token id and/or a trailing EOS token id, if present.
removed_bos = sequence if sequence[0] != bos else sequence[1:]
return removed_bos[:-1] if removed_bos[-1] == eos else removed_bos