Merge pull request #45 from tanganke/develop
merge develop into main
tanganke authored Dec 3, 2024
1 parent a4847b1 commit a9b5173
Showing 21 changed files with 568 additions and 81 deletions.
6 changes: 6 additions & 0 deletions config/dataset/llm_sft/alpaca_cleaned.yaml
@@ -0,0 +1,6 @@
alpaca-cleaned:
_target_: fusion_bench.dataset.llama.alpaca.load_tokenized_alpaca_dataset
tokenizer: ???
path: "yahma/alpaca-cleaned"
split: train
cache_path: null
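
In this Hydra-style config, `_target_` names the loader to call and `???` is OmegaConf's marker for a mandatory value that must be supplied at runtime (here, the tokenizer). A minimal sketch of how such a node could be instantiated outside fusion_bench's CLI; the model name is only an illustrative choice, not part of this commit:

from hydra.utils import instantiate
from omegaconf import OmegaConf
from transformers import AutoTokenizer

cfg = OmegaConf.load("config/dataset/llm_sft/alpaca_cleaned.yaml")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")  # example choice
# `tokenizer: ???` must be filled in before the dataset can be built
dataset = instantiate(cfg["alpaca-cleaned"], tokenizer=tokenizer)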
3 changes: 3 additions & 0 deletions config/dataset/llm_sft/ultrachat_200k.yaml
@@ -0,0 +1,3 @@
ultrachat-200k:
_target_: fusion_bench.dataset.ultrachat.load_tokenized_ultrachat_200k
tokenizer: ???
16 changes: 16 additions & 0 deletions config/fabric/llama_peft_fsdp.yaml
@@ -0,0 +1,16 @@
defaults:
- loggers: tensorboard_logger
- strategy: llama_peft_fsdp
- _self_

_target_: lightning.Fabric
_recursive_: true
# Number of devices to train on (``int``), which GPUs to train on (``list`` or ``str``), or ``"auto"``.
# The value applies per node.
devices: auto
# The hardware to run on. Possible choices are:
# ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``.
# for example: fabric.accelerator=cpu
accelerator: auto
# reference to the precision policy: https://lightning.ai/docs/fabric/stable/api/fabric_args.html#precision
precision: bf16-true
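
For reference, a hedged sketch of how this fabric config could be composed and turned into a live lightning.Fabric object with Hydra (fusion_bench's CLI normally performs this composition itself; the config path is illustrative):

from hydra import compose, initialize
from hydra.utils import instantiate

with initialize(version_base=None, config_path="config"):
    cfg = compose(config_name="fabric/llama_peft_fsdp")
fabric = instantiate(cfg)  # _recursive_: true also builds the logger and the FSDP strategy
fabric.launch()            # honours the `devices` / `accelerator` / `precision` settings above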
9 changes: 9 additions & 0 deletions config/fabric/strategy/llama_peft_fsdp.yaml
@@ -0,0 +1,9 @@
_target_: lightning.fabric.strategies.FSDPStrategy
sharding_strategy: FULL_SHARD
state_dict_type: full # Save a single, consolidated checkpoint file
cpu_offload: false
auto_wrap_policy:
_target_: fusion_bench.mixins.lightning_fabric.get_size_based_auto_wrap_policy
activation_checkpointing_policy: ${.auto_wrap_policy}
# limit_all_gathers: true
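
Written directly in Python, the strategy above would look roughly like the sketch below. It assumes `get_size_based_auto_wrap_policy` behaves like torch's `size_based_auto_wrap_policy`; the parameter threshold is illustrative:

import functools

from lightning.fabric.strategies import FSDPStrategy
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy

# wrap any submodule with more than ~100M parameters in its own FSDP unit
policy = functools.partial(size_based_auto_wrap_policy, min_num_params=100_000_000)

strategy = FSDPStrategy(
    sharding_strategy="FULL_SHARD",
    state_dict_type="full",                  # save a single, consolidated checkpoint file
    cpu_offload=False,
    auto_wrap_policy=policy,
    activation_checkpointing_policy=policy,  # mirrors ${.auto_wrap_policy} in the YAML
)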

2 changes: 1 addition & 1 deletion config/method/lm_finetune/peftfinetune_sft.yaml
@@ -57,7 +57,7 @@ save_optimizer_state: false
# save_full_model must be true when using shared FSDP
save_full_model: false
# save_ckpt_type can be 'peft' or 'lightning'
save_ckpt_type: peft
save_ckpt_type: lightning
# Path to checkpoint to load from, used for resuming training
ckpt_path: null
max_length: 4096
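
The `save_ckpt_type` switch toggles between two checkpoint formats. The sketch below shows what the two options roughly correspond to; this is an assumption about the method's behaviour, not a quote of its implementation, and the helper name is illustrative:

from pathlib import Path

def save_checkpoint(fabric, model, optimizer, ckpt_dir: Path, save_ckpt_type: str = "lightning"):
    if save_ckpt_type == "peft":
        # PEFT-style: only the adapter weights, reloadable with PeftModel.from_pretrained
        model.save_pretrained(str(ckpt_dir))
    else:
        # Lightning-style: one Fabric checkpoint holding model and optimizer state,
        # gathered into a single file because the FSDP strategy uses state_dict_type: full
        fabric.save(ckpt_dir / "checkpoint.ckpt", {"model": model, "optimizer": optimizer})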
18 changes: 18 additions & 0 deletions config/modelpool/CausalLMPool/llama_ultrachat.yaml
@@ -0,0 +1,18 @@
_target_: fusion_bench.modelpool.CausalLMPool

pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct

models:
_pretrained_:
_target_: transformers.AutoModelForCausalLM.from_pretrained
pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
torch_dtype: bfloat16

tokenizer:
_target_: transformers.AutoTokenizer.from_pretrained
pretrained_model_name_or_path: ${..pretrained_model_name_or_path}

train_datasets:
ultrachat-200k:
_target_: fusion_bench.dataset.llama.ultrachat.load_tokenized_ultrachat_200k
tokenizer: ${...tokenizer}
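
Once the `${...}` interpolations are resolved, instantiating this pool amounts to roughly the following. This is a sketch, not the CausalLMPool implementation itself; the pool may defer loading, and the 1B model name is taken from the sibling configs:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from fusion_bench.dataset.llama.ultrachat import load_tokenized_ultrachat_200k

name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.bfloat16)
train_dataset = load_tokenized_ultrachat_200k(tokenizer=tokenizer)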
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
_target_: fusion_bench.modelpool.CausalLMPool
_target_: fusion_bench.modelpool.SeqenceClassificationModelPool

pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct

@@ -16,7 +16,7 @@ tokenizer:

train_datasets:
preference_700k:
_target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_bradley_terry_rm
_target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
tokenizer: ${...tokenizer}
path: hendrydong/preference_700K
split: train
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
_target_: fusion_bench.modelpool.SeqenceClassificationModelPool

pretrained_model_name_or_path: fusion-bench/Llama-3.2-1B-Instruct_Bradly-Terry-RM_Preference-700k

models:
_pretrained_:
_target_: transformers.AutoModelForSequenceClassification.from_pretrained
pretrained_model_name_or_path: ${...pretrained_model_name_or_path}
torch_dtype: bfloat16

tokenizer:
_target_: transformers.AutoTokenizer.from_pretrained
pretrained_model_name_or_path: ${..pretrained_model_name_or_path}
pad_token: <|end_of_text|> # do not use eos token (<|eos_id|>) as padding token because it is used as the end of each content
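
A hedged sketch of what this pool loads: the Bradley-Terry reward model as a sequence-classification head plus its tokenizer, with <|end_of_text|> used for padding. Setting `pad_token_id` on the model config is an assumption needed for batched scoring, not something stated in the config:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

name = "fusion-bench/Llama-3.2-1B-Instruct_Bradly-Terry-RM_Preference-700k"
tokenizer = AutoTokenizer.from_pretrained(name, pad_token="<|end_of_text|>")
model = AutoModelForSequenceClassification.from_pretrained(name, torch_dtype=torch.bfloat16)
model.config.pad_token_id = tokenizer.pad_token_id  # so padded positions are ignored when picking the score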
18 changes: 18 additions & 0 deletions config/taskpool/reward_model_evaluation.yaml
@@ -0,0 +1,18 @@
_target_: fusion_bench.taskpool.llama.reward_model.RewardModelEvaluationTaskPool

test_datasets:
preference_700k:
_target_: fusion_bench.dataset.llama.preference_700k.load_tokenized_preference_700k_for_rlhf
tokenizer: ${...tokenizer}
path: hendrydong/preference_700K
split: train
cache_path: null

dataloader_kwargs:
shuffle: False
batch_size: 16

tokenizer: ${..modelpool.tokenizer}

max_num_samples: 1000
seed: 42
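
The taskpool scores preference pairs with the reward model. A minimal sketch of the kind of metric such an evaluation can compute (pairwise accuracy); the helper name is illustrative, and it relies on the convention of bradley_terry_rm_collate further down that the first half of each batch is the chosen responses and the second half the rejected ones:

import torch

@torch.no_grad()
def pairwise_accuracy(model, dataloader, device="cuda"):
    # fraction of pairs where the chosen response gets a higher reward than the rejected one
    correct, total = 0, 0
    for batch in dataloader:
        rewards = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits.squeeze(-1)
        chosen, rejected = rewards.chunk(2, dim=0)  # first half chosen, second half rejected
        correct += (chosen > rejected).sum().item()
        total += chosen.numel()
    return correct / total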
14 changes: 14 additions & 0 deletions examples/lm_finetune/llama_fullfinetune.sh
@@ -21,3 +21,17 @@ fusion_bench --config-name llama_full_finetune \
method.checkpoint_save_frequency=2000 \
method.max_epochs=1 \
modelpool=CausalLMPool/llama_metamathqa

# finetune on ultrachat (this run uses the PEFT SFT method config with FSDP)
fusion_bench --config-name llama_full_finetune \
fabric=llama_peft_fsdp \
fabric.loggers.name=llama_lora_finetune \
method=lm_finetune/peftfinetune_sft \
method.dataloader_kwargs.batch_size=1 \
method.max_epochs=1 \
method.gradient_clip_val=1.0 \
method.accumulate_grad_batches=16 \
method.checkpoint_save_interval=step \
method.checkpoint_save_frequency=2000 \
modelpool=CausalLMPool/llama_ultrachat \
modelpool.pretrained_model_name_or_path=meta-llama/Meta-Llama-3-8B-Instruct
16 changes: 8 additions & 8 deletions fusion_bench/dataset/llama/collate.py
@@ -84,14 +84,14 @@ def bradley_terry_rm_collate(
converted_batch = []
for item in batch:
new_item = {
"input_ids": item["input_ids_j"],
"attention_mask": item["attention_mask_j"],
"input_ids": item["chosen_input_ids"],
"attention_mask": item["chosen_attention_mask"],
}
converted_batch.append(new_item)
for item in batch:
new_item = {
"input_ids": item["input_ids_k"],
"attention_mask": item["attention_mask_k"],
"input_ids": item["rejected_input_ids"],
"attention_mask": item["rejected_attention_mask"],
}
converted_batch.append(new_item)

@@ -111,10 +111,10 @@ def bradley_terry_rm_collate(
collated_batch = {"input_ids": input_ids, "attention_mask": attention_mask}
for key in batch[0]:
if key not in [
"input_ids_j",
"attention_mask_j",
"input_ids_k",
"attention_mask_k",
"chosen_input_ids",
"chosen_attention_mask",
"rejected_input_ids",
"rejected_attention_mask",
]:
collated_batch[key] = [x[key] for x in batch]
return collated_batch
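
With this ordering (all chosen sequences first, then all rejected ones), the Bradley-Terry training loss can be computed from a single forward pass over the collated batch. A sketch of that loss; the exact loss used by fusion_bench's trainer is not shown in this diff, and the function name is illustrative:

import torch
import torch.nn.functional as F

def bradley_terry_loss(rewards: torch.Tensor) -> torch.Tensor:
    # rewards has shape (2 * B,): rewards[:B] are for chosen responses, rewards[B:] for rejected ones
    chosen, rejected = rewards.chunk(2, dim=0)
    return -F.logsigmoid(chosen - rejected).mean()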
49 changes: 29 additions & 20 deletions fusion_bench/dataset/llama/preference_700k.py
@@ -1,19 +1,21 @@
import os
from copy import deepcopy
from typing import TYPE_CHECKING, Optional

from datasets import Dataset, load_dataset, load_from_disk
from lightning.fabric.utilities import rank_zero_only
from tqdm.auto import tqdm

from fusion_bench.utils import timeit_context

from .alpaca import convert_alpaca_to_conversation
import logging

if TYPE_CHECKING:
from transformers import PreTrainedTokenizer

log = logging.getLogger(__name__)


def load_tokenized_preference_700k_for_bradley_terry_rm(
def load_tokenized_preference_700k_for_rlhf(
tokenizer: "PreTrainedTokenizer",
path: str = "hendrydong/preference_700K",
split: str = "train",
@@ -25,10 +27,10 @@ def load_tokenized_preference_700k_for_bradley_terry_rm(
The returned dataset contains the following fields:
- input_ids_j: The input token ids for the winner.
- attention_mask_j: The attention mask for the winner.
- input_ids_k: The input token ids for the loser.
- attention_mask_k: The attention mask for the loser.
- chosen_input_ids: The input token ids for the winner.
- chosen_attention_mask: The attention mask for the winner.
- rejected_input_ids: The input token ids for the loser.
- rejected_attention_mask: The attention mask for the loser.
"""
if cache_path is not None and os.path.exists(cache_path):
dataset = load_from_disk(cache_path)
@@ -37,21 +39,28 @@ def load_tokenized_preference_700k_for_bradley_terry_rm(
dataset = load_dataset(path, split=split)

def tokenize(sample):

# ? is it necessary to `.replace(tokenizer.bos_token, "")`?
sample["positive"] = tokenizer.apply_chat_template(
sample["chosen_chat"] = tokenizer.apply_chat_template(
sample["chosen"], tokenize=False, add_generation_prompt=False
).replace(tokenizer.bos_token, "")
sample["negative"] = tokenizer.apply_chat_template(
)
sample["rejected_chat"] = tokenizer.apply_chat_template(
sample["rejected"], tokenize=False, add_generation_prompt=False
).replace(tokenizer.bos_token, "")

tokenized_pos = tokenizer(sample["positive"], truncation=True)
tokenized_neg = tokenizer(sample["negative"], truncation=True)
sample["input_ids_j"] = tokenized_pos["input_ids"]
sample["attention_mask_j"] = tokenized_pos["attention_mask"]
sample["input_ids_k"] = tokenized_neg["input_ids"]
sample["attention_mask_k"] = tokenized_neg["attention_mask"]
)

tokenized_pos = tokenizer(sample["chosen_chat"], truncation=True)
tokenized_neg = tokenizer(sample["rejected_chat"], truncation=True)

# Warn if the chosen sequence contains a PAD token
sample["chosen_input_ids"] = tokenized_pos["input_ids"]
sample["chosen_attention_mask"] = tokenized_pos["attention_mask"]
if tokenizer.pad_token_id in tokenized_pos["input_ids"]:
log.warning(f"Prompt contains PAD token: {sample['chosen_chat']}")

sample["rejected_input_ids"] = tokenized_neg["input_ids"]
sample["rejected_attention_mask"] = tokenized_neg["attention_mask"]
# Warn if the rejected sequence contains a PAD token
if tokenizer.pad_token_id in tokenized_neg["input_ids"]:
log.warning(f"Prompt contains PAD token: {sample['rejected_chat']}")

return sample

dataset = dataset.map(tokenize, num_proc=num_proc)
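
Example usage of the renamed loader (a sketch; the tokenizer choice is illustrative):

from transformers import AutoTokenizer

from fusion_bench.dataset.llama.preference_700k import load_tokenized_preference_700k_for_rlhf

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
dataset = load_tokenized_preference_700k_for_rlhf(tokenizer, split="train", num_proc=8)
print(dataset[0].keys())  # now exposes chosen_input_ids / chosen_attention_mask / rejected_* fields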
88 changes: 88 additions & 0 deletions fusion_bench/dataset/llama/stanford_shp.py
@@ -0,0 +1,88 @@
import os
from copy import deepcopy
from typing import TYPE_CHECKING, Optional

from datasets import Dataset, load_dataset, load_from_disk
from lightning.fabric.utilities import rank_zero_only
from tqdm.auto import tqdm

from fusion_bench.utils import timeit_context

if TYPE_CHECKING:
from transformers import PreTrainedTokenizer


def load_tokenized_stanford_shp_for_rlhf(
tokenizer: "PreTrainedTokenizer",
path: str = "stanfordnlp/SHP",
split: str = "train",
num_proc: int = 8,
cache_path: Optional[str] = None,
):
if cache_path is not None and os.path.isdir(cache_path):
dataset = load_from_disk(cache_path)
return dataset

dataset = load_dataset(path, split=split)

def tokenize(sample):
"""
- history: the post title concatenated to the post body (string)
- human_ref_A: text of comment A (string)
- human_ref_B: text of comment B (string)
- labels: the preference label -- it is 1 if A is preferred to B; 0 if B is preferred to A. This was randomized such that the label distribution is roughly 50/50. (integer)
"""
# Create a conversation with the post title and body, followed by comments
conversation = [{"role": "user", "content": sample["history"]}]
# build the preference pair with list concatenation (list.append returns None)
if sample["labels"] == 0:
sample["chosen"] = deepcopy(conversation) + [
{"role": "assistant", "content": sample["human_ref_B"]}
]
sample["rejected"] = deepcopy(conversation) + [
{"role": "assistant", "content": sample["human_ref_A"]}
]
else:
sample["chosen"] = deepcopy(conversation) + [
{"role": "assistant", "content": sample["human_ref_A"]}
]
sample["rejected"] = deepcopy(conversation) + [
{"role": "assistant", "content": sample["human_ref_B"]}
]

# apply chat template
sample["chosen_chat"] = tokenizer.apply_chat_template(
sample["chosen"], tokenize=False, add_generation_prompt=False
)
sample["rejected_chat"] = tokenizer.apply_chat_template(
sample["rejected"], tokenize=False, add_generation_prompt=False
)

# tokenize the conversation
tokenized_pos = tokenizer(sample["chosen_chat"], truncation=True)
tokenized_neg = tokenizer(sample["rejected_chat"], truncation=True)

# Ensure the EOS token appears only at the very end of the chosen sequence
sample["chosen_input_ids"] = tokenized_pos["input_ids"]
sample["chosen_attention_mask"] = tokenized_pos["attention_mask"]
assert (
tokenizer.eos_token_id not in tokenized_pos["input_ids"][:-1]
), f"Prompt contains EOS token: {sample['chosen_chat']}"
if sample["chosen_input_ids"][-1] != tokenizer.eos_token_id:
sample["chosen_input_ids"].append(tokenizer.eos_token_id)
sample["chosen_attention_mask"].append(1)

sample["rejected_input_ids"] = tokenized_neg["input_ids"]
sample["rejected_attention_mask"] = tokenized_neg["attention_mask"]
# Ensure the EOS token appears only at the very end of the rejected sequence
assert (
tokenizer.eos_token_id not in tokenized_neg["input_ids"][:-1]
), f"Prompt contains EOS token: {sample['rejected_chat']}"
if sample["rejected_input_ids"][-1] != tokenizer.eos_token_id:
sample["rejected_input_ids"].append(tokenizer.eos_token_id)
sample["rejected_attention_mask"].append(1)

return sample

dataset = dataset.map(tokenize, num_proc=num_proc)

if cache_path is not None and rank_zero_only.rank == 0:
dataset.save_to_disk(cache_path)
return dataset
58 changes: 58 additions & 0 deletions fusion_bench/dataset/llama/ultrachat.py
@@ -0,0 +1,58 @@
import os
from typing import TYPE_CHECKING, Optional

from datasets import Dataset, load_dataset, load_from_disk
from lightning.fabric.utilities import rank_zero_only
from tqdm.auto import tqdm

from fusion_bench.utils import timeit_context

if TYPE_CHECKING:
from transformers import PreTrainedTokenizer


def load_tokenized_ultrachat_200k(
tokenizer: "PreTrainedTokenizer",
path: str = "HuggingFaceH4/ultrachat_200k",
split: str = "train_sft",
num_proc: int = 8,
cache_path: Optional[str] = None,
):
R"""
Load and tokenize the UltraChat 200k dataset for supervised fine-tuning (SFT).
The returned dataset contains the following fields:
- input_ids: The token ids of the tokenized conversation.
- attention_mask: The attention mask for the tokenized conversation.
"""
if cache_path is not None and os.path.exists(cache_path):
dataset = load_from_disk(cache_path)
return dataset

dataset = load_dataset(path, split=split)

def tokenize(sample):

# ? is it necessary to `.replace(tokenizer.bos_token, "")`?
sample["input_ids"] = tokenizer.apply_chat_template(
sample["messages"], tokenize=True, add_generation_prompt=False
)
sample["attention_mask"] = [1] * len(sample["input_ids"])

return sample

dataset = dataset.map(tokenize, num_proc=num_proc)

if cache_path is not None and rank_zero_only.rank == 0:
dataset.save_to_disk(cache_path)
return dataset


if __name__ == "__main__":
# Example usage and testing
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
dataset = load_tokenized_ultrachat_200k(tokenizer)
print(dataset)
Empty file.