
Commit

Merge pull request #96 from OpenLMLab/add-more-shields
Add more shields
KaiLv69 authored Aug 13, 2023
2 parents a7de989 + 13e0178 commit 37cb128
Showing 13 changed files with 192 additions and 73 deletions.
15 changes: 14 additions & 1 deletion README.md
@@ -24,6 +24,7 @@ CoLLiE (Collaborative Tuning of Large Language Models in an Efficient Way), a


## News
* [2023/07/18] Released the Python package `collie-lm`. You can find more details at this [link](https://pypi.org/project/collie-lm/#history)!

## Table of Contents
<ul>
@@ -120,9 +121,21 @@ CoLLiE is based on *DeepSpeed* and *PyTorch*, providing large language models with collaborative
Note: the minimum number of GPUs (A100) each model needs when using the Adam optimizer

## Installation
Before installing, make sure that:
* PyTorch >= 1.13
* CUDA >= 11.6
* Linux OS
### Install from PyPI
You can simply install from PyPI with the following command:
```bash
pip install git+https://github.com/OpenLMLab/collie.git
pip install collie-lm
```
### Install from source
```bash
git clone https://github.com/OpenLMLab/collie
python setup.py install
```

## Docker Installation

## Usage
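As a quick sanity check after either install command in the README section above, a minimal sketch; it only assumes that the `collie-lm` package exposes the `collie` module, which is how the example scripts in this commit import it:

```python
# Minimal post-install check. Assumption: the collie-lm distribution provides
# the `collie` package, as the example scripts in this commit import it.
import collie
from collie import CollieConfig, Trainer  # names used throughout the examples below

print("collie imported from:", collie.__file__)
```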
1 change: 1 addition & 0 deletions README_EN.md
@@ -19,6 +19,7 @@ CoLLiE (Collaborative Tuning of Large Language Models in an Efficient Way) is a
</h4>

## Latest News
* [2023/07/18] Released the Python package collie-lm (1.0.2). You can find more details at this [link](https://pypi.org/project/collie-lm/#history).

## Table of Contents
<ul>
9 changes: 1 addition & 8 deletions examples/alpaca/train.py
@@ -63,14 +63,7 @@
eval_dataset = dataset[-32:]

# 5. Load the pretrained model
model = LlamaForCausalLM(config)
state_dict = LlamaForCausalLM.load_parallel_state_dict(
path="hdd:s3://opennlplab_hdd/models/llama/llama-7b-hf",
config=config,
protocol="petrel",
format="hf"
)
model.load_state_dict(state_dict)
model = LlamaForCausalLM.from_config(config)

# 6. 设置优化器
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
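For comparison with the `from_config` call above, which builds the model from the config alone, the other example scripts touched by this commit initialize from pretrained Hugging Face weights instead. A hedged sketch of that variant, reusing the model id that appears elsewhere in this diff:

```python
# Alternative to LlamaForCausalLM.from_config: load pretrained weights from the
# Hugging Face Hub, as the other examples in this commit do. The model id is
# taken from those examples; substitute the checkpoint you actually use.
from collie import CollieConfig, LlamaForCausalLM

config = CollieConfig.from_pretrained("decapoda-research/llama-7b-hf")
model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", config=config)
```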
2 changes: 1 addition & 1 deletion examples/finetune_chatglm2_for_summary.py
@@ -1,5 +1,5 @@
import sys
sys.path.append("../")
sys.path.append("..")
from typing import Dict

import argparse
9 changes: 3 additions & 6 deletions examples/finetune_chatglm_for_translation.py
@@ -47,16 +47,14 @@
} for sample in load_dataset("iwslt2017", name="iwslt2017-fr-en", split="train[100:150]")
]
# Prepare model
model = ChatGLMForCausalLM.from_pretrained(
"/mnt/petrelfs/zhangshuo/model/chatglm-6b", config=config)
model = ChatGLMForCausalLM.from_pretrained("THUDM/chatglm-6b", config=config)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
# optimizer=optimizer, T_max=config.train_epochs * len(train_dataset) / (config.train_micro_batch_size * config.gradient_accumulation_steps))
lr_scheduler = torch.optim.lr_scheduler.StepLR(
optimizer=optimizer, step_size=1, gamma=0.9
)
tokenizer = AutoTokenizer.from_pretrained(
"THUDM/chatglm-6b", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# The default tokenizer does not treat [gMASK] as a single token, so it has to be added manually
tokenizer.unique_no_split_tokens.append("[gMASK]")
# Convert to CoLLie Dataset
@@ -117,5 +115,4 @@
)),
evaluators=[evaluator_ppl, evaluator_bleu]
)
trainer.train()
# trainer.save_checkpoint(path="/mnt/petrelfs/zhangshuo/model/test_save_checkpoint", mode="model")
trainer.train()
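A side note on the `StepLR` schedule kept in this file: with `step_size=1` and `gamma=0.9`, the learning rate shrinks by 10% per scheduler step, i.e. lr_n = 2e-5 * 0.9**n. A tiny illustration (plain arithmetic, not part of the commit):

```python
# Decay of the StepLR schedule used above: lr_n = base_lr * gamma ** n.
base_lr, gamma = 2e-5, 0.9
for n in range(4):
    print(n, base_lr * gamma ** n)  # 2e-05, 1.8e-05, 1.62e-05, 1.458e-05 (approximately)
```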
24 changes: 5 additions & 19 deletions examples/finetune_llama_for_classification.py
@@ -17,21 +17,12 @@
"fp16": {
"enabled": True
},
# "monitor_config": {
# "enabled": True,
# "wandb": {
# "enabled": True,
# "team": "00index",
# "project": "collie",
# "group": "test_evaluator"
# }
# },
"zero_optimization": {
"stage": 3,
}
}
config.seed = 1024
model = LlamaForCausalLM.from_pretrained("/mnt/petrelfs/zhangshuo/model/llama-7b-hf", config=config)
model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", config=config)
# model = LlamaForCausalLM.from_config(config)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000)
@@ -42,11 +33,6 @@
"output": "positive." if sample["label"] else "negative."
} for sample in load_dataset("imdb", split="train")
]
# train_dataset = [
# {
# "text": f"Comment: {sample['text']}. The sentiment of this comment is: {'positive.' if sample['label'] else 'negative.'}",
# } for sample in load_dataset("imdb", split="train")
# ]
### Prepare perplexity evaluation dataset
ratio = 0.01
eval_dataset_ppl, train_dataset = train_dataset[:int(len(train_dataset) * ratio)], train_dataset[int(len(train_dataset) * ratio):]
@@ -60,11 +46,11 @@
][:1000]
### Convert to CoLLie Dataset
traine_dataset = CollieDatasetForTraining(train_dataset,
tokenizer=LlamaTokenizer.from_pretrained("/mnt/petrelfs/zhangshuo/model/llama-7b-hf", add_eos_token=True))
tokenizer=LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf", add_eos_token=True))
eval_dataset_ppl = CollieDatasetForTraining(eval_dataset_ppl,
tokenizer=LlamaTokenizer.from_pretrained("/mnt/petrelfs/zhangshuo/model/llama-7b-hf", add_eos_token=True))
tokenizer=LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf", add_eos_token=True))
eval_dataset_cls = CollieDatasetForClassification(eval_dataset_cls,
tokenizer=LlamaTokenizer.from_pretrained("/mnt/petrelfs/zhangshuo/model/llama-7b-hf", add_eos_token=True))
tokenizer=LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf", add_eos_token=True))
### Prepare Evaluator
evaluator_ppl = EvaluatorForPerplexity(
model=model,
@@ -101,7 +87,7 @@
MemoryMonitor(config),
LRMonitor(config)
],
data_provider=GradioProvider(LlamaTokenizer.from_pretrained("/mnt/petrelfs/zhangshuo/model/llama-7b-hf"), port=12300, stream=True),
data_provider=GradioProvider(LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf"), port=12300, stream=True),
evaluators=[evaluator_ppl, evaluator_cls]
)
trainer.train()
5 changes: 2 additions & 3 deletions examples/finetune_llama_for_summary.py
@@ -1,5 +1,5 @@
import sys
sys.path.append("../")
sys.path.append("..")
from typing import Dict

import argparse
@@ -78,8 +78,7 @@ def load_data(path_dict):
tokenizer=tokenizer)

# Prepare model
model = LlamaForCausalLM.from_pretrained(
"decapoda-research/llama-7b-hf", config=config)
model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", config=config)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
lr_scheduler = torch.optim.lr_scheduler.StepLR(
optimizer=optimizer, step_size=1, gamma=0.9
11 changes: 4 additions & 7 deletions examples/finetune_llama_for_translation.py
@@ -5,7 +5,8 @@
from transformers import LlamaTokenizer, GenerationConfig
from collie import Trainer, EvaluatorForPerplexity, LlamaForCausalLM, CollieConfig, PPLMetric, AccuracyMetric, DecodeMetric, CollieDatasetForTraining, CollieDatasetForGeneration, \
LossMonitor, TGSMonitor, MemoryMonitor, EvalMonitor, GradioProvider, EvaluatorForGeneration, LRMonitor, BleuMetric, DashProvider
config = CollieConfig.from_pretrained("/mnt/petrelfs/zhangshuo/model/llama-7b-hf")

config = CollieConfig.from_pretrained("decapoda-research/llama-7b-hf")
config.pp_size = 8
config.train_micro_batch_size = 1
config.eval_batch_size = 1
@@ -46,16 +47,12 @@
} for sample in load_dataset("iwslt2017", name="iwslt2017-fr-en", split="train[100:150]")
]
# Prepare model
model = LlamaForCausalLM.from_pretrained(
"/mnt/petrelfs/zhangshuo/model/llama-7b-hf", config=config)
model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", config=config)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
# optimizer=optimizer, T_max=config.train_epochs * len(train_dataset) / (config.train_micro_batch_size * config.gradient_accumulation_steps))
lr_scheduler = torch.optim.lr_scheduler.StepLR(
optimizer=optimizer, step_size=1, gamma=0.9
)
tokenizer = LlamaTokenizer.from_pretrained(
"/mnt/petrelfs/zhangshuo/model/llama-7b-hf", add_eos_token=False, add_bos_token=False)
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf", add_eos_token=False, add_bos_token=False)
# Convert to CoLLie Dataset
train_dataset = CollieDatasetForTraining(train_dataset,
tokenizer=tokenizer)
10 changes: 1 addition & 9 deletions examples/finetune_moss_for_training.py
@@ -3,17 +3,11 @@
"""
import sys
sys.path.append('..')

import os
import json
import torch

from transformers import AutoTokenizer

from collie.config import CollieConfig

from collie.data import CollieDatasetForTraining
from collie.data import CollieDataLoader

from collie.optim.lomo import Lomo

@@ -23,11 +17,9 @@
from collie.models.moss_moon import Moss003MoonForCausalLM

from collie.utils.monitor import StepTimeMonitor, TGSMonitor, MemoryMonitor, LossMonitor, EvalMonitor
from collie.metrics import DecodeMetric, PPLMetric, BleuMetric
from collie.metrics import DecodeMetric, PPLMetric
from collie.module import GPTLMLoss

from collie.utils import env

# 1. Set up paths
# 1.1 Path to the pretrained model
pretrained_model = "fnlp/moss-moon-003-sft"
9 changes: 2 additions & 7 deletions examples/further_pretrain_llama/expand_vocab.py
@@ -30,13 +30,9 @@
"group": "further_pretrain_llama"
}
},
# "zero_optimization": {
# "stage": 1,
# }
}
# Merge the vocabularies
llama_tokenizer = LlamaTokenizer.from_pretrained(
"/mnt/petrelfs/zhangshuo/model/llama-7b-hf")
llama_tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
chinese_sp_model = spm.SentencePieceProcessor()
chinese_sp_model.Load("./chinese_sp.model")
llama_spm = sp_pb2_model.ModelProto()
@@ -72,8 +68,7 @@
eval_dataset, train_dataset = dataset[:int(
len(dataset) * ratio)], dataset[int(len(dataset) * ratio):]
# Prepare the model and resize the embedding layer; train only the embedding and lm_head layers to speed up convergence
model = LlamaForCausalLM.from_pretrained(
"/mnt/petrelfs/zhangshuo/model/llama-7b-hf", config=config)
model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", config=config)
model.resize_token_embeddings(len(llama_tokenizer) + 7)  # round the vocab size up
for p in model.parameters():
p.requires_grad = False
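The hunk above sets up the LLaMA tokenizer and the Chinese SentencePiece model, but the merge loop itself falls outside the shown context. A hedged sketch of how such a vocabulary merge is commonly written with the sentencepiece protobuf API; the repository's exact code may differ:

```python
# Sketch of merging a Chinese SentencePiece vocabulary into the LLaMA vocabulary.
# This mirrors the common recipe and is not necessarily the repo's exact code.
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
from transformers import LlamaTokenizer

# Setup as in the hunk above (same model id and sp model path as the diff).
llama_tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
chinese_sp_model = spm.SentencePieceProcessor()
chinese_sp_model.Load("./chinese_sp.model")

# Parse both vocabularies into their protobuf representations.
llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())
chinese_spm = sp_pb2_model.ModelProto()
chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())

# Append every piece from the Chinese model that LLaMA does not already have.
existing = {p.piece for p in llama_spm.pieces}
for p in chinese_spm.pieces:
    if p.piece not in existing:
        new_piece = sp_pb2_model.ModelProto().SentencePiece()
        new_piece.piece = p.piece
        new_piece.score = 0.0
        llama_spm.pieces.append(new_piece)
```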
10 changes: 4 additions & 6 deletions examples/one_sentence_overfitting/3d_parallelism.py
@@ -1,5 +1,5 @@
import sys
sys.path.append("../../")
sys.path.append("../..")
from collie.models.llama.model import LlamaForCausalLM
from collie.controller import Trainer, EvaluatorForGeneration
from collie.metrics.decode import DecodeMetric
@@ -9,12 +9,10 @@
from transformers.generation.utils import GenerationConfig
import torch

tokenizer = LlamaTokenizer.from_pretrained("/mnt/petrelfs/zhangshuo/model/llama-7b-hf",
padding_side="left",
add_eos_token=False)
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf", padding_side="left", add_eos_token=False)
tokenizer.bos_token_id = 1
tokenizer.eos_token_id = 2
config = CollieConfig.from_pretrained("/mnt/petrelfs/zhangshuo/model/llama-7b-hf")
config = CollieConfig.from_pretrained("decapoda-research/llama-7b-hf")
config.tp_size = 4
config.dp_size = 1
config.pp_size = 2
@@ -31,7 +29,7 @@
}
}

model = LlamaForCausalLM.from_pretrained("/mnt/petrelfs/zhangshuo/model/llama-7b-hf", config=config)
model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", config=config)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
train_sample = tokenizer("Collie is a python package for finetuning large language models.</s>", return_tensors="pt").input_ids.squeeze(0)
eval_sample = tokenizer("Collie is", return_tensors="pt")
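One detail implied by the parallel sizes set in this script (an observation, not part of the diff): combining tensor, data and pipeline parallelism means the launcher must provide tp_size * dp_size * pp_size ranks, i.e. 8 GPUs here:

```python
# World size implied by the 3D-parallel settings in the script above.
tp_size, dp_size, pp_size = 4, 1, 2
world_size = tp_size * dp_size * pp_size
print(world_size)  # launch this example with 8 processes/GPUs
```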

