# recipes/config_lora.yaml
#
# The path above is kept as a comment: as a bare scalar on the first line it
# would stop the rest of the file from loading as a top-level mapping.

# Model and tokenizer identifiers (both point at the same value here).
# NOTE(review): presumably Hugging Face Hub repo ids - confirm with the loader.
model_name_or_path: rhymes-ai/Aria
tokenizer_path: rhymes-ai/Aria

# Dataset config
# Maps each dataset path to a numeric value.
# NOTE(review): the values look like per-dataset sampling weights or
# proportions - confirm against the code that consumes dataset_mixer.
# The paths below are placeholders; replace them with real dataset locations.
dataset_mixer:
  'path/to/dataset1': 1
  'path/to/dataset2': 0.5
  'path/to/dataset3': 2
# Worker count for the data loader.
dataloader_num_workers: 8

# SFT trainer config
per_device_train_batch_size: 8
gradient_accumulation_steps: 2
output_dir: out/refcoco_moe_lora_980
bf16: true
torch_dtype: bfloat16
gradient_checkpointing: true
log_level: info
logging_steps: 1
save_strategy: epoch
# Resume from the latest checkpoint in output_dir if set to true.
resume_from_checkpoint: false

# Hyperparameters
num_train_epochs: 1
# Must stay quoted: a bare `no` is resolved as boolean false by YAML 1.1
# loaders, while the string 'no' is what is intended here.
eval_strategy: 'no'
# Written with an explicit decimal point in the mantissa: YAML 1.1 loaders
# (e.g. PyYAML) resolve a bare `5e-5` as a string, not a float.
learning_rate: 5.0e-5
weight_decay: 0.1
adam_beta2: 0.95
warmup_ratio: 0.01
lr_scheduler_type: "cosine"
max_image_size: 980
seed: 42
max_seq_length: 2048
report_to: wandb


# Model config
# MoE loss coefficients. Written with an explicit decimal point in the
# mantissa: YAML 1.1 loaders (e.g. PyYAML) resolve bare `1e-5` / `1e-4` as
# strings, not floats.
moe_z_loss_coeff: 1.0e-5
moe_aux_loss_coeff: 1.0e-4

# Per-component freeze switches.
# NOTE(review): presumably vit = vision encoder, llm = language model,
# projector = the connector between them - confirm against the training code.
freeze_vit: true
freeze_llm: false
# Optional: freeze only selected LLM layers (disabled by default).
# freeze_llm_layers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
freeze_projector: true

# LoRA / PEFT settings
use_peft: true
lora_r: 8
lora_alpha: 32
lora_dropout: 0.05
# Names of the modules that receive LoRA adapters.
# NOTE(review): matching semantics (exact name vs. suffix) depend on the
# PEFT version in use - confirm before adding or removing entries.
lora_target_modules:
  - fc1
  - fc2
  - q_proj
  - k_proj
  - v_proj
  - linear
  - o_proj
  - up_proj
  - down_proj
  - out_proj
  - gate_proj
  - lm_head