diff --git a/collie/data/dataset.py b/collie/data/dataset.py
index ec35b578..d48fd94b 100644
--- a/collie/data/dataset.py
+++ b/collie/data/dataset.py
@@ -198,12 +198,11 @@ def save_propressed(self, path: str, shard_size: int = 4):
         shard_idx = 0
         meta = np.empty((0, 2), int)
         for i in self.indices:
-            labels = self[i][1]
             data = {
-                "tokens": labels["labels"]
+                "tokens": self[i]["input_ids"]
             }
             data.update(
-                {key: value for key, value in labels.items() if key != "labels"})
+                {key: value for key, value in self[i].items() if key != "input_ids"})
             bytes_data = json.dumps(data).encode() + "\n".encode()
             offset = shard.tell()
             length = len(data["tokens"])
diff --git a/collie/models/llama/model.py b/collie/models/llama/model.py
index 4a0db442..44e9ef86 100644
--- a/collie/models/llama/model.py
+++ b/collie/models/llama/model.py
@@ -18,7 +18,7 @@
 from einops import rearrange
 
 try:
-    from flash_attn.flash_attention import FlashAttention
+    from flash_attn.modules.mha import SelfAttention as FlashAttention
 except ModuleNotFoundError:
     FlashAttention = None
 
@@ -194,7 +194,12 @@ def _forward(self,
             assert FlashAttention is not None, \
                 "Detected flash_attn is not installed. See https://github.com/HazyResearch/flash-attention"
             qkv = torch.stack([query, key, value], dim=2)
-            output, _ = FlashAttention()(qkv, key_padding_mask=attention_mask, causal=True)
+            output = FlashAttention()(qkv, key_padding_mask=attention_mask.bool(), causal=True)
+            """ flash_attn_2 note:
+            from flash_attn.modules.mha import SelfAttention as FlashAttention
+            requires attention_mask as a bool tensor
+            replace 'output, _ =' with 'output ='
+            """
             output = rearrange(output, "b n h d -> b n (h d)")
         else:
             query, key, value = query.permute(0, 2, 1, 3), key.permute(
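
The model.py change migrates the attention call from the flash_attn 1.x `FlashAttention` module to the flash_attn 2.x `SelfAttention` module, which returns a single tensor instead of an `(output, attn_weights)` tuple and expects the key padding mask as a bool tensor. The following is a minimal sketch of the new call convention, assuming flash-attn 2.x is installed; the tensor shapes and the `attention_mask` contents are illustrative, not taken from the model code.

```python
import torch
from flash_attn.modules.mha import SelfAttention as FlashAttention

# Illustrative shapes: batch 2, sequence length 16, 8 heads, head dim 64.
batch, seqlen, n_heads, head_dim = 2, 16, 8, 64
query = torch.randn(batch, seqlen, n_heads, head_dim)
key = torch.randn(batch, seqlen, n_heads, head_dim)
value = torch.randn(batch, seqlen, n_heads, head_dim)
attention_mask = torch.ones(batch, seqlen, dtype=torch.long)  # 1 = real token, 0 = padding

# Pack q/k/v along a new dim, as the patched _forward does: (b, s, 3, h, d).
qkv = torch.stack([query, key, value], dim=2)

# flash_attn 1.x: output, _ = FlashAttention()(qkv, key_padding_mask=attention_mask, causal=True)
# flash_attn 2.x (this patch): a single return value and a bool mask, True marking positions to keep.
output = FlashAttention()(qkv, key_padding_mask=attention_mask.bool(), causal=True)
print(output.shape)  # (b, s, h, d); the model then rearranges to (b, s, h * d)
```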
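
The dataset.py change builds each shard record directly from the dataset item instead of from the second element of a `(input, labels)` tuple. The sketch below mirrors the record construction for a single item; the contents of `item` are hypothetical and stand in for whatever `self[i]` returns after the change (a dict keyed by `input_ids` plus any extra fields).

```python
import json

# Hypothetical dataset item, standing in for self[i].
item = {"input_ids": [1, 15043, 3186, 2], "labels": [1, 15043, 3186, 2]}

# Build one JSON-lines record the way the patched save_propressed does:
# "tokens" holds the input ids, every other key is copied through unchanged.
data = {"tokens": item["input_ids"]}
data.update({key: value for key, value in item.items() if key != "input_ids"})

bytes_data = json.dumps(data).encode() + "\n".encode()
length = len(data["tokens"])
print(data, length)  # {'tokens': [...], 'labels': [...]} 4
```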