-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
58 lines (45 loc) · 2 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Copyright (c) 2024 Varlachev Valery
from datasets import DatasetDict, load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
class CodeDataset():
    """Tokenized train/valid/test splits of a code-to-docstring dataset.

    On construction the dataset named by ``DATASET_NAME`` is loaded, its
    ``train`` split is divided into train / valid / test parts, every example
    is rendered through ``INSTRUCTION`` and tokenized. The resulting
    ``DatasetDict`` is stored in ``self.ds`` and exposed through the
    ``train``, ``valid`` and ``test`` properties.
    """

    DATASET_NAME = 'calum/the-stack-smol-python-docstrings'
    # Prompt template: first %s is the code body, second %s is its docstring.
    INSTRUCTION = 'Describe what the following code does:\n```Python\n%s\n```\n# docstring\n%s'

    def __init__(self, model_name_or_path: str, max_seq_length: int, val_test_size: float = 0.2, test_size: float = 0.5, preprocessing_num_workers: int = 19):
        """Load, split and tokenize the dataset.

        Args:
            model_name_or_path: Identifier passed to
                ``AutoTokenizer.from_pretrained``.
            max_seq_length: Length every kept example is padded to; examples
                that tokenize to more tokens than this are dropped.
            val_test_size: ``test_size`` used for the first split, i.e. the
                share of the data held out for validation and test together.
            test_size: ``test_size`` used for the second split, i.e. the
                share of the held-out part that becomes the test split (the
                remainder becomes the validation split).
            preprocessing_num_workers: ``num_proc`` passed to ``map`` for
                tokenization.
        """
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, add_eos_token=True)

        raw = load_dataset(self.DATASET_NAME)
        # Two-stage split: carve off the held-out part first, then divide
        # that part into validation and test.
        train_and_holdout = raw['train'].train_test_split(test_size=val_test_size)
        valid_and_test = train_and_holdout['test'].train_test_split(test_size=test_size)
        ds = DatasetDict({
            'train': train_and_holdout['train'],
            'test': valid_and_test['test'],
            'valid': valid_and_test['train'],
        })

        def preprocess_function(examples):
            """Render a batch through INSTRUCTION and tokenize it."""
            prompts = [
                self.INSTRUCTION % (body, docstring)
                for body, docstring in zip(examples['body_without_docstring'], examples['docstring'])
            ]
            # Truncation is off on purpose: over-length examples are dropped
            # below instead of being cut in the middle of the docstring.
            encoded = self.tokenizer(prompts, padding='max_length', max_length=max_seq_length, truncation=False)
            # Keep only the rows that fit into max_seq_length. The same
            # length test is applied to every returned column.
            encoded = {
                key: [row for row in rows if len(row) <= max_seq_length]
                for key, rows in encoded.items()
            }
            # NOTE(review): labels mirror input_ids including padded
            # positions; confirm whether padding should be masked for the loss.
            encoded["labels"] = encoded["input_ids"].copy()
            return encoded

        # All original columns are removed, so the batched function may
        # return fewer rows than it received.
        self.ds = ds.map(
            preprocess_function,
            batched=True,
            num_proc=preprocessing_num_workers,
            remove_columns=ds['train'].column_names,
        )

    @property
    def train(self):
        """Tokenized training split."""
        return self.ds['train']

    @property
    def valid(self):
        """Tokenized validation split."""
        return self.ds['valid']

    @property
    def test(self):
        """Tokenized test split."""
        return self.ds['test']