import json
import os
import sys

from nlp_architect.data.intent_datasets import IntentDataset


class RasaNlu(IntentDataset):
    """
    RASA NLU dataset class.

    Args:
        path (str): path to the dataset directory
        train_file (str): name of the training JSON file inside ``path``
        test_file (str): name of the test JSON file inside ``path``
        sentence_length (int, optional): max sentence length
        word_length (int, optional): max word length
    """

    def __init__(self, path, train_file, test_file, sentence_length=30, word_length=12):
        if path is None or not os.path.isdir(path):
            print('invalid path for RasaNlu dataset loader')
            sys.exit(1)  # exit with a non-zero code so callers see the failure
        self.dataset_root = path
        self.train_file = train_file
        self.test_file = test_file
        train_set_raw, test_set_raw = self._load_dataset()
        super(RasaNlu, self).__init__(sentence_length=sentence_length,
                                      word_length=word_length)
        self._load_data(train_set_raw, test_set_raw)

    def _load_dataset(self):
        """Flatten the per-intent dicts into (tokens, tags, intent) triples."""
        train_data = self._load_intents(self.train_file)
        test_data = self._load_intents(self.test_file)
        train = [(t, l, i) for i in sorted(train_data)
                 for t, l in train_data[i]]
        test = [(t, l, i) for i in sorted(test_data) for t, l in test_data[i]]
        return train, test
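
    # As a hypothetical illustration of the flattening above (the tokens are
    # made up, not taken from any dataset):
    #   {'greet': [(['你好'], ['O'])]}  ->  [(['你好'], ['O'], 'greet')]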

    def _load_intents(self, file):
        fname = os.path.join(self.dataset_root, file)
        with open(fname, 'rb') as load_f:
            load_dict = json.load(load_f)
        data = load_dict['rasa_nlu_data']
        # examples may be split across any of these three RASA NLU lists
        common_examples = data.get("common_examples", [])
        intent_examples = data.get("intent_examples", [])
        entity_examples = data.get("entity_examples", [])
        all_examples = common_examples + intent_examples + entity_examples
        train_data = {}
        for ex in all_examples:
            intent = ex.get("intent")
            if intent is None:
                continue  # skip examples that carry no intent label
            # group sentences by intent; parsing appends to the intent's list,
            # so examples of one intent need not be contiguous in the file
            if intent not in train_data:
                train_data[intent] = []
            self._parse_json_jieba(ex, train_data[intent])
        return train_data
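
    # A minimal sketch of the JSON layout _load_intents expects (the field
    # values are illustrative, not from a real dataset):
    #   {
    #     "rasa_nlu_data": {
    #       "common_examples": [
    #         {"text": "订一张去北京的机票",
    #          "intent": "book_flight",
    #          "entities": [{"start": 4, "end": 6,
    #                        "entity": "city", "value": "北京"}]}
    #       ]
    #     }
    #   }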

    def _parse_json_jieba(self, data, sentences):
        # lazy import keeps jieba optional until parsing is actually needed
        import jieba
        text = data['text'].strip()
        # tags start out character-level because RASA entity offsets count
        # characters, not tokens
        tags = ['O'] * len(text)
        new_tokens = list(jieba.tokenize(text))
        tokens = [word for (word, start, end) in new_tokens]
        for s in data.get('entities', []):
            ent = s.get('entity', None)
            start = s.get('start', None)
            end = s.get('end', None)
            # tag the span with its character length (end - start), not the
            # number of keys in the entity dict
            tags[start:end] = self._create_tags(ent, end - start)
        # project character-level tags onto word tokens using each token's
        # start offset, so tokens and tags have equal length
        token_tags = [tags[start] for (word, start, end) in new_tokens]
        sentences.append((tokens, token_tags))
        return sentences

    @staticmethod
    def _create_tags(tag, length):
        # BIO scheme: the first position gets 'B-', every later one 'I-'
        labels = ['B-' + tag]
        if length > 1:
            for _ in range(length - 1):
                labels.append('I-' + tag)
        return labels
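

if __name__ == '__main__':
    # Minimal usage sketch, assuming a directory 'data' holding RASA NLU
    # JSON files named 'train.json' and 'test.json' (all three names here
    # are hypothetical):
    #   dataset = RasaNlu('data', 'train.json', 'test.json')
    # Quick self-check of the BIO tagger:
    print(RasaNlu._create_tags('city', 3))  # ['B-city', 'I-city', 'I-city']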