forked from bakwc/JamSpell
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_jamspell.py
44 lines (37 loc) · 1.5 KB
/
test_jamspell.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
import pytest
from pytest import approx
import jamspell
from evaluate import generate_dataset
from evaluate.evaluate import evaluateJamspell
def removeFile(fname):
    """Delete the file at ``fname`` on a best-effort basis.

    Any ``OSError`` raised by the deletion (for instance when the file does
    not exist) is swallowed, so the helper is safe to call unconditionally
    during cleanup. Always returns ``None``.
    """
    try:
        os.unlink(fname)
    except OSError:
        # Nothing to clean up, or the file could not be removed: ignore.
        return
# Scratch artefacts used by the tests in this module; teardown_module deletes
# all four files once the tests are done.

# Model file passed to training and evaluation, plus a companion '.spell'
# file that is cleaned up alongside it.
TEMP_MODEL = 'temp_model.bin'
TEMP_SPELL = TEMP_MODEL + '.spell'

# Prefix handed to the dataset generator; the train/test file names below
# are derived from it.
TEMP = 'temp'
TEMP_TEST = '{}_test.txt'.format(TEMP)
TEMP_TRAIN = '{}_train.txt'.format(TEMP)

# Directory (trailing slash included) prepended to source and alphabet names.
TEST_DATA = 'test_data/'
def teardown_module(module):
    """Module-level teardown hook: delete the temporary files of this module.

    Removes the model file, its '.spell' companion, and the generated test
    and train text files, in that order. Missing files are ignored by
    ``removeFile``. The ``module`` argument is not used.
    """
    for leftover in (TEMP_MODEL, TEMP_SPELL, TEMP_TEST, TEMP_TRAIN):
        removeFile(leftover)
def trainLangModel(trainText, alphabetFile, modelFile):
    """Train a language model using a fresh ``jamspell.TSpellCorrector``.

    The three arguments are forwarded unchanged to ``TrainLangModel``.
    Whatever that call returns is discarded; this helper returns ``None``.
    """
    jamspell.TSpellCorrector().TrainLangModel(trainText, alphabetFile, modelFile)
@pytest.mark.parametrize(
    'sourceFile,alphabetFile,expected',
    [
        (
            'sherlockholmes.txt',
            'alphabet_en.txt',
            (
                0.04538662682106836,
                0.6987951807228916,
                0.014246804944479363,
                0.013821441912588718,
                0.76592082616179,
            ),
        ),
        (
            'kapitanskaya_dochka.txt',
            'alphabet_ru.txt',
            (
                0.12330535829567463,
                0.391304347826087,
                0.03866565579984837,
                0.05358295674628793,
                0.4391304347826087,
            ),
        ),
    ],
)
def test_evaluation(sourceFile, alphabetFile, expected):
    """Generate a dataset, train a model on it, and compare evaluation results.

    For each parametrized case: build the train/test text files from the
    source text under ``TEST_DATA``, train a language model on the train
    file, evaluate it against the test file, and compare the evaluation
    output with the recorded ``expected`` tuple.
    """
    sourcePath = TEST_DATA + sourceFile
    alphabetPath = TEST_DATA + alphabetFile

    generate_dataset.generateDatasetTxt(sourcePath, TEMP)
    trainLangModel(TEMP_TRAIN, alphabetPath, TEMP_MODEL)
    measured = evaluateJamspell(TEMP_MODEL, TEMP_TEST, alphabetPath)

    # NOTE(review): rel=1 allows each value to deviate by up to 100% of its
    # expected number, so this check is very loose -- confirm that is intended.
    assert measured == approx(expected, rel=1)