-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Finish parseCantoneseReadings function and add tests
- Loading branch information
Showing 2 changed files with 139 additions and 4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import test from 'ava'; | ||
|
||
import { parseCantoneseReadings } from '../util/parseCantoneseReadings.js'; | ||
|
||
/**
 * One fixture for `parseCantoneseReadings`.
 *
 * @typedef {Object} TestCase
 * @property {string} text Source text passed as the first argument.
 * @property {string} reading Space-separated readings passed as the second argument.
 * @property {ReturnType<typeof parseCantoneseReadings>} expected Pairs the call must return.
 */

/**
 * Fixtures covering pure CJK text, text that starts with a Latin run, and
 * mixed CJK/Latin text ending in punctuation.
 *
 * @type {TestCase[]}
 */
const testCases = [
  {
    text: '福州',
    reading: 'fuk1 zau1',
    expected: [
      { text: '福', reading: 'fuk1' },
      { text: '州', reading: 'zau1' },
    ],
  },
  {
    // A leading Latin run ("bu") must stay together as a single token.
    text: 'bu你阿麼',
    reading: 'bu4 ni5 aa3 mo1',
    expected: [
      { text: 'bu', reading: 'bu4' },
      { text: '你', reading: 'ni5' },
      { text: '阿', reading: 'aa3' },
      { text: '麼', reading: 'mo1' },
    ],
  },
  {
    // The text ends with a full-width "?" while the reading ends with an
    // ASCII "?"; each is paired as its own token.
    text: '你get唔get到我講咩?',
    reading: 'nei5 get1 m4 get1 dou2 ngo5 gong2 me1?',
    expected: [
      { text: '你', reading: 'nei5' },
      { text: 'get', reading: 'get1' },
      { text: '唔', reading: 'm4' },
      { text: 'get', reading: 'get1' },
      { text: '到', reading: 'dou2' },
      { text: '我', reading: 'ngo5' },
      { text: '講', reading: 'gong2' },
      { text: '咩', reading: 'me1' },
      { text: '?', reading: '?' },
    ],
  },
];
|
||
// Register one AVA test per fixture. The title embeds the inputs so that a
// failing case can be identified directly from the test report.
for (const { text, reading, expected } of testCases) {
  test(`parseCantoneseReadings: ${text} ${reading}`, (t) => {
    const result = parseCantoneseReadings(text, reading);
    t.deepEqual(result, expected);
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,91 @@ | ||
/**
 * Pairs each token of a text with the reading found at the same position.
 *
 * Both `text` and `readings` are tokenized with `splitString`, then the two
 * token lists are zipped by index.
 *
 * @example text: "你get唔get到我講咩?"
 * reading: "nei5 get1 m4 get1 dou2 ngo5 gong2 me1?"
 * =>
 * [{text: "你", reading: "nei5"}, {text: "get", reading: "get1"}, ...]
 * @param {string} text
 * @param {string} readings
 * @returns {{text: string, reading: string}[]}
 * @throws {Error} When `text` and `readings` split into a different number of tokens.
 */
function parseCantoneseReadings(text, readings) {
  // Distinct punctuation marks handed to the tokenizer (duplicates removed).
  const punctuations = [',', '。', '.', '?', '?', '!', ';', ':', '、'];

  const textArray = splitString(text, punctuations);
  const readingsArray = splitString(readings, punctuations);

  // The pairing is purely positional, so a length mismatch means the inputs
  // cannot be aligned.
  if (textArray.length !== readingsArray.length) {
    throw new Error('Text and readings do not match');
  }

  return textArray.map((segment, index) => ({
    text: segment,
    reading: readingsArray[index],
  }));
}
|
||
/**
 * Splits a string into tokens.
 *
 * - Each run of consecutive ASCII letters/digits becomes one token.
 * - Every other non-whitespace character becomes a token of its own.
 * - Whitespace only separates tokens and never appears in the result.
 *
 * @example splitString('你get唔get到', []) => ['你', 'get', '唔', 'get', '到']
 * @example splitString('gong2 me1?', []) => ['gong2', 'me1', '?']
 * @param {string} input
 * @param {string[]} [punctuations] Accepted for call compatibility only. It is
 *   not consulted, because every non-alphanumeric character is already emitted
 *   as a separate token, which covers any punctuation mark.
 * @returns {string[]}
 */
function splitString(input, punctuations = []) {
  const alphanumeric = /[a-zA-Z0-9]/;
  const tokens = [];
  let run = '';

  // Emits the pending alphanumeric run, if any, and starts a new one.
  const flushRun = () => {
    if (run) {
      tokens.push(run);
      run = '';
    }
  };

  // `for...of` walks code points, so characters outside the BMP stay intact.
  for (const char of input) {
    if (alphanumeric.test(char)) {
      run += char;
      continue;
    }
    flushRun();
    // A whitespace character trims to '' and is dropped; anything else is a
    // single-character token.
    if (char.trim()) {
      tokens.push(char);
    }
  }
  flushRun();

  return tokens;
}
|
||
export { parseCantoneseReadings }; |