Skip to content

Commit

Permalink
Fix more readings parse cases, move into folder
Browse files Browse the repository at this point in the history
  • Loading branch information
MarvNC committed Jan 20, 2024
1 parent 8d5d702 commit 4338c12
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 28 deletions.
39 changes: 38 additions & 1 deletion src/test/parseCantoneseReadings.test.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import test from 'ava';

import { parseCantoneseReadings } from '../util/parseCantoneseReadings.js';
import { parseCantoneseReadings } from '../util/textHandling/parseCantoneseReadings.js';

/**
* @typedef {Object} TestCase
Expand Down Expand Up @@ -46,6 +46,43 @@ const testCases = [
{ text: '?', reading: '?' },
],
},
{
text: '專業運動員成日斷韌帶。',
reading: 'zyun1 jip6 wan6 dung6 jyun4 seng4 jat6 tyun5 jan6 daai2.',
expected: [
{ text: '專', reading: 'zyun1' },
{ text: '業', reading: 'jip6' },
{ text: '運', reading: 'wan6' },
{ text: '動', reading: 'dung6' },
{ text: '員', reading: 'jyun4' },
{ text: '成', reading: 'seng4' },
{ text: '日', reading: 'jat6' },
{ text: '斷', reading: 'tyun5' },
{ text: '韌', reading: 'jan6' },
{ text: '帶', reading: 'daai2' },
{ text: '。', reading: '.' },
],
},
{
text: '佢考咗車牌六年,終於成功嘞。',
reading: 'keoi5 haau2 zo2 ce1 paai4 luk6 nin4 zung1 jyu1 sing4 gung1 laak3',
expected: [
{ text: '佢', reading: 'keoi5' },
{ text: '考', reading: 'haau2' },
{ text: '咗', reading: 'zo2' },
{ text: '車', reading: 'ce1' },
{ text: '牌', reading: 'paai4' },
{ text: '六', reading: 'luk6' },
{ text: '年', reading: 'nin4' },
{ text: ',', reading: '' },
{ text: '終', reading: 'zung1' },
{ text: '於', reading: 'jyu1' },
{ text: '成', reading: 'sing4' },
{ text: '功', reading: 'gung1' },
{ text: '嘞', reading: 'laak3' },
{ text: '。', reading: '' },
],
},
];

for (const { text, reading, expected } of testCases) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
import {
punctuations,
isHanzi,
isJyuutping,
isPunctuation,
} from './textUtils.js';

/**
* Parses a text string into an array matching each character to the readings
* @example text: "你get唔get到我講咩?"
Expand All @@ -13,37 +20,41 @@ function parseCantoneseReadings(text, readings) {
* @type {{text: string, reading: string}[]}
*/
const resultArray = [];
let textIndex = 0;
let readingIndex = 0;
const punctuations = [
',',
',',
'。',
'.',
'?',
'?',
'!',
'!',
';',
';',
':',
':',
'、',
',',
];

const textArray = splitString(text, punctuations);
const readingsArray = splitString(readings, punctuations);
if (textArray.length !== readingsArray.length) {
throw new Error('Text and readings do not match');

let readingIndex = 0;
let textIndex = 0;
for (let i = 0; i < Math.max(textArray.length, readingsArray.length); i++) {
const text = textArray[textIndex];
const reading = readingsArray[readingIndex];
const isTextHanzi = isHanzi(text);
const isReadingJyuutping = isJyuutping(reading);
const isTextPunctuation = isPunctuation(text);
const isReadingPunctuation = isPunctuation(reading);
// Ideal case
if (
(isTextHanzi && isReadingJyuutping) ||
(isTextPunctuation && isReadingPunctuation) ||
// Case where for example text is 'bu' and reading is 'bu4'
(!isTextHanzi && !isTextPunctuation && isReadingJyuutping)
) {
resultArray.push({ text, reading });
textIndex++;
readingIndex++;
} else if (isTextPunctuation && isReadingJyuutping) {
// Send empty string to reading
resultArray.push({ text, reading: '' });
textIndex++;
} else {
throw new Error(
`Unexpected text "${text}" and reading "${reading}" at index ${i}`
);
}
}
return textArray.map((text, index) => {
const reading = readingsArray[index];
return {
text,
reading,
};
});

return resultArray;
}

/**
Expand Down
48 changes: 48 additions & 0 deletions src/util/textHandling/textUtils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/**
 * Punctuation marks recognised when handling text and readings.
 *
 * Marks that exist in both a fullwidth (CJK) and a halfwidth (ASCII) form are
 * listed as a pair, fullwidth first. The fullwidth forms are written as
 * escapes so that they cannot be folded into their ASCII look-alikes by
 * Unicode normalisation, which would leave exact duplicates in the list and
 * stop fullwidth punctuation from being recognised.
 * @type {string[]}
 */
const punctuations = [
  '\uff0c', // fullwidth comma
  ',',
  '。',
  '.',
  '\uff1f', // fullwidth question mark
  '?',
  '\uff01', // fullwidth exclamation mark
  '!',
  '\uff1b', // fullwidth semicolon
  ';',
  '\uff1a', // fullwidth colon
  ':',
  '、',
];

/**
 * Returns true if the text contains at least one CJK character.
 *
 * This is a "contains" check, not a whole-string match: a single matching
 * character anywhere in the text is enough. The accepted ranges are:
 * - hiragana and katakana (U+3040-U+30FF) and halfwidth katakana
 *   (U+FF66-U+FF9F), so kana also counts despite the function name;
 * - CJK Unified Ideographs Extension A (U+3400-U+4DBF);
 * - CJK Unified Ideographs (U+4E00-U+9FFF);
 * - CJK Compatibility Ideographs (U+F900-U+FAFF);
 * - supplementary-plane ideographs (U+20000-U+2FA1F and U+30000-U+323AF),
 *   which include characters used in written Cantonese.
 * @param {string} text
 * @returns {boolean}
 */
function isHanzi(text) {
  // The `u` flag makes the pattern work on code points, which is required to
  // express ranges above U+FFFF; without it those characters are seen as two
  // surrogate code units and never match.
  return /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f\u{20000}-\u{2fa1f}\u{30000}-\u{323af}]/u.test(
    text
  );
}

/**
 * Reports whether the text looks like a romanised (Jyutping) reading, i.e.
 * whether it contains at least one ASCII letter or digit anywhere.
 *
 * No syllable or tone validation is performed.
 *
 * NOTE: the regex engine converts its argument to a string, so a missing
 * reading (`undefined`) is tested as "undefined" and yields true. Callers may
 * depend on that; confirm before tightening this check.
 * @param {string} text
 * @returns {boolean}
 */
function isJyuutping(text) {
  const firstAlphanumeric = /[a-zA-Z0-9]/.exec(text);
  return firstAlphanumeric !== null;
}

/**
 * Reports whether the text is exactly one of the entries of the module-level
 * `punctuations` list.
 *
 * The comparison is against the whole string, so text that merely contains a
 * punctuation mark (for example "a,") does not count.
 * @param {string} text
 * @returns {boolean}
 */
function isPunctuation(text) {
  return punctuations.some((mark) => mark === text);
}

export { punctuations, isHanzi, isJyuutping, isPunctuation };

0 comments on commit 4338c12

Please sign in to comment.