Fix more readings parse cases, move into folder

MarvNC · Jan 20, 2024 · 4338c12 · 4338c12
1 parent 8d5d702
commit 4338c12
Show file tree

Hide file tree

Showing 3 changed files with 124 additions and 28 deletions.
diff --git a/src/test/parseCantoneseReadings.test.js b/src/test/parseCantoneseReadings.test.js
@@ -1,6 +1,6 @@
 import test from 'ava';
 
-import { parseCantoneseReadings } from '../util/parseCantoneseReadings.js';
+import { parseCantoneseReadings } from '../util/textHandling/parseCantoneseReadings.js';
 
 /**
  * @typedef {Object} TestCase
@@ -46,6 +46,43 @@ const testCases = [
  { text: '？', reading: '?' },
  ],
  },
+ {
+ text: '專業運動員成日斷韌帶。',
+ reading: 'zyun1 jip6 wan6 dung6 jyun4 seng4 jat6 tyun5 jan6 daai2.',
+ expected: [
+ { text: '專', reading: 'zyun1' },
+ { text: '業', reading: 'jip6' },
+ { text: '運', reading: 'wan6' },
+ { text: '動', reading: 'dung6' },
+ { text: '員', reading: 'jyun4' },
+ { text: '成', reading: 'seng4' },
+ { text: '日', reading: 'jat6' },
+ { text: '斷', reading: 'tyun5' },
+ { text: '韌', reading: 'jan6' },
+ { text: '帶', reading: 'daai2' },
+ { text: '。', reading: '.' },
+ ],
+ },
+ {
+ text: '佢考咗車牌六年，終於成功嘞。',
+ reading: 'keoi5 haau2 zo2 ce1 paai4 luk6 nin4 zung1 jyu1 sing4 gung1 laak3',
+ expected: [
+ { text: '佢', reading: 'keoi5' },
+ { text: '考', reading: 'haau2' },
+ { text: '咗', reading: 'zo2' },
+ { text: '車', reading: 'ce1' },
+ { text: '牌', reading: 'paai4' },
+ { text: '六', reading: 'luk6' },
+ { text: '年', reading: 'nin4' },
+ { text: '，', reading: '' },
+ { text: '終', reading: 'zung1' },
+ { text: '於', reading: 'jyu1' },
+ { text: '成', reading: 'sing4' },
+ { text: '功', reading: 'gung1' },
+ { text: '嘞', reading: 'laak3' },
+ { text: '。', reading: '' },
+ ],
+ },
 ];
 
 for (const { text, reading, expected } of testCases) {

diff --git a/src/util/parseCantoneseReadings.js → ...il/textHandling/parseCantoneseReadings.js b/src/util/parseCantoneseReadings.js → ...il/textHandling/parseCantoneseReadings.js
@@ -1,3 +1,10 @@
+import {
+ punctuations,
+ isHanzi,
+ isJyuutping,
+ isPunctuation,
+} from './textUtils.js';
+
 /**
  * Parses a text string into an array matching each character to the readings
  * @example text: "你get唔get到我講咩？"
@@ -13,37 +20,41 @@ function parseCantoneseReadings(text, readings) {
  * @type {{text: string, reading: string}[]}
  */
  const resultArray = [];
- let textIndex = 0;
- let readingIndex = 0;
- const punctuations = [
- '，',
- ',',
- '。',
- '.',
- '？',
- '?',
- '！',
- '!',
- '；',
- ';',
- '：',
- ':',
- '、',
- ',',
- ];
 
  const textArray = splitString(text, punctuations);
  const readingsArray = splitString(readings, punctuations);
- if (textArray.length !== readingsArray.length) {
- throw new Error('Text and readings do not match');
+
+ let readingIndex = 0;
+ let textIndex = 0;
+ for (let i = 0; i < Math.max(textArray.length, readingsArray.length); i++) {
+ const text = textArray[textIndex];
+ const reading = readingsArray[readingIndex];
+ const isTextHanzi = isHanzi(text);
+ const isReadingJyuutping = isJyuutping(reading);
+ const isTextPunctuation = isPunctuation(text);
+ const isReadingPunctuation = isPunctuation(reading);
+ // Ideal case
+ if (
+ (isTextHanzi && isReadingJyuutping) ||
+ (isTextPunctuation && isReadingPunctuation) ||
+ // Case where for example text is 'bu' and reading is 'bu4'
+ (!isTextHanzi && !isTextPunctuation && isReadingJyuutping)
+ ) {
+ resultArray.push({ text, reading });
+ textIndex++;
+ readingIndex++;
+ } else if (isTextPunctuation && isReadingJyuutping) {
+ // Send empty string to reading
+ resultArray.push({ text, reading: '' });
+ textIndex++;
+ } else {
+ throw new Error(
+ `Unexpected text "${text}" and reading "${reading}" at index ${i}`
+ );
+ }
  }
- return textArray.map((text, index) => {
- const reading = readingsArray[index];
- return {
- text,
- reading,
- };
- });
+
+ return resultArray;
 }
 
 /**

diff --git a/src/util/textHandling/textUtils.js b/src/util/textHandling/textUtils.js
@@ -0,0 +1,48 @@
+const punctuations = [ // Punctuation marks matched by isPunctuation and exported for reuse.
+ '，',
+ ',',
+ '。',
+ '.',
+ '？',
+ '?',
+ '！',
+ '!',
+ '；',
+ ';',
+ '：',
+ ':',
+ '、',
+ ',', // NOTE(review): appears to duplicate the ',' entry above - confirm intent.
+ '，', // NOTE(review): appears to duplicate the first entry - confirm intent.
+];
+
+/**
+ * Returns true if the text contains at least one character in the ranges
+ * below. The pattern is unanchored, so a longer string matches as soon as
+ * any single UTF-16 code unit falls inside one of them:
+ * - U+3040-U+30FF (Hiragana and Katakana blocks)
+ * - U+3400-U+4DBF (CJK Unified Ideographs Extension A)
+ * - U+4E00-U+9FFF (CJK Unified Ideographs)
+ * - U+F900-U+FAFF (CJK Compatibility Ideographs)
+ * - U+FF66-U+FF9F (halfwidth Katakana)
+ * Despite the name, kana therefore also count as "hanzi" here.
+ * NOTE(review): ideographs outside the Basic Multilingual Plane (e.g. CJK
+ * Extension B) are not covered by these ranges - confirm whether the
+ * Cantonese input can contain them.
+ * @param {string} text Text to test; `RegExp.prototype.test` coerces
+ * non-string values to a string first.
+ * @returns {boolean} Whether any character of `text` is in the ranges above.
+ */
+function isHanzi(text) {
+ return /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]/.test(
+ text
+ );
+}
+
+/**
+ * Returns true if the text contains at least one ASCII letter or digit.
+ * This is a loose heuristic for "looks like a Jyutping reading": it does not
+ * validate syllable structure or tone numbers, and the pattern is unanchored,
+ * so any string with a single alphanumeric character passes.
+ * NOTE(review): `RegExp.prototype.test` coerces its argument to a string, so
+ * `undefined` becomes "undefined" and yields true. A caller that passes an
+ * out-of-range array element will therefore see it classified as Jyutping -
+ * confirm whether callers depend on this before tightening the check.
+ * @param {string} text Text to test.
+ * @returns {boolean} Whether `text` contains a character matching [a-zA-Z0-9].
+ */
+function isJyuutping(text) {
+ return /[a-zA-Z0-9]/.test(text);
+}
+
+/**
+ * Returns true if the text is exactly one of the entries in `punctuations`.
+ * The whole string is compared against each entry (via `Array.prototype.includes`),
+ * so a string that merely contains a punctuation mark alongside other
+ * characters returns false.
+ * @param {string} text Candidate token, expected to be a single punctuation mark.
+ * @returns {boolean} Whether `text` equals an entry of `punctuations`.
+ */
+function isPunctuation(text) {
+ return punctuations.includes(text);
+}
+
+export { punctuations, isHanzi, isJyuutping, isPunctuation };