From c756bd444c42dffbd979ab5797ad62e6c0adf30f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Wed, 24 Sep 2025 01:12:19 -0400 Subject: [PATCH 1/4] Update(CI): Streamline jyutping checker Resolves #202 Delegates responsibility to upstream Python checker --- .ci/checker.cpp | 113 ------------------------------ .ci/verify.php | 87 ----------------------- .github/workflows/deploy-test.yml | 25 ------- 3 files changed, 225 deletions(-) delete mode 100644 .ci/checker.cpp delete mode 100644 .ci/verify.php diff --git a/.ci/checker.cpp b/.ci/checker.cpp deleted file mode 100644 index a3dbcd4a..00000000 --- a/.ci/checker.cpp +++ /dev/null @@ -1,113 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#define JYUT phonemes[0] -#define INITIAL phonemes[1] -#define STEM phonemes[2] -#define FINAL phonemes[3] -#define TONE phonemes[4] -using namespace boost; - -std::string word, line, dict; -smatch phonemes; - -regex expr("(gw?|kw?|ng?|sh?|[bpmfdtlhzcjw])?(aa?|eo?|oe?|[iu]|yu|ng|m)([iumptk]|ng?)?([1-6])"); - -// CHANGE THIS -- a function to check if a jyutping word is valid -bool valid_jyutping(std::string a) -{ - // Has a jyutping-like structure -- catches most invalid spellings ('ch-', 'eu-', etc) - // If cannot be decomposed into exactly 4 groups, treat as invalid - if (!regex_match(a, phonemes, expr)) return false; - - /* Note: after the first regex_match, the jyutping string will be decomposed into phonemes acc to the following: - phonemes[0] = JYUT = full jyutping string - -------------------------------- - phonemes[1] = INITIAL = initial (empty if none) - phonemes[2] = STEM = stem (aa / a / e / i / o / u / eo / oe / yu / ng* / m*) - phonemes[3] = FINAL = final (empty if none; including -i and -u; thus -aai will be treated as -aa + -i) - phonemes[4] = TONE = tone - - *Only applies to /ng/ and /m/ - */ - - /* ------ ADDITIONAL CHECKING RULES ------ */ - - // eo followed by t / i / n - if (STEM == 
"eo" && !(FINAL == "t" || FINAL == "i" || FINAL == "n")) return false; - - // i / yu must begin with an initial, but allow ik1 and ik6 (hiccups) - if (JYUT == "ik1" || JYUT == "ik6") return true; - else if (INITIAL == "" && (STEM == "i" || STEM == "yu")) return false; - - // u must begin with an initial, except for /ung/ and /uk/ - if (INITIAL == "" && STEM == "u") return (FINAL == "ng" || FINAL == "k"); - - // Add more rules here... - - // catch invalid matches of /m/ and /ng/ - if ((STEM == "m" || STEM == "ng") && (INITIAL != "" && INITIAL != "h")) return false; - if ((STEM == "m" || STEM == "ng") && FINAL != "") return false; - - return true; -} - -//Strips down a jyutping string into words and passes to valid_jyutping() for checking -bool process(std::string a) -{ - std::istringstream sin(a); - while(sin >> word) { - if (!valid_jyutping(word)) return false; - } - return true; -} - -//Processes file and keeps track of discovered errors -int parse() -{ - int count_invalid = 0; - int line_no = 0; - - // Discard lines until '...' (i.e. 
end of header block) - while (true) { - getline(std::cin, line); - line_no++; - if (line == "...") break; - } - - // Real checking - while(getline(std::cin, line)) { - line_no++; - int temp = line.find('\t'); - std::string jyut_string = line.substr(temp+1, line.find('\t', temp + 1) - temp); - if (!process(jyut_string)) { - std::cerr << "[L" << line_no << "] Invalid Jyutping detected: " << jyut_string << std::endl; - count_invalid++; - } - } - - return count_invalid; -} - -// Wrapper for command line interface -int main (int argc, char** argv) -{ - std::ios_base::sync_with_stdio(false); - std::cin.tie(NULL); - - if (argc == 2) { - dict = argv[1]; - std::cout << "Loading " << dict << std::endl; - } else { - std::cerr << "Invalid input format" << std::endl; - std::cerr << "Use ./checker input_file_name.yaml" << std::endl; - return 1; - } - - std::freopen(dict.c_str(), "r", stdin); // redirect file to stdin stream - return parse(); -} diff --git a/.ci/verify.php b/.ci/verify.php deleted file mode 100644 index f92f10e5..00000000 --- a/.ci/verify.php +++ /dev/null @@ -1,87 +0,0 @@ -'; -$starttime = microtime(true); - -$fp = fopen('./jyut6ping3.dict.yaml', 'r'); -$errorcnt = 0; -$i = 0; -while($line = fgets($fp)) { - if (trim($line) === '' || $line[0] === '#') goto next; - if (strpos($line, "\t") === false) goto next; - - list($char, $val) = explode("\t", trim($line), 3); - - $syllable = strtok($val, " "); - while ($syllable !== false) { - $tone = substr($syllable, -1); - if ($tone !== '1' && $tone !== '2' && $tone !== '3' && $tone !== '4' && $tone !== '5' && $tone !== '6') { - goto error; - } - - $syllable = substr($syllable, 0, -1); - - if ($syllable == 'm' || $syllable == 'ng' || $syllable == 'hm' || $syllable == 'hng') goto success; - - $syllable = strtr($syllable, [ - 'q' => 'qqqq', 'v' => 'qqqq', - - 'aang' => 'v', 'aan' => 'v', 'aam' => 'v', - 'aak' => 'v', 'aat' => 'v', 'aap' => 'v', - 'aai' => 'v', 'aau' => 'v', 'aa' => 'v', - - 'ang' => 'v', 'an' => 'v', 'am' => 
'v', - 'ak' => 'v', 'at' => 'v', 'ap' => 'v', - 'ai' => 'v', 'au' => 'v', 'a' => 'v', - - 'em' => 'v', 'eng' => 'v', 'en' => 'v', - 'ep' => 'v', 'ek' => 'v', 'et' => 'v', - 'ei' => 'v', 'eu' => 'v', 'e' => 'v', - - 'ing' => 'v', 'in' => 'v', 'im' => 'v', - 'ik' => 'v', 'it' => 'v', 'ip' => 'v', - 'iu' => 'v', 'i' => 'v', - - 'oi' => 'v', 'ou' => 'v', - 'ong' => 'v', 'on' => 'v', 'ot' => 'v', 'ok' => 'v', 'om' => 'v', - 'o' => 'v', - - 'eoi' => 'v', 'eon' => 'v', 'eot' => 'v', - 'oeng' => 'v', 'oet' => 'v', 'oek' => 'v', 'oe' => 'v', - 'yun' => 'v', 'yut' => 'v', 'yu' => 'v', - - 'ui' => 'v', 'ung' => 'v', 'un' => 'v', 'ut' => 'v', 'uk' => 'v', - 'u' => 'v', - ]); - - $syllable = strtr($syllable, [ - 'gw' => 'q', 'kw' => 'q', - 'b' => 'q', 'p' => 'q', 'm' => 'q', 'f' => 'q', - 'd' => 'q', 't' => 'q', 'n' => 'q', 'l' => 'q', - 'g' => 'q', 'k' => 'q', 'ng' => 'q', 'h' => 'q', - 'z' => 'q', 'c' => 'q', 's' => 'q', - 'j' => 'q', 'w' => 'q' - ]); - - if ($syllable !== 'qv' && $syllable !== 'v') { - echo $syllable . " "; - goto error; - } - - - success: - $syllable = strtok(" "); - continue; - - error: - echo "Invalid Jyutping detected on line $i: $line" . "\r\n"; - $errorcnt++; - $syllable = strtok(" "); - } - - next: - $i++; -} - -echo "Time used: " . (microtime(true) - $starttime) . ' s'; -exit(min($errorcnt, 1)); diff --git a/.github/workflows/deploy-test.yml b/.github/workflows/deploy-test.yml index ffea1029..b3f327e5 100644 --- a/.github/workflows/deploy-test.yml +++ b/.github/workflows/deploy-test.yml @@ -39,28 +39,3 @@ jobs: run: | cat log.tmp exit $(cat log.tmp | grep -c ^[EW]) - - check_jyutping_cpp: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - run: | - echo "Installing prerequisites..." - sudo apt-get install libboost-all-dev - - run: | - echo "Building checker from source..." 
- export LD_LIBRARY_PATH="/usr/lib/boost/lib" - g++ .ci/checker.cpp -o checker.o -Ofast -I/usr/include/boost -L/usr/lib/boost/lib -lboost_regex - chmod u+x ./checker.o - - run: | - echo "Checking jyut6ping3.dict.yaml" - time ./checker.o jyut6ping3.dict.yaml - - check_jyutping_php: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - run: | - php .ci/verify.php From dc17381c16e227274139c4f5b830b834e086f70e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Wed, 24 Sep 2025 01:25:37 -0400 Subject: [PATCH 2/4] Update(schema): change simplifier priority Addresses #189 --- jyut6ping3.schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jyut6ping3.schema.yaml b/jyut6ping3.schema.yaml index 94a03bd2..b907cca6 100644 --- a/jyut6ping3.schema.yaml +++ b/jyut6ping3.schema.yaml @@ -78,9 +78,9 @@ engine: - table_translator@cangjie5 filters: - simplifier@emoji_cantonese_suggestion + - simplifier - simplifier@variants_hk - simplifier@trad_tw - - simplifier - uniquifier - reverse_lookup_filter@jyut6ping3_reverse_lookup From 23287e6473acba1fa3b5cf832b0942e85221e874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Wed, 24 Sep 2025 08:48:17 -0400 Subject: [PATCH 3/4] Fix(CI): pull request refs Old version always fetches commits from the master branch; fixed it so that it will now install files from the checked out branch (e.g. from a pull request). 
--- .github/workflows/deploy-test.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy-test.yml b/.github/workflows/deploy-test.yml index b3f327e5..3d336803 100644 --- a/.github/workflows/deploy-test.yml +++ b/.github/workflows/deploy-test.yml @@ -22,15 +22,20 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install rime engine run: | sudo apt-get install ibus-rime -y - - name: Install rime-cantonese files + - name: Install rime-cantonese files from checked out branch run: | chmod u+x ./.ci/* export rime_dir=~/.config/ibus/rime - ./.ci/install-schema.sh + curl -fsSL https://git.io/rime-install | bash -s -- :preset emoji CanCLID/rime-loengfan custom:set:config=default,key=installed_from,value=rime-cantonese custom:clear_schema_list custom:add:schema=jyut6ping3 custom:add:schema=cangjie5 custom:add:schema=stroke custom:add:schema=luna_pinyin lotem/rime-octagram-data lotem/rime-octagram-data@hant lotem/rime-octagram-data:customize:schema=jyut6ping3,model=hant + cp ./*.{txt,yaml} $rime_dir + cp ./opencc/* $rime_dir + +# Use cp ./** in lieu of ./.ci/install-schema.sh to ensure that action is running on checked out branch (e.g. 
PR refs) + - name: Compile run: | chmod u+wx ~/.config/ibus/rime/* From 611506b4ea228f71f6142c99ad39f8e88aa13d80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=B6=E7=88=B6=20=28taan=C2=B2=20fu=C2=B2=29?= Date: Wed, 24 Sep 2025 09:07:50 -0400 Subject: [PATCH 4/4] Temporarily reinstate cpp checker for transition --- .ci/checker.cpp | 109 ++++++++++++++++++++++++++++++ .github/workflows/deploy-test.yml | 17 +++++ 2 files changed, 126 insertions(+) create mode 100644 .ci/checker.cpp diff --git a/.ci/checker.cpp b/.ci/checker.cpp new file mode 100644 index 00000000..a3889d47 --- /dev/null +++ b/.ci/checker.cpp @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include +#include +#define JYUT phonemes[0] +#define INITIAL phonemes[1] +#define STEM phonemes[2] +#define FINAL phonemes[3] +#define TONE phonemes[4] +using namespace boost; + +std::string word, line, dict; +smatch phonemes; + +regex expr("(gw?|kw?|ng?|sh?|[bpmfdtlhzcjw])?(aa?|eo?|oe?|[iu]|yu|ng|m)([iumptk]|ng?)?([1-6])"); + +// CHANGE THIS -- a function to check if a jyutping word is valid +bool valid_jyutping(std::string a) { + // Has a jyutping-like structure -- catches most invalid spellings ('ch-', 'eu-', etc) + // If cannot be decomposed into exactly 4 groups, treat as invalid + if (!regex_match(a, phonemes, expr)) return false; + + /* Note: after the first regex_match, the jyutping string will be decomposed into phonemes acc to the following: + phonemes[0] = JYUT = full jyutping string + -------------------------------- + phonemes[1] = INITIAL = initial (empty if none) + phonemes[2] = STEM = stem (aa / a / e / i / o / u / eo / oe / yu / ng* / m*) + phonemes[3] = FINAL = final (empty if none; including -i and -u; thus -aai will be treated as -aa + -i) + phonemes[4] = TONE = tone + + *Only applies to /ng/ and /m/ + */ + + /* ------ ADDITIONAL CHECKING RULES ------ */ + + // eo followed by t / i / n + if (STEM == "eo" && !(FINAL == "t" || FINAL == "i" || FINAL == "n")) return false; + + 
// i / yu must begin with an initial, but allow ik1 and ik6 (hiccups) + if (JYUT == "ik1" || JYUT == "ik6") return true; + else if (INITIAL == "" && (STEM == "i" || STEM == "yu")) return false; + + // u must begin with an initial, except for /ung/ and /uk/ + if (INITIAL == "" && STEM == "u") return (FINAL == "ng" || FINAL == "k"); + + // Add more rules here... + + // catch invalid matches of /m/ and /ng/ + if ((STEM == "m" || STEM == "ng") && (INITIAL != "" && INITIAL != "h")) return false; + if ((STEM == "m" || STEM == "ng") && FINAL != "") return false; + + return true; +} + +//Strips down a jyutping string into words and passes to valid_jyutping() for checking +bool process(std::string a) { + std::istringstream sin(a); + while(sin >> word) { + if (!valid_jyutping(word)) return false; + } + return true; +} + +//Processes file and keeps track of discovered errors +int parse() { + int count_invalid = 0; + int line_no = 0; + + // Discard lines until '...' (i.e. end of header block) + while (true) { + getline(std::cin, line); + line_no++; + if (line == "...") break; + } + + // Real checking + while(getline(std::cin, line)) { + line_no++; + int temp = line.find('\t'); + std::string jyut_string = line.substr(temp+1, line.find('\t', temp + 1) - temp); + if (!process(jyut_string)) { + std::cerr << "[L" << line_no << "] Invalid Jyutping detected: " << jyut_string << std::endl; + count_invalid++; + } + } + + return count_invalid; +} + +// Wrapper for command line interface +int main (int argc, char** argv) { + std::ios_base::sync_with_stdio(false); + std::cin.tie(NULL); + + if (argc == 2) { + dict = argv[1]; + std::cout << "Loading " << dict << std::endl; + } else { + std::cerr << "Invalid input format" << std::endl; + std::cerr << "Use ./checker input_file_name.yaml" << std::endl; + return 1; + } + + std::freopen(dict.c_str(), "r", stdin); // redirect file to stdin stream + return parse(); +} \ No newline at end of file diff --git a/.github/workflows/deploy-test.yml 
b/.github/workflows/deploy-test.yml index 3d336803..83832190 100644 --- a/.github/workflows/deploy-test.yml +++ b/.github/workflows/deploy-test.yml @@ -44,3 +44,20 @@ jobs: run: | cat log.tmp exit $(cat log.tmp | grep -c ^[EW]) + + check_jyutping_cpp: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - run: | + echo "Installing prerequisites..." + sudo apt-get install libboost-all-dev + - run: | + echo "Building checker from source..." + export LD_LIBRARY_PATH="/usr/lib/boost/lib" + g++ .ci/checker.cpp -o checker.o -Ofast -I/usr/include/boost -L/usr/lib/boost/lib -lboost_regex + chmod u+x ./checker.o + - run: | + echo "Checking jyut6ping3.dict.yaml" + time ./checker.o jyut6ping3.dict.yaml \ No newline at end of file