From 2a5eb473b649e4c5c41c0e866757cbc2ef796add Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Fri, 8 Jul 2022 06:43:12 -0700 Subject: [PATCH 1/4] Use Gramambular2 - Keep Engine code in sync with fcitx5-mcbopomofo - Keep associated phrases support to McBopomofoLM - Update KeyHandler and InputState to use Gramambular2 - Remove the composing buffer size restriction - CI settings update to run Gramambular2 tests Fixes #309. --- ...ntinuous-integration-workflow-xcode-12.yml | 10 +- ...uous-integration-workflow-xcode-latest.yml | 10 +- McBopomofo.xcodeproj/project.pbxproj | 48 +- Source/Engine/CMakeLists.txt | 4 +- .../Engine/Gramambular/BlockReadingBuilder.h | 214 ------ Source/Engine/Gramambular/Gramambular.h | 41 -- Source/Engine/Gramambular/GramambularTest.cpp | 238 ------ Source/Engine/Gramambular/Grid.cpp | 74 -- Source/Engine/Gramambular/Grid.h | 281 -------- Source/Engine/Gramambular/KeyValuePair.h | 67 -- Source/Engine/Gramambular/Node.h | 164 ----- Source/Engine/Gramambular/NodeAnchor.h | 72 -- Source/Engine/Gramambular/Span.h | 101 --- Source/Engine/Gramambular/Unigram.h | 99 --- Source/Engine/Gramambular/Walker.h | 173 ----- Source/Engine/Mandarin/CMakeLists.txt | 2 +- Source/Engine/McBopomofoLM.cpp | 48 +- Source/Engine/McBopomofoLM.h | 13 +- Source/Engine/ParselessLM.cpp | 22 +- Source/Engine/ParselessLM.h | 8 +- Source/Engine/ParselessLMTest.cpp | 12 +- Source/Engine/PhraseReplacementMap.cpp | 15 +- Source/Engine/PhraseReplacementMap.h | 11 +- Source/Engine/UserOverrideModel.cpp | 65 +- Source/Engine/UserOverrideModel.h | 6 +- Source/Engine/UserPhrasesLM.cpp | 20 +- Source/Engine/UserPhrasesLM.h | 31 +- Source/Engine/UserPhrasesLMTest.cpp | 6 +- .../.clang-format | 0 .../CMakeLists.txt | 20 +- Source/Engine/gramambular2/README.md | 13 + .../language_model.h} | 45 +- Source/Engine/gramambular2/reading_grid.cpp | 624 ++++++++++++++++ Source/Engine/gramambular2/reading_grid.h | 263 +++++++ .../Engine/gramambular2/reading_grid_test.cpp | 676 
++++++++++++++++++ Source/InputMethodController.swift | 5 - Source/InputState.swift | 3 +- Source/KeyHandler.mm | 490 +++++-------- Source/LanguageModelManager.mm | 6 +- 39 files changed, 1976 insertions(+), 2024 deletions(-) delete mode 100644 Source/Engine/Gramambular/BlockReadingBuilder.h delete mode 100644 Source/Engine/Gramambular/Gramambular.h delete mode 100644 Source/Engine/Gramambular/GramambularTest.cpp delete mode 100644 Source/Engine/Gramambular/Grid.cpp delete mode 100644 Source/Engine/Gramambular/Grid.h delete mode 100644 Source/Engine/Gramambular/KeyValuePair.h delete mode 100644 Source/Engine/Gramambular/Node.h delete mode 100644 Source/Engine/Gramambular/NodeAnchor.h delete mode 100644 Source/Engine/Gramambular/Span.h delete mode 100644 Source/Engine/Gramambular/Unigram.h delete mode 100644 Source/Engine/Gramambular/Walker.h rename Source/Engine/{Gramambular => gramambular2}/.clang-format (100%) rename Source/Engine/{Gramambular => gramambular2}/CMakeLists.txt (52%) create mode 100644 Source/Engine/gramambular2/README.md rename Source/Engine/{Gramambular/LanguageModel.h => gramambular2/language_model.h} (54%) create mode 100644 Source/Engine/gramambular2/reading_grid.cpp create mode 100644 Source/Engine/gramambular2/reading_grid.h create mode 100644 Source/Engine/gramambular2/reading_grid_test.cpp diff --git a/.github/workflows/continuous-integration-workflow-xcode-12.yml b/.github/workflows/continuous-integration-workflow-xcode-12.yml index 876635921..eb3d89d3a 100644 --- a/.github/workflows/continuous-integration-workflow-xcode-12.yml +++ b/.github/workflows/continuous-integration-workflow-xcode-12.yml @@ -24,12 +24,12 @@ jobs: - name: Run MandarinTest run: make runMandarinTest working-directory: Source/Engine/Mandarin/build - - name: Build GramambularTest + - name: Build Gramambular2Test run: cmake -S . 
-B build - working-directory: Source/Engine/Gramambular - - name: Run GramambularTest - run: make runGramambularTest - working-directory: Source/Engine/Gramambular/build + working-directory: Source/Engine/gramambular2 + - name: Run Gramambular2Test + run: make runGramambular2Test + working-directory: Source/Engine/gramambular2/build - name: Test McBopomofo App Bundle run: xcodebuild -scheme McBopomofo -configuration Debug test - name: Test CandidateUI diff --git a/.github/workflows/continuous-integration-workflow-xcode-latest.yml b/.github/workflows/continuous-integration-workflow-xcode-latest.yml index 7cba10e47..4a8a78629 100644 --- a/.github/workflows/continuous-integration-workflow-xcode-latest.yml +++ b/.github/workflows/continuous-integration-workflow-xcode-latest.yml @@ -24,12 +24,12 @@ jobs: - name: Run MandarinTest run: make runMandarinTest working-directory: Source/Engine/Mandarin/build - - name: Build GramambularTest + - name: Build Gramambular2Test run: cmake -S . -B build - working-directory: Source/Engine/Gramambular - - name: Run GramambularTest - run: make runGramambularTest - working-directory: Source/Engine/Gramambular/build + working-directory: Source/Engine/gramambular2 + - name: Run Gramambular2Test + run: make runGramambular2Test + working-directory: Source/Engine/gramambular2/build - name: Test McBopomofo App Bundle run: xcodebuild -scheme McBopomofo -configuration Debug test - name: Test CandidateUI diff --git a/McBopomofo.xcodeproj/project.pbxproj b/McBopomofo.xcodeproj/project.pbxproj index bebae9b2c..c798db3ec 100644 --- a/McBopomofo.xcodeproj/project.pbxproj +++ b/McBopomofo.xcodeproj/project.pbxproj @@ -16,11 +16,11 @@ 6A2E40F6253A69DA00D1AE1D /* Images.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 6A2E40F5253A69DA00D1AE1D /* Images.xcassets */; }; 6A2E40F9253A6AA000D1AE1D /* Images.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 6A2E40F5253A69DA00D1AE1D /* Images.xcassets */; }; 6A38BC1515FC117A00A8A51F /* 
data.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6A38BBF615FC117A00A8A51F /* data.txt */; }; + 6A4F5F982879E838008C4307 /* reading_grid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A4F5F932879E838008C4307 /* reading_grid.cpp */; }; 6A6ED16B2797650A0012872E /* template-phrases-replacement.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6A6ED1632797650A0012872E /* template-phrases-replacement.txt */; }; 6A6ED16C2797650A0012872E /* template-data.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6A6ED1652797650A0012872E /* template-data.txt */; }; 6A6ED16D2797650A0012872E /* template-exclude-phrases-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6A6ED1672797650A0012872E /* template-exclude-phrases-plain-bpmf.txt */; }; 6A6ED16E2797650A0012872E /* template-exclude-phrases.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6A6ED1692797650A0012872E /* template-exclude-phrases.txt */; }; - 6A74B14927C16845001988F4 /* Grid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A74B14827C16845001988F4 /* Grid.cpp */; }; 6ACA41FA15FC1D9000935EF6 /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = 6ACA41EA15FC1D9000935EF6 /* InfoPlist.strings */; }; 6ACA41FB15FC1D9000935EF6 /* License.rtf in Resources */ = {isa = PBXBuildFile; fileRef = 6ACA41EC15FC1D9000935EF6 /* License.rtf */; }; 6ACA41FC15FC1D9000935EF6 /* Localizable.strings in Resources */ = {isa = PBXBuildFile; fileRef = 6ACA41EE15FC1D9000935EF6 /* Localizable.strings */; }; @@ -104,16 +104,6 @@ 6A0D4EF015FC0DA600ABF4B3 /* Bopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "Bopomofo@2x.tiff"; sourceTree = ""; }; 6A0D4EF515FC0DA600ABF4B3 /* McBopomofo-Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = "McBopomofo-Info.plist"; sourceTree = ""; }; 6A0D4EF615FC0DA600ABF4B3 /* McBopomofo-Prefix.pch */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; 
path = "McBopomofo-Prefix.pch"; sourceTree = ""; }; - 6A0D4F1515FC0EB100ABF4B3 /* BlockReadingBuilder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = BlockReadingBuilder.h; sourceTree = ""; }; - 6A0D4F1615FC0EB100ABF4B3 /* Gramambular.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Gramambular.h; sourceTree = ""; }; - 6A0D4F1715FC0EB100ABF4B3 /* Grid.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Grid.h; sourceTree = ""; }; - 6A0D4F1815FC0EB100ABF4B3 /* KeyValuePair.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValuePair.h; sourceTree = ""; }; - 6A0D4F1915FC0EB100ABF4B3 /* LanguageModel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LanguageModel.h; sourceTree = ""; }; - 6A0D4F1A15FC0EB100ABF4B3 /* Node.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Node.h; sourceTree = ""; }; - 6A0D4F1B15FC0EB100ABF4B3 /* NodeAnchor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = NodeAnchor.h; sourceTree = ""; }; - 6A0D4F1C15FC0EB100ABF4B3 /* Span.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Span.h; sourceTree = ""; }; - 6A0D4F1D15FC0EB100ABF4B3 /* Unigram.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Unigram.h; sourceTree = ""; }; - 6A0D4F1E15FC0EB100ABF4B3 /* Walker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Walker.h; sourceTree = ""; }; 6A0D4F2015FC0EB100ABF4B3 /* Mandarin.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Mandarin.cpp; sourceTree = ""; }; 6A0D4F2115FC0EB100ABF4B3 /* Mandarin.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.h; path = Mandarin.h; sourceTree = ""; }; 6A0D4F5615FC0EF900ABF4B3 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "Source/zh-Hant.lproj/preferences.xib"; sourceTree = ""; }; @@ -124,6 +114,9 @@ 6A225A1E23679F2600F685C6 /* NotarizedArchives */ = {isa = PBXFileReference; lastKnownFileType = folder; path = NotarizedArchives; sourceTree = ""; }; 6A2E40F5253A69DA00D1AE1D /* Images.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Images.xcassets; sourceTree = ""; }; 6A38BBF615FC117A00A8A51F /* data.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = data.txt; sourceTree = ""; }; + 6A4F5F912879E838008C4307 /* reading_grid.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = reading_grid.h; sourceTree = ""; }; + 6A4F5F922879E838008C4307 /* language_model.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = language_model.h; sourceTree = ""; }; + 6A4F5F932879E838008C4307 /* reading_grid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = reading_grid.cpp; sourceTree = ""; }; 6A6ED1642797650A0012872E /* Base */ = {isa = PBXFileReference; lastKnownFileType = text; name = Base; path = "Base.lproj/template-phrases-replacement.txt"; sourceTree = ""; }; 6A6ED1662797650A0012872E /* Base */ = {isa = PBXFileReference; lastKnownFileType = text; name = Base; path = "Base.lproj/template-data.txt"; sourceTree = ""; }; 6A6ED1682797650A0012872E /* Base */ = {isa = PBXFileReference; lastKnownFileType = text; name = Base; path = "Base.lproj/template-exclude-phrases-plain-bpmf.txt"; sourceTree = ""; }; @@ -132,7 +125,6 @@ 6A6ED170279765140012872E /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text; name = "zh-Hant"; path = "zh-Hant.lproj/template-exclude-phrases-plain-bpmf.txt"; sourceTree = ""; }; 6A6ED171279765170012872E 
/* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text; name = "zh-Hant"; path = "zh-Hant.lproj/template-exclude-phrases.txt"; sourceTree = ""; }; 6A6ED1722797651A0012872E /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text; name = "zh-Hant"; path = "zh-Hant.lproj/template-phrases-replacement.txt"; sourceTree = ""; }; - 6A74B14827C16845001988F4 /* Grid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Grid.cpp; sourceTree = ""; }; 6A93050C279877FF00D370DA /* McBopomofoInstaller-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "McBopomofoInstaller-Bridging-Header.h"; sourceTree = ""; }; 6ACA41CB15FC1D7500935EF6 /* McBopomofoInstaller.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = McBopomofoInstaller.app; sourceTree = BUILT_PRODUCTS_DIR; }; 6ACA41EB15FC1D9000935EF6 /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/InfoPlist.strings; sourceTree = ""; }; @@ -304,7 +296,7 @@ 6A0D4F1215FC0EB100ABF4B3 /* Engine */ = { isa = PBXGroup; children = ( - 6A0D4F1315FC0EB100ABF4B3 /* Gramambular */, + 6A4F5F8F2879E838008C4307 /* gramambular2 */, 6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */, 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */, 6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */, @@ -326,24 +318,6 @@ path = Engine; sourceTree = ""; }; - 6A0D4F1315FC0EB100ABF4B3 /* Gramambular */ = { - isa = PBXGroup; - children = ( - 6A0D4F1515FC0EB100ABF4B3 /* BlockReadingBuilder.h */, - 6A0D4F1615FC0EB100ABF4B3 /* Gramambular.h */, - 6A74B14827C16845001988F4 /* Grid.cpp */, - 6A0D4F1715FC0EB100ABF4B3 /* Grid.h */, - 6A0D4F1815FC0EB100ABF4B3 /* KeyValuePair.h */, - 6A0D4F1915FC0EB100ABF4B3 /* LanguageModel.h */, - 6A0D4F1A15FC0EB100ABF4B3 /* Node.h */, - 6A0D4F1B15FC0EB100ABF4B3 /* NodeAnchor.h */, - 6A0D4F1C15FC0EB100ABF4B3 /* Span.h */, - 6A0D4F1D15FC0EB100ABF4B3 
/* Unigram.h */, - 6A0D4F1E15FC0EB100ABF4B3 /* Walker.h */, - ); - path = Gramambular; - sourceTree = ""; - }; 6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */ = { isa = PBXGroup; children = ( @@ -380,6 +354,16 @@ path = Data; sourceTree = ""; }; + 6A4F5F8F2879E838008C4307 /* gramambular2 */ = { + isa = PBXGroup; + children = ( + 6A4F5F912879E838008C4307 /* reading_grid.h */, + 6A4F5F922879E838008C4307 /* language_model.h */, + 6A4F5F932879E838008C4307 /* reading_grid.cpp */, + ); + path = gramambular2; + sourceTree = ""; + }; 6A6ED162279764CD0012872E /* Custom Phrase Templates */ = { isa = PBXGroup; children = ( @@ -663,6 +647,7 @@ D4A13D5A27A59F0B003BE359 /* InputMethodController.swift in Sources */, D44FB74527915565003C80A6 /* Preferences.swift in Sources */, D4E569DC27A34D0E00AC2CEF /* KeyHandler.mm in Sources */, + 6A4F5F982879E838008C4307 /* reading_grid.cpp in Sources */, D47F7DD0278C0897002F9DD7 /* NonModalAlertWindowController.swift in Sources */, D456576E279E4F7B00DF6BC9 /* KeyHandlerInput.swift in Sources */, D47F7DCE278BFB57002F9DD7 /* PreferencesWindowController.swift in Sources */, @@ -674,7 +659,6 @@ D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */, 6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */, D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */, - 6A74B14927C16845001988F4 /* Grid.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/Source/Engine/CMakeLists.txt b/Source/Engine/CMakeLists.txt index 9be7697a8..d32d6cb82 100644 --- a/Source/Engine/CMakeLists.txt +++ b/Source/Engine/CMakeLists.txt @@ -1,8 +1,10 @@ -cmake_minimum_required(VERSION 3.17) +cmake_minimum_required(VERSION 3.6) project(McBopomofoLMLib) set(CMAKE_CXX_STANDARD 17) +add_subdirectory(gramambular2) + include_directories("Gramambular") add_library(McBopomofoLMLib KeyValueBlobReader.cpp diff --git a/Source/Engine/Gramambular/BlockReadingBuilder.h b/Source/Engine/Gramambular/BlockReadingBuilder.h deleted file mode 100644 
index 6d4d7e252..000000000 --- a/Source/Engine/Gramambular/BlockReadingBuilder.h +++ /dev/null @@ -1,214 +0,0 @@ -// -// BlockReadingBuilder.h -// -// Copyright (c) 2007-2010 Lukhnos D. Liu (http://lukhnos.org) -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
-// - -#ifndef BLOCKREADINGBUILDER_H_ -#define BLOCKREADINGBUILDER_H_ - -#include -#include - -#include "Grid.h" -#include "LanguageModel.h" - -namespace Formosa { -namespace Gramambular { - -class BlockReadingBuilder { - public: - explicit BlockReadingBuilder(LanguageModel* lm); - void clear(); - - size_t length() const; - size_t cursorIndex() const; - void setCursorIndex(size_t newIndex); - void insertReadingAtCursor(const std::string& reading); - bool deleteReadingBeforeCursor(); // backspace - bool deleteReadingAfterCursor(); // delete - - bool removeHeadReadings(size_t count); - - void setJoinSeparator(const std::string& separator); - const std::string joinSeparator() const; - - std::vector readings() const; - - Grid& grid(); - - protected: - void build(); - - static const std::string Join(std::vector::const_iterator begin, - std::vector::const_iterator end, - const std::string& separator); - - // 最多使用六個字組成一個詞 - static const size_t MaximumBuildSpanLength = 6; - - size_t m_cursorIndex; - std::vector m_readings; - - Grid m_grid; - LanguageModel* m_LM; - std::string m_joinSeparator; -}; - -inline BlockReadingBuilder::BlockReadingBuilder(LanguageModel* lm) - : m_LM(lm), m_cursorIndex(0) {} - -inline void BlockReadingBuilder::clear() { - m_cursorIndex = 0; - m_readings.clear(); - m_grid.clear(); -} - -inline size_t BlockReadingBuilder::length() const { return m_readings.size(); } - -inline size_t BlockReadingBuilder::cursorIndex() const { return m_cursorIndex; } - -inline void BlockReadingBuilder::setCursorIndex(size_t newIndex) { - m_cursorIndex = newIndex > m_readings.size() ? 
m_readings.size() : newIndex; -} - -inline void BlockReadingBuilder::insertReadingAtCursor( - const std::string& reading) { - m_readings.insert(m_readings.begin() + m_cursorIndex, reading); - - m_grid.expandGridByOneAtLocation(m_cursorIndex); - build(); - m_cursorIndex++; -} - -inline std::vector BlockReadingBuilder::readings() const { - return m_readings; -} - -inline bool BlockReadingBuilder::deleteReadingBeforeCursor() { - if (!m_cursorIndex) { - return false; - } - - m_readings.erase(m_readings.begin() + m_cursorIndex - 1, - m_readings.begin() + m_cursorIndex); - m_cursorIndex--; - m_grid.shrinkGridByOneAtLocation(m_cursorIndex); - build(); - return true; -} - -inline bool BlockReadingBuilder::deleteReadingAfterCursor() { - if (m_cursorIndex == m_readings.size()) { - return false; - } - - m_readings.erase(m_readings.begin() + m_cursorIndex, - m_readings.begin() + m_cursorIndex + 1); - m_grid.shrinkGridByOneAtLocation(m_cursorIndex); - build(); - return true; -} - -inline bool BlockReadingBuilder::removeHeadReadings(size_t count) { - if (count > length()) { - return false; - } - - for (size_t i = 0; i < count; i++) { - if (m_cursorIndex) { - m_cursorIndex--; - } - m_readings.erase(m_readings.begin(), m_readings.begin() + 1); - m_grid.shrinkGridByOneAtLocation(0); - build(); - } - - return true; -} - -inline void BlockReadingBuilder::setJoinSeparator( - const std::string& separator) { - m_joinSeparator = separator; -} - -inline const std::string BlockReadingBuilder::joinSeparator() const { - return m_joinSeparator; -} - -inline Grid& BlockReadingBuilder::grid() { return m_grid; } - -inline void BlockReadingBuilder::build() { - if (!m_LM) { - return; - } - - size_t begin = 0; - size_t end = m_cursorIndex + MaximumBuildSpanLength; - - if (m_cursorIndex < MaximumBuildSpanLength) { - begin = 0; - } else { - begin = m_cursorIndex - MaximumBuildSpanLength; - } - - if (end > m_readings.size()) { - end = m_readings.size(); - } - - for (size_t p = begin; p < end; p++) { - 
for (size_t q = 1; q <= MaximumBuildSpanLength && p + q <= end; q++) { - std::string combinedReading = Join( - m_readings.begin() + p, m_readings.begin() + p + q, m_joinSeparator); - if (!m_grid.hasNodeAtLocationSpanningLengthMatchingKey(p, q, - combinedReading)) { - std::vector unigrams = m_LM->unigramsForKey(combinedReading); - - if (unigrams.size() > 0) { - Node n(combinedReading, unigrams); - m_grid.insertNode(n, p, q); - } - } - } - } -} - -inline const std::string BlockReadingBuilder::Join( - std::vector::const_iterator begin, - std::vector::const_iterator end, - const std::string& separator) { - std::string result; - for (std::vector::const_iterator iter = begin; iter != end;) { - result += *iter; - ++iter; - if (iter != end) { - result += separator; - } - } - return result; -} -} // namespace Gramambular -} // namespace Formosa - -#endif diff --git a/Source/Engine/Gramambular/Gramambular.h b/Source/Engine/Gramambular/Gramambular.h deleted file mode 100644 index 7c083a71f..000000000 --- a/Source/Engine/Gramambular/Gramambular.h +++ /dev/null @@ -1,41 +0,0 @@ -// -// Gramambular.h -// -// Copyright (c) 2007-2010 Lukhnos D. Liu (http://lukhnos.org) -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// - -#ifndef GRAMAMBULAR_H_ -#define GRAMAMBULAR_H_ - -#include "BlockReadingBuilder.h" -#include "Grid.h" -#include "KeyValuePair.h" -#include "LanguageModel.h" -#include "Node.h" -#include "NodeAnchor.h" -#include "Span.h" -#include "Unigram.h" -#include "Walker.h" - -#endif diff --git a/Source/Engine/Gramambular/GramambularTest.cpp b/Source/Engine/Gramambular/GramambularTest.cpp deleted file mode 100644 index 57fd8d2aa..000000000 --- a/Source/Engine/Gramambular/GramambularTest.cpp +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright (c) 2022 and onwards Lukhnos Liu -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include -#include -#include -#include -#include - -#include "Gramambular.h" -#include "gtest/gtest.h" - -namespace Formosa { -namespace Gramambular { - -const char* SampleData = R"( -# -# The sample is from libtabe (http://sourceforge.net/projects/libtabe/) -# last updated in 2002. The project was originally initiated by -# Pai-Hsiang Hsiao in 1999. -# -# Libtabe is a frequency table of Taiwanese Mandarin words. The database -# itself is, according to the tar file, released under the BSD License. -# - -ㄙ 絲 -9.495858 -ㄙ 思 -9.006414 -ㄙ 私 -99.000000 -ㄙ 斯 -8.091803 -ㄙ 司 -99.000000 -ㄙ 嘶 -13.513987 -ㄙ 撕 -12.259095 -ㄍㄠ 高 -7.171551 -ㄎㄜ 顆 -10.574273 -ㄎㄜ 棵 -11.504072 -ㄎㄜ 刻 -10.450457 -ㄎㄜ 科 -7.171052 -ㄎㄜ 柯 -99.000000 -ㄍㄠ 膏 -11.928720 -ㄍㄠ 篙 -13.624335 -ㄍㄠ 糕 -12.390804 -ㄉㄜ˙ 的 -3.516024 -ㄉㄧˊ 的 -3.516024 -ㄉㄧˋ 的 -3.516024 -ㄓㄨㄥ 中 -5.809297 -ㄉㄜ˙ 得 -7.427179 -ㄍㄨㄥ 共 -8.381971 -ㄍㄨㄥ 供 -8.501463 -ㄐㄧˋ 既 -99.000000 -ㄐㄧㄣ 今 -8.034095 -ㄍㄨㄥ 紅 -8.858181 -ㄐㄧˋ 際 -7.608341 -ㄐㄧˋ 季 -99.000000 -ㄐㄧㄣ 金 -7.290109 -ㄐㄧˋ 騎 -10.939895 -ㄓㄨㄥ 終 -99.000000 -ㄐㄧˋ 記 -99.000000 -ㄐㄧˋ 寄 -99.000000 -ㄐㄧㄣ 斤 -99.000000 -ㄐㄧˋ 繼 -9.715317 -ㄐㄧˋ 計 -7.926683 -ㄐㄧˋ 暨 -8.373022 -ㄓㄨㄥ 鐘 -9.877580 -ㄐㄧㄣ 禁 -10.711079 -ㄍㄨㄥ 公 -7.877973 -ㄍㄨㄥ 工 -7.822167 -ㄍㄨㄥ 攻 -99.000000 -ㄍㄨㄥ 功 -99.000000 -ㄍㄨㄥ 宮 -99.000000 -ㄓㄨㄥ 鍾 -9.685671 -ㄐㄧˋ 繫 -10.425662 -ㄍㄨㄥ 弓 -99.000000 -ㄍㄨㄥ 恭 -99.000000 -ㄐㄧˋ 劑 -8.888722 -ㄐㄧˋ 祭 -10.204425 -ㄐㄧㄣ 浸 -11.378321 -ㄓㄨㄥ 盅 -99.000000 -ㄐㄧˋ 忌 -99.000000 -ㄐㄧˋ 技 -8.450826 -ㄐㄧㄣ 筋 -11.074890 -ㄍㄨㄥ 躬 -99.000000 -ㄐㄧˋ 冀 -12.045357 -ㄓㄨㄥ 忠 -99.000000 -ㄐㄧˋ 妓 -99.000000 -ㄐㄧˋ 濟 -9.517568 -ㄐㄧˋ 薊 -12.021587 -ㄐㄧㄣ 巾 -99.000000 -ㄐㄧㄣ 襟 -12.784206 -ㄋㄧㄢˊ 年 -6.086515 -ㄐㄧㄤˇ 講 -9.164384 -ㄐㄧㄤˇ 獎 -8.690941 -ㄐㄧㄤˇ 蔣 -10.127828 -ㄋㄧㄢˊ 黏 -11.336864 -ㄋㄧㄢˊ 粘 -11.285740 -ㄐㄧㄤˇ 槳 
-12.492933 -ㄍㄨㄥㄙ 公司 -6.299461 -ㄎㄜㄐㄧˋ 科技 -6.736613 -ㄐㄧˋㄍㄨㄥ 濟公 -13.336653 -ㄐㄧㄤˇㄐㄧㄣ 獎金 -10.344678 -ㄋㄧㄢˊㄓㄨㄥ 年終 -11.668947 -ㄋㄧㄢˊㄓㄨㄥ 年中 -11.373044 -ㄍㄠㄎㄜㄐㄧˋ 高科技 -9.842421 -)"; - -class SimpleLM : public LanguageModel { - public: - SimpleLM(const char* input, bool swapKeyValue = false) { - std::stringstream sstream(input); - while (sstream.good()) { - std::string line; - getline(sstream, line); - - if (!line.size() || (line.size() && line[0] == '#')) { - continue; - } - - std::stringstream linestream(line); - std::string col0; - std::string col1; - std::string col2; - linestream >> col0; - linestream >> col1; - linestream >> col2; - - Unigram u; - - if (swapKeyValue) { - u.keyValue.key = col1; - u.keyValue.value = col0; - } else { - u.keyValue.key = col0; - u.keyValue.value = col1; - } - - u.score = atof(col2.c_str()); - - m_db[u.keyValue.key].push_back(u); - } - } - - const std::vector unigramsForKey(const std::string& key) override { - std::map>::const_iterator f = - m_db.find(key); - return f == m_db.end() ? 
std::vector() : (*f).second; - } - - bool hasUnigramsForKey(const std::string& key) override { - std::map>::const_iterator f = - m_db.find(key); - return f != m_db.end(); - } - - protected: - std::map> m_db; -}; - -TEST(GramambularTest, InputTest) { - SimpleLM lm(SampleData); - - BlockReadingBuilder builder(&lm); - builder.insertReadingAtCursor("ㄍㄠ"); - builder.insertReadingAtCursor("ㄐㄧˋ"); - builder.setCursorIndex(1); - builder.insertReadingAtCursor("ㄎㄜ"); - builder.setCursorIndex(0); - builder.deleteReadingAfterCursor(); - builder.insertReadingAtCursor("ㄍㄠ"); - builder.setCursorIndex(builder.length()); - builder.insertReadingAtCursor("ㄍㄨㄥ"); - builder.insertReadingAtCursor("ㄙ"); - builder.insertReadingAtCursor("ㄉㄜ˙"); - builder.insertReadingAtCursor("ㄋㄧㄢˊ"); - builder.insertReadingAtCursor("ㄓㄨㄥ"); - builder.insertReadingAtCursor("ㄐㄧㄤˇ"); - builder.insertReadingAtCursor("ㄐㄧㄣ"); - - Walker walker(&builder.grid()); - - std::vector walked = walker.walk(0, 0.0); - - std::vector composed; - for (std::vector::iterator wi = walked.begin(); - wi != walked.end(); ++wi) { - composed.push_back((*wi).node->currentKeyValue().value); - } - ASSERT_EQ(composed, - (std::vector{"高科技", "公司", "的", "年中", "獎金"})); -} - -TEST(GramambularTest, WordSegmentationTest) { - SimpleLM lm2(SampleData, true); - BlockReadingBuilder builder2(&lm2); - builder2.insertReadingAtCursor("高"); - builder2.insertReadingAtCursor("科"); - builder2.insertReadingAtCursor("技"); - builder2.insertReadingAtCursor("公"); - builder2.insertReadingAtCursor("司"); - builder2.insertReadingAtCursor("的"); - builder2.insertReadingAtCursor("年"); - builder2.insertReadingAtCursor("終"); - builder2.insertReadingAtCursor("獎"); - builder2.insertReadingAtCursor("金"); - Walker walker2(&builder2.grid()); - - std::vector walked = walker2.walk(0, 0.0); - - std::vector segmented; - for (std::vector::iterator wi = walked.begin(); - wi != walked.end(); ++wi) { - segmented.push_back((*wi).node->currentKeyValue().key); - } - 
ASSERT_EQ(segmented, - (std::vector{"高科技", "公司", "的", "年終", "獎金"})); -} - -} // namespace Gramambular -} // namespace Formosa diff --git a/Source/Engine/Gramambular/Grid.cpp b/Source/Engine/Gramambular/Grid.cpp deleted file mode 100644 index 550111757..000000000 --- a/Source/Engine/Gramambular/Grid.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2007 and onwards Lukhnos Liu -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
- -#include "Grid.h" - -#include -#include - -namespace Formosa { -namespace Gramambular { - -std::string Grid::dumpDOT() { - std::stringstream sst; - sst << "digraph {" << std::endl; - sst << "graph [ rankdir=LR ];" << std::endl; - sst << "BOS;" << std::endl; - - for (size_t p = 0; p < m_spans.size(); p++) { - Span& span = m_spans[p]; - for (size_t ni = 0; ni <= span.maximumLength(); ni++) { - Node* np = span.nodeOfLength(ni); - if (np) { - if (!p) { - sst << "BOS -> " << np->currentKeyValue().value << ";" << std::endl; - } - - sst << np->currentKeyValue().value << ";" << std::endl; - - if (p + ni < m_spans.size()) { - Span& dstSpan = m_spans[p + ni]; - for (size_t q = 0; q <= dstSpan.maximumLength(); q++) { - Node* dn = dstSpan.nodeOfLength(q); - if (dn) { - sst << np->currentKeyValue().value << " -> " - << dn->currentKeyValue().value << ";" << std::endl; - } - } - } - - if (p + ni == m_spans.size()) { - sst << np->currentKeyValue().value << " -> " - << "EOS;" << std::endl; - } - } - } - } - - sst << "EOS;" << std::endl; - sst << "}"; - return sst.str(); -} - -} // namespace Gramambular -} // namespace Formosa diff --git a/Source/Engine/Gramambular/Grid.h b/Source/Engine/Gramambular/Grid.h deleted file mode 100644 index 0a960854f..000000000 --- a/Source/Engine/Gramambular/Grid.h +++ /dev/null @@ -1,281 +0,0 @@ -// -// Grid.h -// -// Copyright (c) 2007-2010 Lukhnos D. 
Liu (http://lukhnos.org) -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// - -#ifndef GRID_H_ -#define GRID_H_ - -#include -#include -#include - -#include "NodeAnchor.h" -#include "Span.h" - -namespace Formosa { -namespace Gramambular { - -class Grid { - public: - void clear(); - void insertNode(const Node& node, size_t location, size_t spanningLength); - bool hasNodeAtLocationSpanningLengthMatchingKey(size_t location, - size_t spanningLength, - const std::string& key); - - void expandGridByOneAtLocation(size_t location); - void shrinkGridByOneAtLocation(size_t location); - - size_t width() const; - std::vector nodesAt(size_t location); - std::vector nodesCrossingOrEndingAt(size_t location); - std::vector nodesInRange(size_t begin, size_t end); - - // "Freeze" the node with the unigram that represents the selected candidate - // value. 
After this, the node that contains the unigram will always be - // evaluated to that unigram, while all other overlapping nodes will be reset - // to their initial state (that is, if any of those nodes were "frozen" or - // fixed, they will be unfrozen.) - NodeAnchor fixNodeSelectedCandidate(size_t location, - const std::string& value); - - // Similar to fixNodeSelectedCandidate, but instead of "freezing" the node, - // only boost the unigram that represents the value with an overriding score. - // This has the same side effect as fixNodeSelectedCandidate, which is that - // all other overlapping nodes will be reset to their initial state. - void overrideNodeScoreForSelectedCandidate(size_t location, - const std::string& value, - float overridingScore); - - std::string dumpDOT(); - - protected: - std::vector m_spans; -}; - -inline void Grid::clear() { m_spans.clear(); } - -inline void Grid::insertNode(const Node& node, size_t location, - size_t spanningLength) { - if (location >= m_spans.size()) { - size_t diff = location - m_spans.size() + 1; - - for (size_t i = 0; i < diff; i++) { - m_spans.push_back(Span()); - } - } - - m_spans[location].insertNodeOfLength(node, spanningLength); -} - -inline bool Grid::hasNodeAtLocationSpanningLengthMatchingKey( - size_t location, size_t spanningLength, const std::string& key) { - if (location > m_spans.size()) { - return false; - } - - const Node* n = m_spans[location].nodeOfLength(spanningLength); - if (!n) { - return false; - } - - return key == n->key(); -} - -inline void Grid::expandGridByOneAtLocation(size_t location) { - if (!location || location == m_spans.size()) { - m_spans.insert(m_spans.begin() + location, Span()); - } else { - m_spans.insert(m_spans.begin() + location, Span()); - for (size_t i = 0; i < location; i++) { - // zaps overlapping spans - m_spans[i].removeNodeOfLengthGreaterThan(location - i); - } - } -} - -inline void Grid::shrinkGridByOneAtLocation(size_t location) { - if (location >= m_spans.size()) { - 
return; - } - - m_spans.erase(m_spans.begin() + location); - for (size_t i = 0; i < location; i++) { - // zaps overlapping spans - m_spans[i].removeNodeOfLengthGreaterThan(location - i); - } -} - -inline size_t Grid::width() const { return m_spans.size(); } - -inline std::vector Grid::nodesAt(size_t location) { - std::vector result; - - size_t spanSize = m_spans.size(); - if (m_spans.size() && location < spanSize) { - Span& span = m_spans[location]; - - for (size_t i = 1; i <= 6; i++) { - Node* np = span.nodeOfLength(i); - if (np) { - NodeAnchor na; - na.node = np; - na.location = location; - na.spanningLength = i; - result.push_back(na); - } - } - } - - return result; -}; - -inline std::vector Grid::nodesCrossingOrEndingAt(size_t location) { - std::vector result; - - if (m_spans.size() && location <= m_spans.size()) { - for (size_t i = 0; i < location; i++) { - Span& span = m_spans[i]; - - if (i + span.maximumLength() >= location) { - for (size_t j = 1, m = span.maximumLength(); j <= m; j++) { - if (i + j < location) { - continue; - } - - Node* np = span.nodeOfLength(j); - if (np) { - NodeAnchor na; - na.node = np; - na.location = i; - na.spanningLength = location - i; - - result.push_back(na); - } - } - } - } - } - - return result; -} - -inline std::vector Grid::nodesInRange(size_t begin, size_t end) { - std::vector result; - - if (m_spans.size() && end <= m_spans.size()) { - for (size_t i = 0; i < end; i++) { - Span& span = m_spans[i]; - - if (i + span.maximumLength() > begin) { - for (size_t j = 1, m = span.maximumLength(); j <= m; j++) { - if (i + j <= begin) { - continue; - } - - Node* np = span.nodeOfLength(j); - if (np) { - NodeAnchor na; - na.node = np; - na.location = i; - na.spanningLength = j; - - result.push_back(na); - } - } - } - } - } - - return result; -} - -// For nodes found at the location, fix their currently-selected candidate -// using the supplied string value. 
-inline NodeAnchor Grid::fixNodeSelectedCandidate(size_t location, - const std::string& value) { - std::vector nodes = nodesCrossingOrEndingAt(location); - NodeAnchor node; - size_t selectedIndex; - for (auto nodeAnchor : nodes) { - auto candidates = nodeAnchor.node->candidates(); - - for (size_t i = 0, c = candidates.size(); i < c; ++i) { - if (candidates[i].value == value) { - selectedIndex = i; - node = nodeAnchor; - break; - } - } - } - - if (node.node == nullptr) { - return node; - } - - nodes = nodesInRange(location - node.spanningLength, location); - for (auto nodeAnchor : nodes) { - const_cast(nodeAnchor.node)->resetCandidate(); - } - - const_cast(node.node)->selectCandidateAtIndex(selectedIndex); - return node; -} - -inline void Grid::overrideNodeScoreForSelectedCandidate( - size_t location, const std::string& value, float overridingScore) { - std::vector nodes = nodesCrossingOrEndingAt(location); - NodeAnchor node; - size_t selectedIndex; - for (auto nodeAnchor : nodes) { - auto candidates = nodeAnchor.node->candidates(); - - for (size_t i = 0, c = candidates.size(); i < c; ++i) { - if (candidates[i].value == value) { - selectedIndex = i; - node = nodeAnchor; - break; - } - } - } - - if (node.node == nullptr) { - return; - } - - nodes = nodesInRange(location - node.spanningLength, location); - for (auto nodeAnchor : nodes) { - const_cast(nodeAnchor.node)->resetCandidate(); - } - - const_cast(node.node)->selectFloatingCandidateAtIndex(selectedIndex, - overridingScore); -} - -} // namespace Gramambular -} // namespace Formosa - -#endif diff --git a/Source/Engine/Gramambular/KeyValuePair.h b/Source/Engine/Gramambular/KeyValuePair.h deleted file mode 100644 index ba33668a7..000000000 --- a/Source/Engine/Gramambular/KeyValuePair.h +++ /dev/null @@ -1,67 +0,0 @@ -// -// KeyValuePair.h -// -// Copyright (c) 2007-2010 Lukhnos D. 
Liu (http://lukhnos.org) -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
-// - -#ifndef KEYVALUEPAIR_H_ -#define KEYVALUEPAIR_H_ - -#include -#include - -namespace Formosa { -namespace Gramambular { - -class KeyValuePair { - public: - std::string key; - std::string value; - - bool operator==(const KeyValuePair& another) const; - bool operator<(const KeyValuePair& another) const; -}; - -inline std::ostream& operator<<(std::ostream& stream, - const KeyValuePair& pair) { - stream << "(" << pair.key << "," << pair.value << ")"; - return stream; -} - -inline bool KeyValuePair::operator==(const KeyValuePair& another) const { - return key == another.key && value == another.value; -} - -inline bool KeyValuePair::operator<(const KeyValuePair& another) const { - if (key < another.key) { - return true; - } else if (key == another.key) { - return value < another.value; - } - return false; -} -} // namespace Gramambular -} // namespace Formosa - -#endif diff --git a/Source/Engine/Gramambular/Node.h b/Source/Engine/Gramambular/Node.h deleted file mode 100644 index 96c8fce1f..000000000 --- a/Source/Engine/Gramambular/Node.h +++ /dev/null @@ -1,164 +0,0 @@ -// -// Node.h -// -// Copyright (c) 2007-2010 Lukhnos D. Liu (http://lukhnos.org) -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// - -#ifndef NODE_H_ -#define NODE_H_ - -#include -#include -#include -#include - -#include "LanguageModel.h" - -namespace Formosa { -namespace Gramambular { - -constexpr int kSelectedCandidateScore = 99; - -class Node { - public: - Node(); - Node(const std::string& key, const std::vector& unigrams); - - void primeNodeWithPreceedingKeyValues( - const std::vector& keyValues); - - const std::vector& candidates() const; - void selectCandidateAtIndex(size_t index = 0); - void resetCandidate(); - void selectFloatingCandidateAtIndex(size_t index, double score); - - const std::string& key() const; - double score() const; - double scoreForCandidate(const std::string& candidate) const; - const KeyValuePair currentKeyValue() const; - double highestUnigramScore() const; - - protected: - std::string m_key; - double m_score; - - std::vector m_unigrams; - std::vector m_candidates; - std::map m_valueUnigramIndexMap; - size_t m_selectedUnigramIndex; - - friend std::ostream& operator<<(std::ostream& stream, const Node& node); -}; - -inline std::ostream& operator<<(std::ostream& stream, const Node& node) { - stream << "(node,key:" << node.m_key - << ",selected:" << node.m_selectedUnigramIndex << "," - << node.m_unigrams << ")"; - return stream; -} - -inline Node::Node() : m_selectedUnigramIndex(0), m_score(0.0) {} - -inline Node::Node(const std::string& key, const std::vector& unigrams) - : m_key(key), - m_unigrams(unigrams), - m_selectedUnigramIndex(0), - m_score(0.0) { - stable_sort(m_unigrams.begin(), m_unigrams.end(), Unigram::ScoreCompare); - - if (m_unigrams.size()) { - m_score = m_unigrams[0].score; - } - - size_t i = 0; - for (std::vector::const_iterator ui = m_unigrams.begin(); - ui != 
m_unigrams.end(); ++ui) { - m_valueUnigramIndexMap[(*ui).keyValue.value] = i; - i++; - - m_candidates.push_back((*ui).keyValue); - } -} - -inline const std::vector& Node::candidates() const { - return m_candidates; -} - -inline void Node::selectCandidateAtIndex(size_t index) { - if (index >= m_unigrams.size()) { - m_selectedUnigramIndex = 0; - } else { - m_selectedUnigramIndex = index; - } - - m_score = kSelectedCandidateScore; -} - -inline void Node::resetCandidate() { - m_selectedUnigramIndex = 0; - if (m_unigrams.size()) { - m_score = m_unigrams[0].score; - } -} - -inline void Node::selectFloatingCandidateAtIndex(size_t index, double score) { - if (index >= m_unigrams.size()) { - m_selectedUnigramIndex = 0; - } else { - m_selectedUnigramIndex = index; - } - m_score = score; -} - -inline const std::string& Node::key() const { return m_key; } - -inline double Node::score() const { return m_score; } - -inline double Node::scoreForCandidate(const std::string& candidate) const { - for (auto unigram : m_unigrams) { - if (unigram.keyValue.value == candidate) { - return unigram.score; - } - } - return 0.0; -} - -inline double Node::highestUnigramScore() const { - if (m_unigrams.empty()) { - return 0.0; - } - return m_unigrams[0].score; -} - -inline const KeyValuePair Node::currentKeyValue() const { - if (m_selectedUnigramIndex >= m_unigrams.size()) { - return KeyValuePair(); - } else { - return m_candidates[m_selectedUnigramIndex]; - } -} -} // namespace Gramambular -} // namespace Formosa - -#endif diff --git a/Source/Engine/Gramambular/NodeAnchor.h b/Source/Engine/Gramambular/NodeAnchor.h deleted file mode 100644 index 3f81b4c28..000000000 --- a/Source/Engine/Gramambular/NodeAnchor.h +++ /dev/null @@ -1,72 +0,0 @@ -// -// NodeAnchor.h -// -// Copyright (c) 2007-2010 Lukhnos D. 
Liu (http://lukhnos.org) -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
-// - -#ifndef NODEANCHOR_H_ -#define NODEANCHOR_H_ - -#include - -#include "Node.h" - -namespace Formosa { -namespace Gramambular { - -struct NodeAnchor { - const Node* node = nullptr; - size_t location = 0; - size_t spanningLength = 0; - double accumulatedScore = 0.0; -}; - -inline std::ostream& operator<<(std::ostream& stream, - const NodeAnchor& anchor) { - stream << "{@(" << anchor.location << "," << anchor.spanningLength << "),"; - if (anchor.node) { - stream << *(anchor.node); - } else { - stream << "null"; - } - stream << "}"; - return stream; -} - -inline std::ostream& operator<<(std::ostream& stream, - const std::vector& anchor) { - for (std::vector::const_iterator i = anchor.begin(); - i != anchor.end(); ++i) { - stream << *i; - if (i + 1 != anchor.end()) { - stream << "<-"; - } - } - - return stream; -} -} // namespace Gramambular -} // namespace Formosa - -#endif diff --git a/Source/Engine/Gramambular/Span.h b/Source/Engine/Gramambular/Span.h deleted file mode 100644 index aa1cf38e7..000000000 --- a/Source/Engine/Gramambular/Span.h +++ /dev/null @@ -1,101 +0,0 @@ -// -// Span.h -// -// Copyright (c) 2007-2010 Lukhnos D. Liu (http://lukhnos.org) -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// - -#ifndef SPAN_H_ -#define SPAN_H_ - -#include -#include -#include - -#include "Node.h" - -namespace Formosa { -namespace Gramambular { -class Span { - public: - void clear(); - void insertNodeOfLength(const Node& node, size_t length); - void removeNodeOfLengthGreaterThan(size_t length); - - Node* nodeOfLength(size_t length); - size_t maximumLength() const; - - protected: - std::map m_lengthNodeMap; - size_t m_maximumLength = 0; -}; - -inline void Span::clear() { - m_lengthNodeMap.clear(); - m_maximumLength = 0; -} - -inline void Span::insertNodeOfLength(const Node& node, size_t length) { - m_lengthNodeMap[length] = node; - if (length > m_maximumLength) { - m_maximumLength = length; - } -} - -inline void Span::removeNodeOfLengthGreaterThan(size_t length) { - if (length > m_maximumLength) { - return; - } - - size_t max = 0; - std::set removeSet; - for (std::map::iterator i = m_lengthNodeMap.begin(), - e = m_lengthNodeMap.end(); - i != e; ++i) { - if ((*i).first > length) { - removeSet.insert((*i).first); - } else { - if ((*i).first > max) { - max = (*i).first; - } - } - } - - for (std::set::iterator i = removeSet.begin(), e = removeSet.end(); - i != e; ++i) { - m_lengthNodeMap.erase(*i); - } - - m_maximumLength = max; -} - -inline Node* Span::nodeOfLength(size_t length) { - std::map::iterator f = m_lengthNodeMap.find(length); - return f == m_lengthNodeMap.end() ? 
0 : &(*f).second; -} - -inline size_t Span::maximumLength() const { return m_maximumLength; } -} // namespace Gramambular -} // namespace Formosa - -#endif diff --git a/Source/Engine/Gramambular/Unigram.h b/Source/Engine/Gramambular/Unigram.h deleted file mode 100644 index 6cd546b3d..000000000 --- a/Source/Engine/Gramambular/Unigram.h +++ /dev/null @@ -1,99 +0,0 @@ -// -// Unigram.h -// -// Copyright (c) 2007-2010 Lukhnos D. Liu (http://lukhnos.org) -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
-// - -#ifndef UNIGRAM_H_ -#define UNIGRAM_H_ - -#include - -#include "KeyValuePair.h" - -namespace Formosa { -namespace Gramambular { - -class Unigram { - public: - Unigram(); - - KeyValuePair keyValue; - double score; - - bool operator==(const Unigram& another) const; - bool operator<(const Unigram& another) const; - - static bool ScoreCompare(const Unigram& a, const Unigram& b); -}; - -inline std::ostream& operator<<(std::ostream& stream, const Unigram& gram) { - std::streamsize p = stream.precision(); - stream.precision(6); - stream << "(" << gram.keyValue << "," << gram.score << ")"; - stream.precision(p); - return stream; -} - -inline std::ostream& operator<<(std::ostream& stream, - const std::vector& grams) { - stream << "[" << grams.size() << "]=>{"; - - size_t index = 0; - - for (std::vector::const_iterator gi = grams.begin(); - gi != grams.end(); ++gi, ++index) { - stream << index << "=>"; - stream << *gi; - if (gi + 1 != grams.end()) { - stream << ","; - } - } - - stream << "}"; - return stream; -} - -inline Unigram::Unigram() : score(0.0) {} - -inline bool Unigram::operator==(const Unigram& another) const { - return keyValue == another.keyValue && score == another.score; -} - -inline bool Unigram::operator<(const Unigram& another) const { - if (keyValue < another.keyValue) { - return true; - } else if (keyValue == another.keyValue) { - return score < another.score; - } - return false; -} - -inline bool Unigram::ScoreCompare(const Unigram& a, const Unigram& b) { - return a.score > b.score; -} -} // namespace Gramambular -} // namespace Formosa - -#endif diff --git a/Source/Engine/Gramambular/Walker.h b/Source/Engine/Gramambular/Walker.h deleted file mode 100644 index a1cd887b3..000000000 --- a/Source/Engine/Gramambular/Walker.h +++ /dev/null @@ -1,173 +0,0 @@ -// -// Walker.h -// -// Copyright (c) 2007-2010 Lukhnos D. 
Liu (http://lukhnos.org) -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
-// - -#ifndef WALKER_H_ -#define WALKER_H_ - -#include -#include - -#include "Grid.h" - -namespace Formosa { -namespace Gramambular { - -constexpr int kDroppedPathScore = -999; - -class Walker { - public: - explicit Walker(Grid* inGrid); - const std::vector walk( - size_t location = 0, double accumulatedScore = 0.0, - std::string joinedPhrase = "", - std::vector longPhrases = std::vector()); - - protected: - Grid* m_grid; -}; - -inline Walker::Walker(Grid* inGrid) : m_grid(inGrid) {} - -inline const std::vector Walker::walk( - size_t location, double accumulatedScore, std::string joinedPhrase, - std::vector longPhrases) { - if (location >= m_grid->width()) { - return std::vector(); - } - - std::vector> paths; - - std::vector nodes = m_grid->nodesAt(location); - - stable_sort(nodes.begin(), nodes.end(), - [](const Formosa::Gramambular::NodeAnchor& a, - const Formosa::Gramambular::NodeAnchor& b) { - return a.node->score() > b.node->score(); - }); - - if (nodes[0].node->score() >= kSelectedCandidateScore) { - // If the user ever chosen a candidate on a node, we should only use the - // path based on the selected candidate and ignore other paths. - auto node = nodes[0]; - - node.accumulatedScore = accumulatedScore + node.node->score(); - std::vector path = - walk(location + node.spanningLength, (node).accumulatedScore); - path.insert(path.begin(), node); - paths.push_back(path); - } else if (longPhrases.size() > 0) { - std::vector path; - - for (std::vector::iterator ni = nodes.begin(); - ni != nodes.end(); ++ni) { - if (!ni->node) { - continue; - } - std::string joinedValue = joinedPhrase; - joinedValue.insert(joinedValue.size(), ni->node->currentKeyValue().value); - // If some nodes with only a character composed a result as a long phrase, - // we just give up the path and give it a really low score. 
- // - // For example, in a sentence "我這樣覺得", we have a longer phrase - // 覺得, and we found there is another path may ends with "覺" and - // "得", we just ignore the path since finally "我/這樣/覺得" and - // "我/這/樣/覺/得" are exactly the same for the users. - if (std::find(longPhrases.begin(), longPhrases.end(), joinedValue) != - longPhrases.end()) { - ni->accumulatedScore = kDroppedPathScore; - path.insert(path.begin(), *ni); - paths.push_back(path); - continue; - } - - ni->accumulatedScore = accumulatedScore + ni->node->score(); - if (joinedValue.size() >= longPhrases[0].size()) { - path = walk(location + ni->spanningLength, ni->accumulatedScore, "", - std::vector()); - } else { - path = walk(location + ni->spanningLength, ni->accumulatedScore, - joinedValue, longPhrases); - } - path.insert(path.begin(), *ni); - paths.push_back(path); - } - } else { - // Let's see if we have longer phrases in the position in the grid. - std::vector newLongPhrases; - for (std::vector::iterator ni = nodes.begin(); - ni != nodes.end(); ++ni) { - if (!ni->node) { - continue; - } - if (ni->spanningLength > 1) { - newLongPhrases.push_back(ni->node->currentKeyValue().value); - } - } - - stable_sort( - longPhrases.begin(), longPhrases.end(), - [](std::string a, std::string b) { return a.size() > b.size(); }); - - for (std::vector::iterator ni = nodes.begin(); - ni != nodes.end(); ++ni) { - if (!ni->node) { - continue; - } - - ni->accumulatedScore = accumulatedScore + ni->node->score(); - std::vector path; - - if (ni->spanningLength > 1) { - path = walk(location + ni->spanningLength, ni->accumulatedScore, "", - std::vector()); - } else { - path = walk(location + 1, ni->accumulatedScore, - ni->node->currentKeyValue().value, newLongPhrases); - } - path.insert(path.begin(), *ni); - paths.push_back(path); - } - } - - if (!paths.size()) { - return std::vector(); - } - - std::vector* result = &*(paths.begin()); - for (std::vector>::iterator pi = paths.begin(); - pi != paths.end(); ++pi) { - if 
(pi->back().accumulatedScore > result->back().accumulatedScore) { - result = &*pi; - } - } - - return *result; -}; -} // namespace Gramambular -} // namespace Formosa - -#endif diff --git a/Source/Engine/Mandarin/CMakeLists.txt b/Source/Engine/Mandarin/CMakeLists.txt index bc65a850b..e635a0c4b 100644 --- a/Source/Engine/Mandarin/CMakeLists.txt +++ b/Source/Engine/Mandarin/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.17) +cmake_minimum_required(VERSION 3.6) project(Mandarin) set(CMAKE_CXX_STANDARD 17) diff --git a/Source/Engine/McBopomofoLM.cpp b/Source/Engine/McBopomofoLM.cpp index 5f5d2e70d..79d1bc23f 100644 --- a/Source/Engine/McBopomofoLM.cpp +++ b/Source/Engine/McBopomofoLM.cpp @@ -87,38 +87,34 @@ void McBopomofoLM::loadPhraseReplacementMap(const char* phraseReplacementPath) } } -const std::vector McBopomofoLM::unigramsForKey(const std::string& key) +std::vector McBopomofoLM::getUnigrams(const std::string& key) { if (key == " ") { - std::vector spaceUnigrams; - Formosa::Gramambular::Unigram g; - g.keyValue.key = " "; - g.keyValue.value = " "; - g.score = 0; - spaceUnigrams.push_back(g); + std::vector spaceUnigrams; + spaceUnigrams.emplace_back(" ", 0); return spaceUnigrams; } - std::vector allUnigrams; - std::vector userUnigrams; + std::vector allUnigrams; + std::vector userUnigrams; std::unordered_set excludedValues; std::unordered_set insertedValues; - if (m_excludedPhrases.hasUnigramsForKey(key)) { - std::vector excludedUnigrams = m_excludedPhrases.unigramsForKey(key); + if (m_excludedPhrases.hasUnigrams(key)) { + std::vector excludedUnigrams = m_excludedPhrases.getUnigrams(key); transform(excludedUnigrams.begin(), excludedUnigrams.end(), inserter(excludedValues, excludedValues.end()), - [](const Formosa::Gramambular::Unigram& u) { return u.keyValue.value; }); + [](const Formosa::Gramambular2::LanguageModel::Unigram& u) { return u.value(); }); } - if (m_userPhrases.hasUnigramsForKey(key)) { - std::vector rawUserUnigrams = 
m_userPhrases.unigramsForKey(key); + if (m_userPhrases.hasUnigrams(key)) { + std::vector rawUserUnigrams = m_userPhrases.getUnigrams(key); userUnigrams = filterAndTransformUnigrams(rawUserUnigrams, excludedValues, insertedValues); } - if (m_languageModel.hasUnigramsForKey(key)) { - std::vector rawGlobalUnigrams = m_languageModel.unigramsForKey(key); + if (m_languageModel.hasUnigrams(key)) { + std::vector rawGlobalUnigrams = m_languageModel.getUnigrams(key); allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues); } @@ -126,17 +122,17 @@ const std::vector McBopomofoLM::unigramsForKey(co return allUnigrams; } -bool McBopomofoLM::hasUnigramsForKey(const std::string& key) +bool McBopomofoLM::hasUnigrams(const std::string& key) { if (key == " ") { return true; } - if (!m_excludedPhrases.hasUnigramsForKey(key)) { - return m_userPhrases.hasUnigramsForKey(key) || m_languageModel.hasUnigramsForKey(key); + if (!m_excludedPhrases.hasUnigrams(key)) { + return m_userPhrases.hasUnigrams(key) || m_languageModel.hasUnigrams(key); } - return unigramsForKey(key).size() > 0; + return getUnigrams(key).size() > 0; } void McBopomofoLM::setPhraseReplacementEnabled(bool enabled) @@ -164,14 +160,14 @@ void McBopomofoLM::setExternalConverter(std::function m_externalConverter = externalConverter; } -const std::vector McBopomofoLM::filterAndTransformUnigrams(const std::vector unigrams, const std::unordered_set& excludedValues, std::unordered_set& insertedValues) +std::vector McBopomofoLM::filterAndTransformUnigrams(const std::vector unigrams, const std::unordered_set& excludedValues, std::unordered_set& insertedValues) { - std::vector results; + std::vector results; for (auto&& unigram : unigrams) { // excludedValues filters out the unigrams with the original value. 
// insertedValues filters out the ones with the converted value - std::string originalValue = unigram.keyValue.value; + std::string originalValue = unigram.value(); if (excludedValues.find(originalValue) != excludedValues.end()) { continue; } @@ -188,11 +184,7 @@ const std::vector McBopomofoLM::filterAndTransfor value = replacement; } if (insertedValues.find(value) == insertedValues.end()) { - Formosa::Gramambular::Unigram g; - g.keyValue.value = value; - g.keyValue.key = unigram.keyValue.key; - g.score = unigram.score; - results.push_back(g); + results.emplace_back(value, unigram.score()); insertedValues.insert(value); } } diff --git a/Source/Engine/McBopomofoLM.h b/Source/Engine/McBopomofoLM.h index 2be05fbd9..6e0566ff2 100644 --- a/Source/Engine/McBopomofoLM.h +++ b/Source/Engine/McBopomofoLM.h @@ -28,6 +28,8 @@ #include "ParselessLM.h" #include "PhraseReplacementMap.h" #include "UserPhrasesLM.h" +#include "gramambular2/language_model.h" +#include #include #include @@ -55,10 +57,10 @@ namespace McBopomofo { /// model while launching and to load the user phrases anytime if the custom /// files are modified. It does not keep the reference of the data pathes but /// you have to pass the paths when you ask it to do loading. -class McBopomofoLM : public Formosa::Gramambular::LanguageModel { +class McBopomofoLM : public Formosa::Gramambular2::LanguageModel { public: McBopomofoLM(); - ~McBopomofoLM(); + ~McBopomofoLM() override; /// Asks to load the primary language model at the given path. /// @param languageModelPath The path of the language model. @@ -79,13 +81,14 @@ class McBopomofoLM : public Formosa::Gramambular::LanguageModel { /// Asks to load th phrase replacement table at the given path. /// @param phraseReplacementPath The path of the phrase replacement table. void loadPhraseReplacementMap(const char* phraseReplacementPath); + /// Returns a list of available unigram for the given key. /// @param key A string represents the BPMF reading or a symbol key. 
For /// example, it you pass "ㄇㄚ", it returns "嗎", "媽", and so on. - const std::vector unigramsForKey(const std::string& key); + std::vector getUnigrams(const std::string& key) override; /// If the model has unigrams for the given key. /// @param key The key. - bool hasUnigramsForKey(const std::string& key); + bool hasUnigrams(const std::string& key) override; /// Enables or disables phrase replacement. void setPhraseReplacementEnabled(bool enabled); @@ -110,7 +113,7 @@ class McBopomofoLM : public Formosa::Gramambular::LanguageModel { /// @param insertedValues The values for unigrams already in the results. /// It helps to prevent duplicated unigrams. Please note that the method /// has a side effect that it inserts values to `insertedValues`. - const std::vector filterAndTransformUnigrams(const std::vector unigrams, + std::vector filterAndTransformUnigrams(const std::vector unigrams, const std::unordered_set& excludedValues, std::unordered_set& insertedValues); diff --git a/Source/Engine/ParselessLM.cpp b/Source/Engine/ParselessLM.cpp index 352e6940e..b0f0f8f14 100644 --- a/Source/Engine/ParselessLM.cpp +++ b/Source/Engine/ParselessLM.cpp @@ -24,6 +24,7 @@ #include "ParselessLM.h" #include +#include #include #include #include @@ -85,16 +86,17 @@ void McBopomofo::ParselessLM::close() } } -const std::vector -McBopomofo::ParselessLM::unigramsForKey(const std::string& key) +std::vector +McBopomofo::ParselessLM::getUnigrams(const std::string& key) { if (db_ == nullptr) { - return std::vector(); + return std::vector(); } - std::vector results; + std::vector results; for (const auto& row : db_->findRows(key + " ")) { - Formosa::Gramambular::Unigram unigram; + std::string value; + double score = 0; // Move ahead until we encounter the first space. This is the key. 
auto it = row.begin(); @@ -102,7 +104,7 @@ McBopomofo::ParselessLM::unigramsForKey(const std::string& key) ++it; } - unigram.keyValue.key = std::string(row.begin(), it); + // The key is std::string(row.begin(), it), which we don't need. // Read past the space. if (it != row.end()) { @@ -118,7 +120,7 @@ McBopomofo::ParselessLM::unigramsForKey(const std::string& key) while (it != row.end() && *it != ' ') { ++it; } - unigram.keyValue.value = std::string(value_begin, it); + value = std::string(value_begin, it); } // Read past the space. The remainder, if it exists, is the score. @@ -127,14 +129,14 @@ McBopomofo::ParselessLM::unigramsForKey(const std::string& key) } if (it != row.end()) { - unigram.score = std::stod(std::string(it, row.end())); + score = std::stod(std::string(it, row.end())); } - results.push_back(unigram); + results.emplace_back(std::move(value), score); } return results; } -bool McBopomofo::ParselessLM::hasUnigramsForKey(const std::string& key) +bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key) { if (db_ == nullptr) { return false; diff --git a/Source/Engine/ParselessLM.h b/Source/Engine/ParselessLM.h index 17ec7c729..8ac27c792 100644 --- a/Source/Engine/ParselessLM.h +++ b/Source/Engine/ParselessLM.h @@ -28,12 +28,12 @@ #include #include -#include "LanguageModel.h" #include "ParselessPhraseDB.h" +#include "gramambular2/language_model.h" namespace McBopomofo { -class ParselessLM : public Formosa::Gramambular::LanguageModel { +class ParselessLM : public Formosa::Gramambular2::LanguageModel { public: ~ParselessLM() override; @@ -41,9 +41,9 @@ class ParselessLM : public Formosa::Gramambular::LanguageModel { bool open(const std::string_view& path); void close(); - const std::vector unigramsForKey( + std::vector getUnigrams( const std::string& key) override; - bool hasUnigramsForKey(const std::string& key) override; + bool hasUnigrams(const std::string& key) override; private: int fd_ = -1; diff --git a/Source/Engine/ParselessLMTest.cpp 
b/Source/Engine/ParselessLMTest.cpp index e43bbc82c..d73aaf696 100644 --- a/Source/Engine/ParselessLMTest.cpp +++ b/Source/Engine/ParselessLMTest.cpp @@ -40,17 +40,17 @@ TEST(ParselessLMTest, SanityCheckTest) bool status = lm.open(data_path); ASSERT_TRUE(status); - ASSERT_TRUE(lm.hasUnigramsForKey("ㄕ")); - ASSERT_TRUE(lm.hasUnigramsForKey("ㄕˋ-ㄕˊ")); - ASSERT_TRUE(lm.hasUnigramsForKey("_punctuation_list")); + ASSERT_TRUE(lm.hasUnigrams("ㄕ")); + ASSERT_TRUE(lm.hasUnigrams("ㄕˋ-ㄕˊ")); + ASSERT_TRUE(lm.hasUnigrams("_punctuation_list")); - auto unigrams = lm.unigramsForKey("ㄕ"); + auto unigrams = lm.getUnigrams("ㄕ"); ASSERT_GT(unigrams.size(), 0); - unigrams = lm.unigramsForKey("ㄕˋ-ㄕˊ"); + unigrams = lm.getUnigrams("ㄕˋ-ㄕˊ"); ASSERT_GT(unigrams.size(), 0); - unigrams = lm.unigramsForKey("_punctuation_list"); + unigrams = lm.getUnigrams("_punctuation_list"); ASSERT_GT(unigrams.size(), 0); lm.close(); diff --git a/Source/Engine/PhraseReplacementMap.cpp b/Source/Engine/PhraseReplacementMap.cpp index ed5831fb2..a74e07ff1 100644 --- a/Source/Engine/PhraseReplacementMap.cpp +++ b/Source/Engine/PhraseReplacementMap.cpp @@ -1,9 +1,9 @@ #include "PhraseReplacementMap.h" -#include -#include #include #include +#include +#include #include #include "KeyValueBlobReader.h" @@ -13,9 +13,9 @@ namespace McBopomofo { using std::string; PhraseReplacementMap::PhraseReplacementMap() -: fd(-1) -, data(0) -, length(0) + : fd(-1) + , data(0) + , length(0) { } @@ -26,7 +26,7 @@ PhraseReplacementMap::~PhraseReplacementMap() } } -bool PhraseReplacementMap::open(const char *path) +bool PhraseReplacementMap::open(const char* path) { if (data) { return false; @@ -77,10 +77,9 @@ const std::string PhraseReplacementMap::valueForKey(const std::string& key) auto iter = keyValueMap.find(key); if (iter != keyValueMap.end()) { const std::string_view v = iter->second; - return {v.data(), v.size()}; + return { v.data(), v.size() }; } return string(""); } - } diff --git a/Source/Engine/PhraseReplacementMap.h 
b/Source/Engine/PhraseReplacementMap.h index 6a53a5bc4..4b54d8682 100644 --- a/Source/Engine/PhraseReplacementMap.h +++ b/Source/Engine/PhraseReplacementMap.h @@ -24,26 +24,25 @@ #ifndef PHRASEREPLACEMENTMAP_H #define PHRASEREPLACEMENTMAP_H -#include -#include #include +#include +#include namespace McBopomofo { -class PhraseReplacementMap -{ +class PhraseReplacementMap { public: PhraseReplacementMap(); ~PhraseReplacementMap(); - bool open(const char *path); + bool open(const char* path); void close(); const std::string valueForKey(const std::string& key); protected: std::map keyValueMap; int fd; - void *data; + void* data; size_t length; }; diff --git a/Source/Engine/UserOverrideModel.cpp b/Source/Engine/UserOverrideModel.cpp index 69344957d..855480124 100644 --- a/Source/Engine/UserOverrideModel.cpp +++ b/Source/Engine/UserOverrideModel.cpp @@ -27,6 +27,7 @@ #include "UserOverrideModel.h" +#include "gramambular2/reading_grid.h" #include #include #include @@ -41,8 +42,7 @@ static double Score(size_t eventCount, double eventTimestamp, double timestamp, double lambda); -static bool IsEndingPunctuation(const std::string& value); -static std::string WalkedNodesToKey(const std::vector& walkedNodes, +static std::string WalkedNodesToKey(const std::vector& walkedNodes, size_t cursorIndex); UserOverrideModel::UserOverrideModel(size_t capacity, double decayConstant) @@ -52,12 +52,16 @@ UserOverrideModel::UserOverrideModel(size_t capacity, double decayConstant) m_decayExponent = log(0.5) / decayConstant; } -void UserOverrideModel::observe(const std::vector& walkedNodes, +void UserOverrideModel::observe(const std::vector& walkedNodes, size_t cursorIndex, const std::string& candidate, double timestamp) { std::string key = WalkedNodesToKey(walkedNodes, cursorIndex); + if (key.empty()) { + return; + } + auto mapIter = m_lruMap.find(key); if (mapIter == m_lruMap.end()) { auto keyValuePair = KeyObservationPair(key, Observation()); @@ -86,11 +90,15 @@ void 
UserOverrideModel::observe(const std::vector& walkedNodes, +std::string UserOverrideModel::suggest(const std::vector& walkedNodes, size_t cursorIndex, double timestamp) { std::string key = WalkedNodesToKey(walkedNodes, cursorIndex); + if (key.empty()) { + return std::string(); + } + auto mapIter = m_lruMap.find(key); if (mapIter == m_lruMap.end()) { return std::string(); @@ -147,48 +155,51 @@ static double Score(size_t eventCount, return prob * decay; } -static bool IsEndingPunctuation(const std::string& value) +static bool IsPunctuation(const Formosa::Gramambular2::ReadingGrid::NodePtr node) { - return value == "," || value == "。" || value == "!" || value == "?" || value == "」" || value == "』" || value == "”" || value == "”"; + const std::string& reading = node->reading(); + return !reading.empty() && reading[0] == '_'; } -static std::string WalkedNodesToKey(const std::vector& walkedNodes, + +static std::string WalkedNodesToKey(const std::vector& walkedNodes, size_t cursorIndex) { std::stringstream s; - std::vector n; + std::vector n; size_t ll = 0; - for (std::vector::const_iterator i = walkedNodes.begin(); - i != walkedNodes.end(); - ++i) { - const auto& nn = *i; - n.push_back(nn); - ll += nn.spanningLength; - if (ll >= cursorIndex) { + for (auto i = walkedNodes.cbegin(); i != walkedNodes.cend(); ++i) { + n.push_back(*i); + ll += (*i)->spanningLength(); + if (ll > cursorIndex) { break; } } - std::vector::const_reverse_iterator r = n.rbegin(); + auto r = n.crbegin(); + if (r == n.crend()) { + return ""; + } - if (r == n.rend()) { + if ((*r)->unigrams().empty()) { return ""; } - std::string current = (*r).node->currentKeyValue().key; + std::string current = (*r)->unigrams()[0].value(); + ++r; s.clear(); s.str(std::string()); - if (r != n.rend()) { - std::string value = (*r).node->currentKeyValue().value; - if (IsEndingPunctuation(value)) { + if (r != n.crend()) { + if (IsPunctuation(*r)) { + // Ignore punctuation. 
s << "()"; r = n.rend(); } else { s << "(" - << (*r).node->currentKeyValue().key + << (*r)->reading() << "," - << value + << (*r)->value() << ")"; ++r; } @@ -200,15 +211,15 @@ static std::string WalkedNodesToKey(const std::vectorcurrentKeyValue().value; - if (IsEndingPunctuation(value)) { + if (IsPunctuation(*r)) { + // Ignore punctuation. s << "()"; r = n.rend(); } else { s << "(" - << (*r).node->currentKeyValue().key + << (*r)->reading() << "," - << value + << (*r)->value() << ")"; ++r; } diff --git a/Source/Engine/UserOverrideModel.h b/Source/Engine/UserOverrideModel.h index 710b83fff..425169976 100644 --- a/Source/Engine/UserOverrideModel.h +++ b/Source/Engine/UserOverrideModel.h @@ -28,7 +28,7 @@ #include #include -#include "Gramambular.h" +#include "gramambular2/reading_grid.h" namespace McBopomofo { @@ -36,12 +36,12 @@ class UserOverrideModel { public: UserOverrideModel(size_t capacity, double decayConstant); - void observe(const std::vector& walkedNodes, + void observe(const std::vector& walkedNodes, size_t cursorIndex, const std::string& candidate, double timestamp); - std::string suggest(const std::vector& walkedNodes, + std::string suggest(const std::vector& walkedNodes, size_t cursorIndex, double timestamp); diff --git a/Source/Engine/UserPhrasesLM.cpp b/Source/Engine/UserPhrasesLM.cpp index c01d1d081..37292212b 100644 --- a/Source/Engine/UserPhrasesLM.cpp +++ b/Source/Engine/UserPhrasesLM.cpp @@ -23,10 +23,10 @@ #include "UserPhrasesLM.h" -#include -#include #include #include +#include +#include #include #include "KeyValueBlobReader.h" @@ -55,7 +55,7 @@ bool UserPhrasesLM::isLoaded() return false; } -bool UserPhrasesLM::open(const char *path) +bool UserPhrasesLM::open(const char* path) { if (data) { return false; @@ -112,27 +112,23 @@ void UserPhrasesLM::dump() } } -const std::vector UserPhrasesLM::unigramsForKey(const std::string& key) +std::vector UserPhrasesLM::getUnigrams(const std::string& key) { - std::vector v; + std::vector v; auto iter = 
keyRowMap.find(key); if (iter != keyRowMap.end()) { const std::vector& rows = iter->second; for (const auto& row : rows) { - Formosa::Gramambular::Unigram g; - g.keyValue.key = row.key; - g.keyValue.value = row.value; - g.score = 0.0; - v.push_back(g); + v.emplace_back(std::string(row.value), 0); } } return v; } -bool UserPhrasesLM::hasUnigramsForKey(const std::string& key) +bool UserPhrasesLM::hasUnigrams(const std::string& key) { return keyRowMap.find(key) != keyRowMap.end(); } -}; // namespace McBopomofo +}; // namespace McBopomofo diff --git a/Source/Engine/UserPhrasesLM.h b/Source/Engine/UserPhrasesLM.h index 9d0822e6e..507a3bd5b 100644 --- a/Source/Engine/UserPhrasesLM.h +++ b/Source/Engine/UserPhrasesLM.h @@ -24,37 +24,40 @@ #ifndef USERPHRASESLM_H #define USERPHRASESLM_H -#include -#include +#include "gramambular2/language_model.h" #include -#include "LanguageModel.h" +#include +#include namespace McBopomofo { -class UserPhrasesLM : public Formosa::Gramambular::LanguageModel -{ +class UserPhrasesLM : public Formosa::Gramambular2::LanguageModel { public: UserPhrasesLM(); - ~UserPhrasesLM(); + ~UserPhrasesLM() override; bool isLoaded(); - bool open(const char *path); + bool open(const char* path); void close(); void dump(); - - virtual const std::vector unigramsForKey(const std::string& key); - virtual bool hasUnigramsForKey(const std::string& key); - + + std::vector getUnigrams(const std::string& key) override; + bool hasUnigrams(const std::string& key) override; + protected: struct Row { - Row(std::string_view& k, std::string_view& v) : key(k), value(v) {} + Row(std::string_view& k, std::string_view& v) + : key(k) + , value(v) + { + } std::string_view key; std::string_view value; }; - + std::map> keyRowMap; int fd; - void *data; + void* data; size_t length; }; diff --git a/Source/Engine/UserPhrasesLMTest.cpp b/Source/Engine/UserPhrasesLMTest.cpp index eda396ccf..9450bd932 100644 --- a/Source/Engine/UserPhrasesLMTest.cpp +++ 
b/Source/Engine/UserPhrasesLMTest.cpp @@ -46,11 +46,11 @@ TEST(UserPhreasesLMTest, LenientReading) UserPhrasesLM lm; lm.open(tmp_name.c_str()); - ASSERT_TRUE(lm.hasUnigramsForKey("reading1")); - ASSERT_FALSE(lm.hasUnigramsForKey("value2")); + ASSERT_TRUE(lm.hasUnigrams("reading1")); + ASSERT_FALSE(lm.hasUnigrams("value2")); // Anything after the error won't be parsed, so reading2 won't be found. - ASSERT_FALSE(lm.hasUnigramsForKey("reading2")); + ASSERT_FALSE(lm.hasUnigrams("reading2")); r = remove(tmp_name.c_str()); ASSERT_EQ(r, 0); diff --git a/Source/Engine/Gramambular/.clang-format b/Source/Engine/gramambular2/.clang-format similarity index 100% rename from Source/Engine/Gramambular/.clang-format rename to Source/Engine/gramambular2/.clang-format diff --git a/Source/Engine/Gramambular/CMakeLists.txt b/Source/Engine/gramambular2/CMakeLists.txt similarity index 52% rename from Source/Engine/Gramambular/CMakeLists.txt rename to Source/Engine/gramambular2/CMakeLists.txt index c6d2ca696..99d147e6e 100644 --- a/Source/Engine/Gramambular/CMakeLists.txt +++ b/Source/Engine/gramambular2/CMakeLists.txt @@ -1,9 +1,10 @@ -cmake_minimum_required(VERSION 3.17) -project(Gramambular) +cmake_minimum_required(VERSION 3.6) +project(gramambular2) set(CMAKE_CXX_STANDARD 17) +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") -add_library(GramambularLib BlockReadingBuilder.h Gramambular.h Grid.h Grid.cpp KeyValuePair.h LanguageModel.h Node.h NodeAnchor.h Span.h Unigram.h Walker.h) +add_library(gramambular2_lib language_model.h reading_grid.h reading_grid.cpp) # Let CMake fetch Google Test for us. # https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project @@ -19,13 +20,14 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FetchContent_MakeAvailable(googletest) # Test target declarations. 
-add_executable(GramambularTest GramambularTest.cpp) -target_link_libraries(GramambularTest gtest_main GramambularLib) +add_executable(gramambular2_test reading_grid_test.cpp) +target_include_directories(gramambular2_test PRIVATE "${GMOCK_INCLUDE_DIRS}" "${GTEST_INCLUDE_DIRS}") +target_link_libraries(gramambular2_test gtest_main gramambular2_lib) include(GoogleTest) -gtest_discover_tests(GramambularTest) +gtest_discover_tests(gramambular2_test) add_custom_target( - runGramambularTest - COMMAND ${CMAKE_CURRENT_BINARY_DIR}/GramambularTest + runGramambular2Test + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/gramambular2_test ) -add_dependencies(runGramambularTest GramambularTest) +add_dependencies(runGramambular2Test gramambular2_test) diff --git a/Source/Engine/gramambular2/README.md b/Source/Engine/gramambular2/README.md new file mode 100644 index 000000000..1cb4fee1f --- /dev/null +++ b/Source/Engine/gramambular2/README.md @@ -0,0 +1,13 @@ +# Gramambular2 + +This is the new version of Gramambular ("gram walk"), a segmentation library +mainly designed for Mandarin Chinese. The library can also be used to +implement input methods, and the many utility methods in the public API +actually reflect that design intent. + +The basic principle is a hidden Markov model, with the input (observations) +being Chinese characters and the output (hidden events) being the possible +groups (segmentations). When used for an input method, the input can be +a series of Bopomofo syllables, and the output will be the most likely +Chinese characters. The actual computation uses a naive Bayes classifier, +and the required language model is a very simple unigram model.
diff --git a/Source/Engine/Gramambular/LanguageModel.h b/Source/Engine/gramambular2/language_model.h similarity index 54% rename from Source/Engine/Gramambular/LanguageModel.h rename to Source/Engine/gramambular2/language_model.h index f6ec96550..f35ab2ddb 100644 --- a/Source/Engine/Gramambular/LanguageModel.h +++ b/Source/Engine/gramambular2/language_model.h @@ -1,7 +1,4 @@ -// -// LanguageModel.h -// -// Copyright (c) 2007-2010 Lukhnos D. Liu (http://lukhnos.org) +// Copyright (c) 2022 and onwards Lukhnos Liu. // // Permission is hereby granted, free of charge, to any person // obtaining a copy of this software and associated documentation @@ -23,27 +20,43 @@ // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. -// -#ifndef LANGUAGEMODEL_H_ -#define LANGUAGEMODEL_H_ +#ifndef LANGUAGE_MODEL_H_ +#define LANGUAGE_MODEL_H_ #include +#include #include -#include "Unigram.h" - -namespace Formosa { -namespace Gramambular { +namespace Formosa::Gramambular2 { +// Represents an n-gram model. For our purposes, only unigrams are used. class LanguageModel { public: - virtual ~LanguageModel() {} + class Unigram; + + virtual ~LanguageModel() = default; - virtual const std::vector unigramsForKey(const std::string& key) = 0; - virtual bool hasUnigramsForKey(const std::string& key) = 0; + // Returns unigrams matching the reading, or an empty vector if none is found. + virtual std::vector getUnigrams(const std::string& reading) = 0; + virtual bool hasUnigrams(const std::string& reading) = 0; + + // An immutable unigram with an actual value, along with a score, which is + // usually a log probability from a language model. 
+ class Unigram { + public: + explicit Unigram(std::string val = "", double sc = 0) + : value_(std::move(val)), score_(sc) {} + + [[nodiscard]] const std::string& value() const { return value_; } + [[nodiscard]] double score() const { return score_; } + + private: + std::string value_; + double score_; + }; }; -} // namespace Gramambular -} // namespace Formosa + +} // namespace Formosa::Gramambular2 #endif diff --git a/Source/Engine/gramambular2/reading_grid.cpp b/Source/Engine/gramambular2/reading_grid.cpp new file mode 100644 index 000000000..d2f363eff --- /dev/null +++ b/Source/Engine/gramambular2/reading_grid.cpp @@ -0,0 +1,624 @@ +// Copyright (c) 2022 and onwards Lukhnos Liu. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "reading_grid.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace Formosa::Gramambular2 { + +void ReadingGrid::clear() { + cursor_ = 0; + readings_.clear(); + spans_.clear(); +} + +void ReadingGrid::setCursor(size_t cursor) { + assert(cursor <= readings_.size()); + cursor_ = cursor; +} + +void ReadingGrid::setReadingSeparator(const std::string& separator) { + separator_ = separator; +} + +bool ReadingGrid::insertReading(const std::string& reading) { + if (reading.empty() || reading == separator_) { + return false; + } + + if (!lm_.hasUnigrams(reading)) { + return false; + } + + readings_.insert(readings_.begin() + static_cast(cursor_), + reading); + expandGridAt(cursor_); + update(); + + // Cursor must only move after update(). + ++cursor_; + return true; +} + +bool ReadingGrid::deleteReadingBeforeCursor() { + if (!cursor_) { + return false; + } + + readings_.erase(readings_.begin() + static_cast(cursor_ - 1), + readings_.begin() + static_cast(cursor_)); + // Cursor must decrement for grid-shrinking and update to work. + --cursor_; + shrinkGridAt(cursor_); + update(); + return true; +} + +bool ReadingGrid::deleteReadingAfterCursor() { + if (cursor_ == readings_.size()) { + return false; + } + + readings_.erase(readings_.begin() + static_cast(cursor_), + readings_.begin() + static_cast(cursor_ + 1)); + shrinkGridAt(cursor_); + update(); + return true; +} + +namespace { + +// Defines a vertex of a DAG. This is a mutable data structure used for both +// DAG construction and single-source shortest-path computation. +struct Vertex { + explicit Vertex(ReadingGrid::NodePtr nodePtr) : node(std::move(nodePtr)) { + edges.reserve(ReadingGrid::kMaximumSpanLength); + } + ReadingGrid::NodePtr node; + std::vector edges; + + // Used during topological-sort. + bool topologicallySorted = false; + + // Used during shortest-path computation. 
We are actually computing the + // path with the *largest* weight, hence distance's initial value being + // negative infinity. If we were to compute the *shortest* weight/distance, + // we would have initialized this to infinity. + double distance = -std::numeric_limits::infinity(); + Vertex* prev = nullptr; +}; + +// Cormen et al. 2001 explains the historical origin of the term "relax." +void Relax(Vertex* u, Vertex* v) { + // The distance from u to v, denoted w, is simply v's score. + double w = v->node->score(); + + // Since we are computing the largest weight, we update v's distance and prev + // if the current distance to v is *less* than that of u's plus the distance + // to v (which is represented by w). + if (v->distance < u->distance + w) { + v->distance = u->distance + w; + v->prev = u; + } +} + +using VertexSpan = std::vector; + +// Topological-sorts a DAG that has a single root and returns the vertices in +// topological order. Here, a non-recursive version is implemented using our own +// stack and state definitions, so that we are not constrained by the current +// thread's stack size. This is equivalent to this recursive version: +// +// void TopologicalSort(Vertex* v) { +// for (Vertex* nv : v->edges) { +// if (!nv->topologicallySorted) { +// TopologicalSort(nv); +// } +// } +// v->topologicallySorted = true; +// result.push_back(v); +// } +// +// The recursive version is similar to the TOPOLOGICAL-SORT algorithm found in +// Cormen et al. 2001.
+std::vector TopologicalSort(Vertex* root) { + std::vector result; + struct State { + explicit State(Vertex* vv) : v(vv), edgeIter(v->edges.begin()) {} + Vertex* v; + std::vector::iterator edgeIter; + }; + std::stack stack; + stack.emplace(root); + + while (!stack.empty()) { + State& state = stack.top(); + Vertex* v = state.v; + + if (state.edgeIter != v->edges.end()) { + Vertex* nv = *state.edgeIter; + ++state.edgeIter; + if (!nv->topologicallySorted) { + stack.emplace(nv); + continue; + } + } + + v->topologicallySorted = true; + result.push_back(v); + stack.pop(); + } + + return result; +} + +int64_t GetEpochNowInMicroseconds() { + auto now = std::chrono::system_clock::now(); + int64_t timestamp = + std::chrono::time_point_cast(now) + .time_since_epoch() + .count(); + return timestamp; +} + +} // namespace + +// Find the weightiest path in the grid graph. The path represents the most +// likely hidden chain of events from the observations. We use the +// DAG-SHORTEST-PATHS algorithm in Cormen et al. 2001 to compute such path. +// Instead of computing the path with the shortest distance, though, we compute +// the path with the longest distance (so the weightiest), since with log +// probability a larger value means a larger probability. The algorithm runs in +// O(|V| + |E|) time for G = (V, E) where G is a DAG. This means the walk is +// fairly economical even when the grid is large. 
+ReadingGrid::WalkResult ReadingGrid::walk() { + WalkResult result; + if (spans_.empty()) { + return result; + } + int64_t start = GetEpochNowInMicroseconds(); + + std::vector vspans(spans_.size(), VertexSpan()); + size_t vertices = 0; + size_t edges = 0; + for (size_t i = 0, len = spans_.size(); i < len; ++i) { + const ReadingGrid::Span span = spans_[i]; + for (size_t j = 1, maxSpanLen = span.maxLength(); j <= maxSpanLen; ++j) { + NodePtr p = span.nodeOf(j); + if (p != nullptr) { + vspans[i].emplace_back(std::move(p)); + ++vertices; + } + } + } + result.vertices = vertices; + + Vertex terminal(std::make_shared( + "_TERMINAL_", 0, std::vector())); + for (size_t i = 0, vspansLen = vspans.size(); i < vspansLen; ++i) { + for (Vertex& v : vspans[i]) { + size_t nextVertexPos = i + v.node->spanningLength(); + if (nextVertexPos == vspansLen) { + v.edges.push_back(&terminal); + continue; + } + + for (Vertex& nv : vspans[nextVertexPos]) { + v.edges.push_back(&nv); + ++edges; + } + } + } + result.edges = edges; + + Vertex root(std::make_shared( + "_ROOT_", 0, std::vector())); + root.distance = 0; + for (Vertex& v : vspans[0]) { + root.edges.push_back(&v); + } + + std::vector ordered = TopologicalSort(&root); + for (auto it = ordered.rbegin(), rend = ordered.rend(); it != rend; ++it) { + Vertex* u = *it; + for (Vertex* v : u->edges) { + Relax(u, v); + } + } + + std::vector walked; + size_t totalReadingLen = 0; + Vertex* it = &terminal; + while (it->prev != nullptr) { + walked.push_back(it->prev->node); + it = it->prev; + totalReadingLen += it->node->spanningLength(); + } + + assert(totalReadingLen == readings_.size()); + assert(walked.size() >= 2); + result.nodes = std::vector(walked.rbegin() + 1, walked.rend()); + result.elapsedMicroseconds = GetEpochNowInMicroseconds() - start; + return result; +} + +std::vector ReadingGrid::candidatesAt(size_t loc) { + std::vector result; + if (readings_.empty()) { + return result; + } + + if (loc > readings_.size()) { + return result; + } 
+ + std::vector nodes = + overlappingNodesAt(loc == readings_.size() ? loc - 1 : loc); + + // Sort nodes by reading length. + std::stable_sort( + nodes.begin(), nodes.end(), [](const auto& n1, const auto& n2) { + return n1.node->spanningLength() > n2.node->spanningLength(); + }); + + for (const NodeInSpan& nodeInSpan : nodes) { + for (const LanguageModel::Unigram& unigram : nodeInSpan.node->unigrams()) { + result.emplace_back(nodeInSpan.node->reading(), unigram.value()); + } + } + return result; +} + +bool ReadingGrid::overrideCandidate( + size_t loc, const ReadingGrid::Candidate& candidate, + ReadingGrid::Node::OverrideType overrideType) { + return overrideCandidate(loc, &candidate.reading, candidate.value, + overrideType); +} + +bool ReadingGrid::overrideCandidate( + size_t loc, const std::string& candidate, + ReadingGrid::Node::OverrideType overrideType) { + return overrideCandidate(loc, nullptr, candidate, overrideType); +} + +void ReadingGrid::expandGridAt(size_t loc) { + if (!loc || loc == spans_.size()) { + spans_.insert(spans_.begin() + static_cast(loc), Span()); + return; + } + spans_.insert(spans_.begin() + static_cast(loc), Span()); + removeAffectedNodes(loc); +} + +void ReadingGrid::shrinkGridAt(size_t loc) { + if (loc == spans_.size()) { + return; + } + spans_.erase(spans_.begin() + static_cast(loc)); + removeAffectedNodes(loc); +} + +void ReadingGrid::removeAffectedNodes(size_t loc) { + // Because of the expansion, certain spans now have "broken" nodes. We need + // to remove those. 
For example, before: + // + // Span index 0 1 2 3 + // (---) + // (-------) + // (-----------) + // + // After we've inserted a span at 2: + // + // Span index 0 1 2 3 4 + // (---) + // (---- ----) + // (-------- ----) + // + // Similarly for shrinkage, before: + // + // Span index 0 1 2 3 + // (---) + // (-------) + // (-----------) + // + // After we've deleted the span at 2: + // + // Span index 0 1 2 3 4 + // (---) + // XXXXX + // XXXXXXXXX + // + if (spans_.empty()) { + return; + } + size_t affectedLength = kMaximumSpanLength - 1; + size_t begin = loc <= affectedLength ? 0 : loc - affectedLength; + size_t end = loc >= 1 ? loc - 1 : 0; + for (size_t i = begin; i <= end; ++i) { + spans_[i].removeNodesOfOrLongerThan(loc - i + 1); + } +} + +void ReadingGrid::insert(size_t loc, const ReadingGrid::NodePtr& node) { + assert(loc < spans_.size()); + spans_[loc].add(node); +} + +std::string ReadingGrid::combineReading( + std::vector::const_iterator begin, + std::vector::const_iterator end) { + std::string result; + for (auto iter = begin; iter != end;) { + result += *iter; + ++iter; + if (iter != end) { + result += separator_; + } + } + return result; +} + +bool ReadingGrid::hasNodeAt(size_t loc, size_t readingLen, + const std::string& reading) { + if (loc >= spans_.size()) { + return false; + } + const NodePtr& n = spans_[loc].nodeOf(readingLen); + if (n == nullptr) { + return false; + } + return reading == n->reading(); +} + +void ReadingGrid::update() { + size_t begin = + (cursor_ <= kMaximumSpanLength) ?
0 : cursor_ - kMaximumSpanLength; + size_t end = cursor_ + kMaximumSpanLength; + if (end > readings_.size()) { + end = readings_.size(); + } + + for (size_t pos = begin; pos < end; pos++) { + for (size_t len = 1; len <= kMaximumSpanLength && pos + len <= end; len++) { + std::string combinedReading = + combineReading(readings_.begin() + static_cast(pos), + readings_.begin() + static_cast(pos + len)); + + if (!hasNodeAt(pos, len, combinedReading)) { + auto unigrams = lm_.getUnigrams(combinedReading); + if (unigrams.empty()) { + continue; + } + + insert(pos, std::make_shared(combinedReading, len, unigrams)); + } + } + } +} + +bool ReadingGrid::overrideCandidate( + size_t loc, const std::string* reading, const std::string& value, + ReadingGrid::Node::OverrideType overrideType) { + if (loc > readings_.size()) { + return false; + } + + std::vector overlappingNodes = + overlappingNodesAt(loc == readings_.size() ? loc - 1 : loc); + NodeInSpan overridden; + for (NodeInSpan& nis : overlappingNodes) { + if (reading != nullptr && nis.node->reading() != *reading) { + continue; + } + + if (nis.node->selectOverrideUnigram(value, overrideType)) { + overridden = nis; + break; + } + } + + if (overridden.node == nullptr) { + // Nothing gets overridden. + return false; + } + + for (size_t i = overridden.spanIndex; + i < overridden.spanIndex + overridden.node->spanningLength() && + i < spans_.size(); + ++i) { + // We also need to reset *all* nodes that share the same location in the + // span. For example, if previously the two walked nodes are "A BC" where + // A and BC are two nodes with overrides. The user now chooses "DEF" which + // is a node that shares the same span location with "A". The node with BC + // will be reset as it's part of the overlapping node, but A is not. 
+ std::vector nodes = overlappingNodesAt(i); + for (NodeInSpan& nis : nodes) { + if (nis.node != overridden.node) { + nis.node->reset(); + } + } + } + return true; +} + +std::vector ReadingGrid::overlappingNodesAt( + size_t loc) { + std::vector results; + + if (spans_.empty() || loc >= spans_.size()) { + return results; + } + + // First, get all nodes from the span at location. + for (size_t i = 1, len = spans_[loc].maxLength(); i <= len; ++i) { + NodePtr ptr = spans_[loc].nodeOf(i); + if (ptr != nullptr) { + ReadingGrid::NodeInSpan element{.node = std::move(ptr), .spanIndex = loc}; + results.emplace_back(std::move(element)); + } + } + + size_t begin = loc - std::min(loc, kMaximumSpanLength - 1); + for (size_t i = begin; i < loc; ++i) { + size_t beginLen = loc - i + 1; + size_t endLen = spans_[i].maxLength(); + for (size_t j = beginLen; j <= endLen; ++j) { + NodePtr ptr = spans_[i].nodeOf(j); + if (ptr != nullptr) { + ReadingGrid::NodeInSpan element{.node = std::move(ptr), .spanIndex = i}; + results.emplace_back(std::move(element)); + } + } + } + + return results; +} + +LanguageModel::Unigram ReadingGrid::Node::currentUnigram() const { + return unigrams_.empty() ? LanguageModel::Unigram{} : *unigramIter_; +} + +std::string ReadingGrid::Node::value() const { + return unigrams_.empty() ? 
"" : unigramIter_->value(); +} + +double ReadingGrid::Node::score() const { + if (unigrams_.empty()) { + return 0; + } + + switch (overrideType_) { + case OverrideType::kOverrideValueWithHighScore: + return kOverridingScore; + case OverrideType::kOverrideValueWithScoreFromTopUnigram: + return unigrams_[0].score(); + case OverrideType::kNone: + default: + return unigramIter_->score(); + } +} + +bool ReadingGrid::Node::isOverridden() const { + return overrideType_ != OverrideType::kNone; +} + +void ReadingGrid::Node::reset() { + unigramIter_ = unigrams_.begin(); + overrideType_ = OverrideType::kNone; +} + +bool ReadingGrid::Node::selectOverrideUnigram( + const std::string& value, ReadingGrid::Node::OverrideType type) { + assert(type != ReadingGrid::Node::OverrideType::kNone); + for (auto it = unigrams_.begin(), end = unigrams_.end(); it != end; ++it) { + if (value == it->value()) { + unigramIter_ = it; + overrideType_ = type; + return true; + } + } + return false; +} +std::vector ReadingGrid::WalkResult::valuesAsStrings() { + std::vector result; + for (const NodePtr& node : nodes) { + result.emplace_back(node->value()); + } + return result; +} + +std::vector ReadingGrid::WalkResult::readingsAsStrings() { + std::vector result; + for (const NodePtr& node : nodes) { + result.emplace_back(node->reading()); + } + return result; +} + +void ReadingGrid::Span::clear() { + nodes_.fill(nullptr); + maxLength_ = 0; +} + +void ReadingGrid::Span::add(const ReadingGrid::NodePtr& node) { + assert(node->spanningLength() > 0 && + node->spanningLength() <= kMaximumSpanLength); + nodes_[node->spanningLength() - 1] = node; + if (node->spanningLength() >= maxLength_) { + maxLength_ = node->spanningLength(); + } +} + +void ReadingGrid::Span::removeNodesOfOrLongerThan(size_t length) { + assert(length > 0 && length <= kMaximumSpanLength); + for (size_t i = length - 1; i < kMaximumSpanLength; ++i) { + nodes_[i] = nullptr; + } + maxLength_ = 0; + if (length == 1) { + return; + } + + size_t i = 
length - 2; + while (true) { + if (nodes_[i] != nullptr) { + maxLength_ = i + 1; + return; + } + + if (i == 0) { + return; + } + + --i; + } +} + +ReadingGrid::NodePtr ReadingGrid::Span::nodeOf(size_t length) const { + assert(length > 0 && length <= kMaximumSpanLength); + return nodes_[length - 1]; +} + +std::vector +ReadingGrid::ScoreRankedLanguageModel::getUnigrams(const std::string& reading) { + auto unigrams = lm_->getUnigrams(reading); + std::stable_sort( + unigrams.begin(), unigrams.end(), + [](const auto& u1, const auto& u2) { return u1.score() > u2.score(); }); + return unigrams; +} + +bool ReadingGrid::ScoreRankedLanguageModel::hasUnigrams( + const std::string& reading) { + return lm_->hasUnigrams(reading); +} + +} // namespace Formosa::Gramambular2 diff --git a/Source/Engine/gramambular2/reading_grid.h b/Source/Engine/gramambular2/reading_grid.h new file mode 100644 index 000000000..9039a66c3 --- /dev/null +++ b/Source/Engine/gramambular2/reading_grid.h @@ -0,0 +1,263 @@ +// Copyright (c) 2022 and onwards Lukhnos Liu. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef READING_GRID_H_ +#define READING_GRID_H_ + +#include +#include +#include +#include +#include +#include + +#include "language_model.h" + +namespace Formosa::Gramambular2 { + +// A grid for deriving the most likely hidden values from a series of +// observations. For our purpose, the observations are Bopomofo readings, and +// the hidden values are the actual Mandarin words. This can also be used for +// segmentation: in that case, the observations are Mandarin words, and the +// hidden values are the most likely groupings. +// +// While we use the terminology from hidden Markov model (HMM), the actual +// implementation is a much simpler Bayesian inference, since the underlying +// language model consists of only unigrams. Once we have put all plausible +// unigrams as nodes on the grid, a simple DAG shortest-path walk will give us +// the maximum likelihood estimation (MLE) for the hidden values. +class ReadingGrid { + public: + explicit ReadingGrid(std::shared_ptr lm) + : lm_(std::move(lm)) {} + + void clear(); + + [[nodiscard]] size_t length() const { return readings_.size(); } + + [[nodiscard]] size_t cursor() const { return cursor_; } + + void setCursor(size_t cursor); + + [[nodiscard]] std::string readingSeparator() const { return separator_; } + + void setReadingSeparator(const std::string& separator); + + bool insertReading(const std::string& reading); + + // Delete the reading before the cursor, like Backspace. Cursor will decrement + // by one. + bool deleteReadingBeforeCursor(); + + // Delete the reading after the cursor, like Del. Cursor is unmoved. 
+ bool deleteReadingAfterCursor(); + + static constexpr size_t kMaximumSpanLength = 6; + static constexpr char kDefaultSeparator[] = "-"; + + // A Node consists of a set of unigrams, a reading, and a spanning length. + // The spanning length denotes the length of the node in the grid. The grid + // is responsible for constructing its nodes. For Mandarin multi-character + // phrases, the grid will join separate readings into a single combined + // reading, and use that reading to retrieve the unigrams with that reading. + // Node with two-character phrases (so two readings, or two syllables) will + // then have a spanning length of 2. + class Node { + public: + enum class OverrideType { + kNone, + // Override the node with a unigram value and a score such that the node + // will almost always be favored by the walk. + kOverrideValueWithHighScore, + // Override the node with a unigram value but with the score of the + // top unigram. For example, if the unigrams in the node are ("a", -1), + // ("b", -2), ("c", -10), overriding using this type for "c" will cause + // the node to return the value "c" with the score -1. This is used for + // soft-override such as from a suggestion. The node with the override + // value will very likely be favored by a walk, but it does not prevent + // other nodes from prevailing, which would be the case if + // kOverrideValueWithHighScore was used. + kOverrideValueWithScoreFromTopUnigram + }; + + Node(std::string reading, size_t spanningLength, + std::vector unigrams) + : reading_(std::move(reading)), + spanningLength_(spanningLength), + unigrams_(std::move(unigrams)), + unigramIter_(unigrams_.begin()), + overrideType_(OverrideType::kNone) {} + + [[nodiscard]] const std::string& reading() const { return reading_; } + + [[nodiscard]] size_t spanningLength() const { return spanningLength_; } + + [[nodiscard]] const std::vector& unigrams() const { + return unigrams_; + } + + // Returns the top or overridden unigram. 
+ [[nodiscard]] LanguageModel::Unigram currentUnigram() const; + + [[nodiscard]] std::string value() const; + + [[nodiscard]] double score() const; + + [[nodiscard]] bool isOverridden() const; + + void reset(); + + bool selectOverrideUnigram(const std::string& value, OverrideType type); + + // A sufficiently high score to cause the walk to go through an overriding + // node. Although this can be 0, setting it to a positive value has the + // desirable side effect that it reduces the competition of "free-floating" + // multiple-character phrases. For example, if the user override for + // reading "a b c" is "A B c", using the uppercase as the overriding node, + // now the standalone c may have to compete with a phrase with reading "bc", + // which in some pathological cases may actually cause the shortest path to + // be A->bc, especially when A and B use the zero overriding score, as they + // leave "c" alone to compete with "bc", and whether the path A-B is favored + // now solely depends on that competition. A positive value favors the route + // A->B, which gives "c" a better chance. + static constexpr double kOverridingScore = 42; + + protected: + const std::string reading_; + const size_t spanningLength_; + const std::vector unigrams_; + std::vector::const_iterator unigramIter_; + OverrideType overrideType_; + }; + + using NodePtr = std::shared_ptr; + + struct WalkResult { + std::vector nodes; + size_t vertices; + size_t edges; + uint64_t elapsedMicroseconds; + + std::vector valuesAsStrings(); + std::vector readingsAsStrings(); + }; + + WalkResult walk(); + + struct Candidate { + Candidate(std::string r, std::string v) + : reading(std::move(r)), value(std::move(v)) {} + const std::string reading; + const std::string value; + }; + + // Returns all candidate values at the location. If spans are not empty and + // loc is at the end of the spans, (loc - 1) is used, so that the caller does + // not have to care about this boundary condition. 
+ std::vector candidatesAt(size_t loc); + + // Adds weight to the node with the unigram that has the designated candidate + // value and applies the desired override type, essentially resulting in user + // override. An overridden node would influence the grid walk to favor walking + // through it. + bool overrideCandidate(size_t loc, const Candidate& candidate, + Node::OverrideType overrideType = + Node::OverrideType::kOverrideValueWithHighScore); + + // Same as the method above, but since the string candidate value is used, if + // there are multiple nodes (of different spanning length) that have the same + // unigram value, it's not guaranteed which node will be selected. + bool overrideCandidate(size_t loc, const std::string& candidate, + Node::OverrideType overrideType = + Node::OverrideType::kOverrideValueWithHighScore); + + // A span is a collection of nodes that share the same starting location. + class Span { + public: + void clear(); + void add(const NodePtr& node); + void removeNodesOfOrLongerThan(size_t length); + [[nodiscard]] NodePtr nodeOf(size_t length) const; + [[nodiscard]] size_t maxLength() const { return maxLength_; } + + protected: + std::array nodes_; + size_t maxLength_ = 0; + }; + + // A language model wrapper that always returns score-ranked unigrams. 
+ class ScoreRankedLanguageModel : public LanguageModel { + public: + explicit ScoreRankedLanguageModel(std::shared_ptr lm) + : lm_(std::move(lm)) { + assert(lm_ != nullptr); + } + std::vector getUnigrams(const std::string& reading) override; + bool hasUnigrams(const std::string& reading) override; + + protected: + std::shared_ptr lm_; + }; + + [[nodiscard]] const std::vector& spans() const { return spans_; } + + [[nodiscard]] const std::vector& readings() const { + return readings_; + } + + protected: + size_t cursor_ = 0; + std::string separator_ = kDefaultSeparator; + std::vector readings_; + std::vector spans_; + ScoreRankedLanguageModel lm_; + + // Internal methods for maintaining the grid. + + void expandGridAt(size_t loc); + void shrinkGridAt(size_t loc); + void removeAffectedNodes(size_t loc); + void insert(size_t loc, const NodePtr& node); + std::string combineReading(std::vector::const_iterator begin, + std::vector::const_iterator end); + bool hasNodeAt(size_t loc, size_t readingLen, const std::string& reading); + void update(); + + // Internal implementation of overrideCandidate, with an optional reading. + bool overrideCandidate(size_t loc, const std::string* reading, + const std::string& value, + Node::OverrideType overrideType); + + struct NodeInSpan { + NodePtr node; + size_t spanIndex; + }; + + // Find all nodes that overlap with the location. The return value is a list + // of nodes along with their starting location in the grid. 
+ std::vector overlappingNodesAt(size_t loc); +}; + +} // namespace Formosa::Gramambular2 + +#endif diff --git a/Source/Engine/gramambular2/reading_grid_test.cpp b/Source/Engine/gramambular2/reading_grid_test.cpp new file mode 100644 index 000000000..b54c4b664 --- /dev/null +++ b/Source/Engine/gramambular2/reading_grid_test.cpp @@ -0,0 +1,676 @@ +// Copyright (c) 2022 and onwards Lukhnos Liu +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "reading_grid.h" + +#include + +#include +#include +#include +#include + +#include "language_model.h" + +namespace Formosa::Gramambular2 { + +constexpr char kSampleData[] = R"( +# +# The sample is from libtabe (https://sourceforge.net/projects/libtabe/) +# last updated in 2002. The project was originally initiated by +# Pai-Hsiang Hsiao in 1999. +# +# Libtabe is a frequency table of Taiwanese Mandarin words. 
The database +# itself is, according to the tar file, released under the BSD License. +# +ㄙ 絲 -9.495858 +ㄙ 思 -9.006414 +ㄙ 私 -99.000000 +ㄙ 斯 -8.091803 +ㄙ 司 -99.000000 +ㄙ 嘶 -13.513987 +ㄙ 撕 -12.259095 +ㄍㄠ 高 -7.171551 +ㄎㄜ 顆 -10.574273 +ㄎㄜ 棵 -11.504072 +ㄎㄜ 刻 -10.450457 +ㄎㄜ 科 -7.171052 +ㄎㄜ 柯 -99.000000 +ㄍㄠ 膏 -11.928720 +ㄍㄠ 篙 -13.624335 +ㄍㄠ 糕 -12.390804 +ㄉㄜ˙ 的 -3.516024 +ㄉㄧˊ 的 -3.516024 +ㄉㄧˋ 的 -3.516024 +ㄓㄨㄥ 中 -5.809297 +ㄉㄜ˙ 得 -7.427179 +ㄍㄨㄥ 共 -8.381971 +ㄍㄨㄥ 供 -8.501463 +ㄐㄧˋ 既 -99.000000 +ㄐㄧㄣ 今 -8.034095 +ㄍㄨㄥ 紅 -8.858181 +ㄐㄧˋ 際 -7.608341 +ㄐㄧˋ 季 -99.000000 +ㄐㄧㄣ 金 -7.290109 +ㄐㄧˋ 騎 -10.939895 +ㄓㄨㄥ 終 -99.000000 +ㄐㄧˋ 記 -99.000000 +ㄐㄧˋ 寄 -99.000000 +ㄐㄧㄣ 斤 -99.000000 +ㄐㄧˋ 繼 -9.715317 +ㄐㄧˋ 計 -7.926683 +ㄐㄧˋ 暨 -8.373022 +ㄓㄨㄥ 鐘 -9.877580 +ㄐㄧㄣ 禁 -10.711079 +ㄍㄨㄥ 公 -7.877973 +ㄍㄨㄥ 工 -7.822167 +ㄍㄨㄥ 攻 -99.000000 +ㄍㄨㄥ 功 -99.000000 +ㄍㄨㄥ 宮 -99.000000 +ㄓㄨㄥ 鍾 -9.685671 +ㄐㄧˋ 繫 -10.425662 +ㄍㄨㄥ 弓 -99.000000 +ㄍㄨㄥ 恭 -99.000000 +ㄐㄧˋ 劑 -8.888722 +ㄐㄧˋ 祭 -10.204425 +ㄐㄧㄣ 浸 -11.378321 +ㄓㄨㄥ 盅 -99.000000 +ㄐㄧˋ 忌 -99.000000 +ㄐㄧˋ 技 -8.450826 +ㄐㄧㄣ 筋 -11.074890 +ㄍㄨㄥ 躬 -99.000000 +ㄐㄧˋ 冀 -12.045357 +ㄓㄨㄥ 忠 -99.000000 +ㄐㄧˋ 妓 -99.000000 +ㄐㄧˋ 濟 -9.517568 +ㄐㄧˋ 薊 -12.021587 +ㄐㄧㄣ 巾 -99.000000 +ㄐㄧㄣ 襟 -12.784206 +ㄋㄧㄢˊ 年 -6.086515 +ㄐㄧㄤˇ 講 -9.164384 +ㄐㄧㄤˇ 獎 -8.690941 +ㄐㄧㄤˇ 蔣 -10.127828 +ㄋㄧㄢˊ 黏 -11.336864 +ㄋㄧㄢˊ 粘 -11.285740 +ㄐㄧㄤˇ 槳 -12.492933 +ㄍㄨㄥㄙ 公司 -6.299461 +ㄎㄜㄐㄧˋ 科技 -6.736613 +ㄐㄧˋㄍㄨㄥ 濟公 -13.336653 +ㄐㄧㄤˇㄐㄧㄣ 獎金 -10.344678 +ㄋㄧㄢˊㄓㄨㄥ 年終 -11.668947 +ㄋㄧㄢˊㄓㄨㄥ 年中 -11.373044 +ㄍㄠㄎㄜㄐㄧˋ 高科技 -9.842421 +)"; + +class SimpleLM : public LanguageModel { + public: + explicit SimpleLM(const char* input, bool readingIsFirstColumn = true) { + std::stringstream sstream(input); + while (sstream.good()) { + std::string line; + getline(sstream, line); + if (line.empty() || line[0] == '#') { + continue; + } + std::stringstream linestream(line); + std::string col0; + std::string col1; + std::string col2; + linestream >> col0; + linestream >> col1; + linestream >> col2; + m_db[readingIsFirstColumn ? 
col0 : col1].emplace_back( + readingIsFirstColumn ? col1 : col0, std::stod(col2)); + } + } + + std::vector getUnigrams(const std::string& key) override { + const auto f = m_db.find(key); + return f == m_db.end() ? std::vector() : (*f).second; + } + + bool hasUnigrams(const std::string& key) override { + return m_db.find(key) != m_db.end(); + } + + protected: + std::map> m_db; +}; + +class MockLM : public LanguageModel { + public: + std::vector getUnigrams(const std::string& reading) override { + return std::vector{Unigram(reading, -1)}; + } + bool hasUnigrams(const std::string&) override { return true; } +}; + +static bool Contains(const std::vector& candidates, + const std::string& str) { + return std::any_of(candidates.cbegin(), candidates.cend(), + [&str](const auto& v) { return v.value == str; }); +} + +TEST(ReadingGridTest, Span) { + SimpleLM lm(kSampleData); + ReadingGrid::Span span; + + auto n1 = + std::make_shared("ㄍㄠ", 1, lm.getUnigrams("ㄍㄠ")); + auto n3 = std::make_shared( + "ㄍㄠㄎㄜㄐㄧˋ", 3, lm.getUnigrams("ㄍㄠㄎㄜㄐㄧˋ")); + + ASSERT_EQ(span.maxLength(), 0); + span.add(n1); + ASSERT_EQ(span.maxLength(), 1); + span.add(n3); + ASSERT_EQ(span.maxLength(), 3); + ASSERT_EQ(span.nodeOf(1), n1); + ASSERT_EQ(span.nodeOf(2), nullptr); + ASSERT_EQ(span.nodeOf(3), n3); + ASSERT_EQ(span.nodeOf(ReadingGrid::kMaximumSpanLength), nullptr); + span.clear(); + ASSERT_EQ(span.maxLength(), 0); + ASSERT_EQ(span.nodeOf(1), nullptr); + ASSERT_EQ(span.nodeOf(2), nullptr); + ASSERT_EQ(span.nodeOf(3), nullptr); + ASSERT_EQ(span.nodeOf(ReadingGrid::kMaximumSpanLength), nullptr); + + span.add(n1); + span.add(n3); + span.removeNodesOfOrLongerThan(2); + ASSERT_EQ(span.maxLength(), 1); + ASSERT_EQ(span.nodeOf(1), n1); + ASSERT_EQ(span.nodeOf(2), nullptr); + ASSERT_EQ(span.nodeOf(3), nullptr); + span.removeNodesOfOrLongerThan(1); + ASSERT_EQ(span.maxLength(), 0); + ASSERT_EQ(span.nodeOf(1), nullptr); + + auto n10 = std::make_shared("", 10, lm.getUnigrams("")); + ASSERT_DEATH({ 
(void)span.add(n10); }, "Assertion"); + ASSERT_DEATH({ (void)span.nodeOf(0); }, "Assertion"); + ASSERT_DEATH({ (void)span.nodeOf(ReadingGrid::kMaximumSpanLength + 1); }, + "Assertion"); +} + +TEST(ReadingGridTest, ScoreRankedLanguageModel) { + class TestLM : public LanguageModel { + public: + std::vector getUnigrams(const std::string& reading) override { + std::vector unigrams; + if (reading == "foo") { + unigrams.emplace_back("middle", -5); + unigrams.emplace_back("highest", -2); + unigrams.emplace_back("lowest", -10); + } + return unigrams; + } + + bool hasUnigrams(const std::string& reading) override { + return reading == "foo"; + } + }; + + ReadingGrid::ScoreRankedLanguageModel lm(std::make_shared()); + ASSERT_TRUE(lm.hasUnigrams("foo")); + ASSERT_FALSE(lm.hasUnigrams("bar")); + ASSERT_TRUE(lm.getUnigrams("bar").empty()); + auto unigrams = lm.getUnigrams("foo"); + ASSERT_EQ(unigrams.size(), 3); + ASSERT_EQ(unigrams[0].value(), "highest"); + ASSERT_EQ(unigrams[0].score(), -2); + ASSERT_EQ(unigrams[1].value(), "middle"); + ASSERT_EQ(unigrams[1].score(), -5); + ASSERT_EQ(unigrams[2].value(), "lowest"); + ASSERT_EQ(unigrams[2].score(), -10); +} + +TEST(ReadingGridTest, BasicOperations) { + ReadingGrid grid(std::make_shared()); + ASSERT_EQ(grid.readingSeparator(), ReadingGrid::kDefaultSeparator); + + ASSERT_EQ(grid.cursor(), 0); + ASSERT_EQ(grid.length(), 0); + grid.insertReading("a"); + + ASSERT_EQ(grid.cursor(), 1); + ASSERT_EQ(grid.length(), 1); + ASSERT_EQ(grid.spans().size(), 1); + ASSERT_EQ(grid.spans()[0].maxLength(), 1); + ASSERT_EQ(grid.spans()[0].nodeOf(1)->reading(), "a"); + + grid.deleteReadingBeforeCursor(); + ASSERT_EQ(grid.cursor(), 0); + ASSERT_EQ(grid.length(), 0); + ASSERT_EQ(grid.spans().size(), 0); +} + +TEST(ReadingGridTest, InvalidOperations) { + class TestLM : public LanguageModel { + public: + std::vector getUnigrams(const std::string& reading) override { + std::vector unigrams; + if (reading == "foo") { + unigrams.emplace_back("foo", -1); + 
} + return unigrams; + } + + bool hasUnigrams(const std::string& reading) override { + return reading == "foo"; + } + }; + + ReadingGrid grid(std::make_shared()); + + grid.setReadingSeparator(";"); + ASSERT_FALSE(grid.insertReading("bar")); + ASSERT_FALSE(grid.insertReading("")); + ASSERT_FALSE(grid.insertReading(";")); + ASSERT_FALSE(grid.deleteReadingBeforeCursor()); + ASSERT_FALSE(grid.deleteReadingAfterCursor()); + + ASSERT_TRUE(grid.insertReading("foo")); + ASSERT_TRUE(grid.deleteReadingBeforeCursor()); + ASSERT_EQ(grid.length(), 0); + ASSERT_TRUE(grid.insertReading("foo")); + grid.setCursor(0); + ASSERT_TRUE(grid.deleteReadingAfterCursor()); + ASSERT_EQ(grid.length(), 0); +} + +TEST(ReadingGridTest, DeleteAfterCursor) { + ReadingGrid grid(std::make_shared()); + grid.insertReading("a"); + grid.setCursor(0); + ASSERT_EQ(grid.cursor(), 0); + ASSERT_EQ(grid.length(), 1); + ASSERT_EQ(grid.spans().size(), 1); + + grid.deleteReadingBeforeCursor(); + ASSERT_EQ(grid.cursor(), 0); + ASSERT_EQ(grid.length(), 1); + + grid.deleteReadingAfterCursor(); + ASSERT_EQ(grid.cursor(), 0); + ASSERT_EQ(grid.length(), 0); + ASSERT_EQ(grid.spans().size(), 0); +} + +TEST(ReadingGridTest, MultipleSpans) { + ReadingGrid grid(std::make_shared()); + grid.setReadingSeparator(";"); + grid.insertReading("a"); + grid.insertReading("b"); + grid.insertReading("c"); + + ASSERT_EQ(grid.cursor(), 3); + ASSERT_EQ(grid.length(), 3); + ASSERT_EQ(grid.spans().size(), 3); + ASSERT_EQ(grid.spans()[0].maxLength(), 3); + ASSERT_EQ(grid.spans()[0].nodeOf(1)->reading(), "a"); + ASSERT_EQ(grid.spans()[0].nodeOf(2)->reading(), "a;b"); + ASSERT_EQ(grid.spans()[0].nodeOf(3)->reading(), "a;b;c"); + ASSERT_EQ(grid.spans()[1].maxLength(), 2); + ASSERT_EQ(grid.spans()[1].nodeOf(1)->reading(), "b"); + ASSERT_EQ(grid.spans()[1].nodeOf(2)->reading(), "b;c"); + ASSERT_EQ(grid.spans()[2].maxLength(), 1); + ASSERT_EQ(grid.spans()[2].nodeOf(1)->reading(), "c"); +} + +TEST(ReadingGridTest, SpanDeletionSimple) { + 
ReadingGrid grid(std::make_shared()); + grid.setReadingSeparator(";"); + grid.insertReading("a"); + grid.insertReading("b"); + grid.insertReading("c"); + grid.deleteReadingBeforeCursor(); + ASSERT_EQ(grid.cursor(), 2); + ASSERT_EQ(grid.length(), 2); + ASSERT_EQ(grid.spans().size(), 2); + ASSERT_EQ(grid.spans()[0].maxLength(), 2); + ASSERT_EQ(grid.spans()[0].nodeOf(1)->reading(), "a"); + ASSERT_EQ(grid.spans()[0].nodeOf(2)->reading(), "a;b"); + ASSERT_EQ(grid.spans()[1].maxLength(), 1); + ASSERT_EQ(grid.spans()[1].nodeOf(1)->reading(), "b"); +} + +TEST(ReadingGridTest, SpanDeletionFromMiddle) { + ReadingGrid grid(std::make_shared()); + grid.setReadingSeparator(";"); + grid.insertReading("a"); + grid.insertReading("b"); + grid.insertReading("c"); + grid.setCursor(2); + grid.deleteReadingBeforeCursor(); + ASSERT_EQ(grid.cursor(), 1); + ASSERT_EQ(grid.length(), 2); + ASSERT_EQ(grid.spans().size(), 2); + ASSERT_EQ(grid.spans()[0].maxLength(), 2); + ASSERT_EQ(grid.spans()[0].nodeOf(1)->reading(), "a"); + ASSERT_EQ(grid.spans()[0].nodeOf(2)->reading(), "a;c"); + ASSERT_EQ(grid.spans()[1].maxLength(), 1); + ASSERT_EQ(grid.spans()[1].nodeOf(1)->reading(), "c"); +} + +TEST(ReadingGridTest, SpanDeletionFromMiddleUsingDeleteAfterCursor) { + ReadingGrid grid(std::make_shared()); + grid.setReadingSeparator(";"); + grid.insertReading("a"); + grid.insertReading("b"); + grid.insertReading("c"); + grid.setCursor(1); + grid.deleteReadingAfterCursor(); + ASSERT_EQ(grid.cursor(), 1); + ASSERT_EQ(grid.length(), 2); + ASSERT_EQ(grid.spans().size(), 2); + ASSERT_EQ(grid.spans()[0].maxLength(), 2); + ASSERT_EQ(grid.spans()[0].nodeOf(1)->reading(), "a"); + ASSERT_EQ(grid.spans()[0].nodeOf(2)->reading(), "a;c"); + ASSERT_EQ(grid.spans()[1].maxLength(), 1); + ASSERT_EQ(grid.spans()[1].nodeOf(1)->reading(), "c"); +} + +TEST(ReadingGridTest, SpanInsertion) { + ReadingGrid grid(std::make_shared()); + grid.setReadingSeparator(";"); + grid.insertReading("a"); + grid.insertReading("b"); + 
grid.insertReading("c"); + grid.setCursor(1); + grid.insertReading("X"); + + ASSERT_EQ(grid.cursor(), 2); + ASSERT_EQ(grid.length(), 4); + ASSERT_EQ(grid.spans().size(), 4); + ASSERT_EQ(grid.spans()[0].maxLength(), 4); + ASSERT_EQ(grid.spans()[0].nodeOf(1)->reading(), "a"); + ASSERT_EQ(grid.spans()[0].nodeOf(2)->reading(), "a;X"); + ASSERT_EQ(grid.spans()[0].nodeOf(3)->reading(), "a;X;b"); + ASSERT_EQ(grid.spans()[0].nodeOf(4)->reading(), "a;X;b;c"); + ASSERT_EQ(grid.spans()[1].maxLength(), 3); + ASSERT_EQ(grid.spans()[1].nodeOf(1)->reading(), "X"); + ASSERT_EQ(grid.spans()[1].nodeOf(2)->reading(), "X;b"); + ASSERT_EQ(grid.spans()[1].nodeOf(3)->reading(), "X;b;c"); + ASSERT_EQ(grid.spans()[2].maxLength(), 2); + ASSERT_EQ(grid.spans()[2].nodeOf(1)->reading(), "b"); + ASSERT_EQ(grid.spans()[2].nodeOf(2)->reading(), "b;c"); + ASSERT_EQ(grid.spans()[3].maxLength(), 1); + ASSERT_EQ(grid.spans()[3].nodeOf(1)->reading(), "c"); +} + +TEST(ReadingGridTest, LongGridDeletion) { + ReadingGrid grid(std::make_shared()); + grid.setReadingSeparator(""); + grid.insertReading("a"); + grid.insertReading("b"); + grid.insertReading("c"); + grid.insertReading("d"); + grid.insertReading("e"); + grid.insertReading("f"); + grid.insertReading("g"); + grid.insertReading("h"); + grid.insertReading("i"); + grid.insertReading("j"); + grid.insertReading("k"); + grid.insertReading("l"); + grid.insertReading("m"); + grid.insertReading("n"); + grid.setCursor(7); + grid.deleteReadingBeforeCursor(); + ASSERT_EQ(grid.cursor(), 6); + ASSERT_EQ(grid.length(), 13); + ASSERT_EQ(grid.spans().size(), 13); + ASSERT_EQ(grid.spans()[0].nodeOf(6)->reading(), "abcdef"); + ASSERT_EQ(grid.spans()[1].nodeOf(6)->reading(), "bcdefh"); + ASSERT_EQ(grid.spans()[1].nodeOf(5)->reading(), "bcdef"); + ASSERT_EQ(grid.spans()[2].nodeOf(6)->reading(), "cdefhi"); + ASSERT_EQ(grid.spans()[2].nodeOf(5)->reading(), "cdefh"); + ASSERT_EQ(grid.spans()[3].nodeOf(6)->reading(), "defhij"); + 
ASSERT_EQ(grid.spans()[4].nodeOf(6)->reading(), "efhijk"); + ASSERT_EQ(grid.spans()[5].nodeOf(6)->reading(), "fhijkl"); + ASSERT_EQ(grid.spans()[6].nodeOf(6)->reading(), "hijklm"); + ASSERT_EQ(grid.spans()[7].nodeOf(6)->reading(), "ijklmn"); + ASSERT_EQ(grid.spans()[8].nodeOf(5)->reading(), "jklmn"); +} + +TEST(ReadingGridTest, StressTest) { + constexpr char kStressData[] = R"( +ㄧ 一 -2.08170692 +ㄧ-ㄧ 一一 -4.38468400 +)"; + + ReadingGrid grid(std::make_shared(kStressData)); + for (int i = 0; i < 8001; i++) { + grid.insertReading("ㄧ"); + } + ReadingGrid::WalkResult result = grid.walk(); + std::cout << "stress test elapsed: " << result.elapsedMicroseconds + << " microseconds, vertices: " << result.vertices + << ", edges: " << result.edges << "\n"; +} + +TEST(ReadingGridTest, LongGridInsertion) { + ReadingGrid grid(std::make_shared()); + grid.setReadingSeparator(""); + grid.insertReading("a"); + grid.insertReading("b"); + grid.insertReading("c"); + grid.insertReading("d"); + grid.insertReading("e"); + grid.insertReading("f"); + grid.insertReading("g"); + grid.insertReading("h"); + grid.insertReading("i"); + grid.insertReading("j"); + grid.insertReading("k"); + grid.insertReading("l"); + grid.insertReading("m"); + grid.insertReading("n"); + grid.setCursor(7); + grid.insertReading("X"); + ASSERT_EQ(grid.cursor(), 8); + ASSERT_EQ(grid.length(), 15); + ASSERT_EQ(grid.spans().size(), 15); + ASSERT_EQ(grid.spans()[0].nodeOf(6)->reading(), "abcdef"); + ASSERT_EQ(grid.spans()[1].nodeOf(6)->reading(), "bcdefg"); + ASSERT_EQ(grid.spans()[2].nodeOf(6)->reading(), "cdefgX"); + ASSERT_EQ(grid.spans()[3].nodeOf(6)->reading(), "defgXh"); + ASSERT_EQ(grid.spans()[3].nodeOf(5)->reading(), "defgX"); + ASSERT_EQ(grid.spans()[4].nodeOf(6)->reading(), "efgXhi"); + ASSERT_EQ(grid.spans()[4].nodeOf(5)->reading(), "efgXh"); + ASSERT_EQ(grid.spans()[4].nodeOf(4)->reading(), "efgX"); + ASSERT_EQ(grid.spans()[4].nodeOf(3)->reading(), "efg"); + ASSERT_EQ(grid.spans()[5].nodeOf(6)->reading(), 
"fgXhij"); + ASSERT_EQ(grid.spans()[6].nodeOf(6)->reading(), "gXhijk"); + ASSERT_EQ(grid.spans()[7].nodeOf(6)->reading(), "Xhijkl"); + ASSERT_EQ(grid.spans()[8].nodeOf(6)->reading(), "hijklm"); +} + +TEST(ReadingGridTest, WordSegmentationTest) { + ReadingGrid grid( + std::make_shared(kSampleData, /*readingIsFirstColumn=*/false)); + grid.setReadingSeparator(""); + grid.insertReading("高"); + grid.insertReading("科"); + grid.insertReading("技"); + grid.insertReading("公"); + grid.insertReading("司"); + grid.insertReading("的"); + grid.insertReading("年"); + grid.insertReading("終"); + grid.insertReading("獎"); + grid.insertReading("金"); + + ReadingGrid::WalkResult result = grid.walk(); + ASSERT_EQ(result.readingsAsStrings(), + (std::vector{"高科技", "公司", "的", "年終", "獎金"})); +} + +TEST(ReadingGridTest, InputTest) { + ReadingGrid grid(std::make_shared(kSampleData)); + grid.setReadingSeparator(""); + grid.insertReading("ㄍㄠ"); + grid.insertReading("ㄐㄧˋ"); + grid.setCursor(1); + grid.insertReading("ㄎㄜ"); + grid.setCursor(0); + grid.deleteReadingAfterCursor(); + grid.insertReading("ㄍㄠ"); + grid.setCursor(grid.length()); + grid.insertReading("ㄍㄨㄥ"); + grid.insertReading("ㄙ"); + grid.insertReading("ㄉㄜ˙"); + grid.insertReading("ㄋㄧㄢˊ"); + grid.insertReading("ㄓㄨㄥ"); + grid.insertReading("ㄐㄧㄤˇ"); + grid.insertReading("ㄐㄧㄣ"); + ReadingGrid::WalkResult result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"高科技", "公司", "的", "年中", "獎金"})); + + ASSERT_EQ(grid.length(), 10); + grid.setCursor(7); // Before 年中 + + auto candidates = grid.candidatesAt(grid.cursor()); + ASSERT_TRUE(Contains(candidates, "年中")); + ASSERT_TRUE(Contains(candidates, "年終")); + ASSERT_TRUE(Contains(candidates, "中")); + ASSERT_TRUE(Contains(candidates, "鍾")); + + ASSERT_TRUE(grid.overrideCandidate(7, "年終")); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"高科技", "公司", "的", "年終", "獎金"})); +} + +TEST(ReadingGridTest, OverrideResetOverlappingNodes) { + ReadingGrid 
grid(std::make_shared(kSampleData)); + grid.setReadingSeparator(""); + grid.insertReading("ㄍㄠ"); + grid.insertReading("ㄎㄜ"); + grid.insertReading("ㄐㄧˋ"); + grid.setCursor(0); + ASSERT_TRUE(grid.overrideCandidate(grid.cursor(), "膏")); + ReadingGrid::WalkResult result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), (std::vector{"膏", "科技"})); + + ASSERT_TRUE(grid.overrideCandidate(1, "高科技")); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), (std::vector{"高科技"})); + + ASSERT_TRUE(grid.overrideCandidate(0, "膏")); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), (std::vector{"膏", "科技"})); + + ASSERT_TRUE(grid.overrideCandidate(1, "柯")); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"膏", "柯", "際"})); + + ASSERT_TRUE(grid.overrideCandidate(2, "暨")); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"膏", "柯", "暨"})); + + ASSERT_TRUE(grid.overrideCandidate(3, "高科技")); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), std::vector{"高科技"}); +} + +TEST(ReadingGridTest, OverrideResetTest) { + std::string sampleData(kSampleData); + sampleData += "ㄓㄨㄥㄐㄧㄤˇ 終講 -11.0\n"; + sampleData += "ㄐㄧㄤˇㄐㄧㄣ 槳襟 -11.0\n"; + + ReadingGrid grid(std::make_shared(sampleData.c_str())); + grid.setReadingSeparator(""); + grid.insertReading("ㄋㄧㄢˊ"); + grid.insertReading("ㄓㄨㄥ"); + grid.insertReading("ㄐㄧㄤˇ"); + grid.insertReading("ㄐㄧㄣ"); + ReadingGrid::WalkResult result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"年中", "獎金"})); + + ASSERT_TRUE(grid.overrideCandidate(1, "終講")); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"年", "終講", "金"})); + + ASSERT_TRUE(grid.overrideCandidate(2, "槳襟")); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"年中", "槳襟"})); + + ASSERT_TRUE(grid.overrideCandidate(0, "年終")); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"年終", "槳襟"})); +} + +TEST(ReadingGridTest, 
DisambiguateCandidates) { + std::string sampleData(kSampleData); + sampleData += R"( +ㄍㄠ 高 -2.9396 +ㄖㄜˋ 熱 -3.6024 +ㄍㄠㄖㄜˋ 高熱 -6.1526 +ㄏㄨㄛˇ 火 -3.6966 +ㄏㄨㄛˇ 🔥 -8 +ㄧㄢˋ 焰 -5.4466 +ㄏㄨㄛˇㄧㄢˋ 火焰 -5.6231 +ㄏㄨㄛˇㄧㄢˋ 🔥 -8 +ㄨㄟˊ 危 -3.9832 +ㄒㄧㄢˇ 險 -3.7810 +ㄨㄟˊㄒㄧㄢˇ 危險 -4.2623 +)"; + + ReadingGrid grid(std::make_shared(sampleData.c_str())); + grid.setReadingSeparator(""); + grid.insertReading("ㄍㄠ"); + grid.insertReading("ㄖㄜˋ"); + grid.insertReading("ㄏㄨㄛˇ"); + grid.insertReading("ㄧㄢˋ"); + grid.insertReading("ㄨㄟˊ"); + grid.insertReading("ㄒㄧㄢˇ"); + auto result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"高熱", "火焰", "危險"})); + + constexpr size_t loc = 2; // after 高熱 + + ASSERT_TRUE( + grid.overrideCandidate(loc, ReadingGrid::Candidate("ㄏㄨㄛˇ", "🔥"))); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"高熱", "🔥", "焰", "危險"})); + + ASSERT_TRUE( + grid.overrideCandidate(loc, ReadingGrid::Candidate("ㄏㄨㄛˇㄧㄢˋ", "🔥"))); + result = grid.walk(); + ASSERT_EQ(result.valuesAsStrings(), + (std::vector{"高熱", "🔥", "危險"})); +} + +} // namespace Formosa::Gramambular2 diff --git a/Source/InputMethodController.swift b/Source/InputMethodController.swift index 5fc82787f..6e5aac62b 100644 --- a/Source/InputMethodController.swift +++ b/Source/InputMethodController.swift @@ -379,11 +379,6 @@ extension McBopomofoInputMethodController { return } - let poppedText = state.poppedText - if !poppedText.isEmpty { - commit(text: poppedText, client: client) - } - // the selection range is where the cursor is, with the length being 0 and replacement range NSNotFound, // i.e. 
the client app needs to take care of where to put this composing buffer client.setMarkedText(state.attributedString, selectionRange: NSMakeRange(Int(state.cursorIndex), 0), replacementRange: NSMakeRange(NSNotFound, NSNotFound)) diff --git a/Source/InputState.swift b/Source/InputState.swift index 9601a99c7..f7699649c 100644 --- a/Source/InputState.swift +++ b/Source/InputState.swift @@ -132,7 +132,6 @@ class InputState: NSObject { /// Represents that the user is inputting text. @objc (InputStateInputting) class Inputting: NotEmpty { - @objc var poppedText: String = "" @objc var tooltip: String = "" @objc override init(composingBuffer: String, cursorIndex: UInt) { @@ -148,7 +147,7 @@ class InputState: NSObject { } override var description: String { - ", poppedText:\(poppedText)>" + "" } } diff --git a/Source/KeyHandler.mm b/Source/KeyHandler.mm index e230ba5ad..7dc7ee476 100644 --- a/Source/KeyHandler.mm +++ b/Source/KeyHandler.mm @@ -22,13 +22,17 @@ // OTHER DEALINGS IN THE SOFTWARE. #import "KeyHandler.h" -#import "Gramambular.h" +#import "reading_grid.h" #import "LanguageModelManager+Privates.h" #import "Mandarin.h" #import "McBopomofo-Swift.h" #import "McBopomofoLM.h" #import "UserOverrideModel.h" + +#import +#import #import +#import @import CandidateUI; @import NSStringUtils; @@ -36,38 +40,19 @@ InputMode InputModeBopomofo = @"org.openvanilla.inputmethod.McBopomofo.Bopomofo"; InputMode InputModePlainBopomofo = @"org.openvanilla.inputmethod.McBopomofo.PlainBopomofo"; -static const double kEpsilon = 0.000001; -static const size_t kMaxComposingBufferNeedsToWalkSize = 10; - - -static double FindHighestScore(const std::vector &nodes, double epsilon) -{ - double highestScore = 0.0; - for (auto ni = nodes.begin(), ne = nodes.end(); ni != ne; ++ni) { - double score = ni->node->highestUnigramScore(); - if (score > highestScore) { - highestScore = score; - } - } - return highestScore + epsilon; +static std::u32string ToU32(const std::string& s) { + std::wstring_convert, 
char32_t> conv; + return conv.from_bytes(s); } -// sort helper -class NodeAnchorDescendingSorter { -public: - bool operator()(const Formosa::Gramambular::NodeAnchor &a, const Formosa::Gramambular::NodeAnchor &b) const - { - return a.node->key().length() > b.node->key().length(); - } -}; - -// if DEBUG is defined, a DOT file (GraphViz format) will be written to the -// specified path every time the grid is walked -#if DEBUG -static NSString *const kGraphVizOutputfile = @"/tmp/McBopomofo-visualization.dot"; -#endif +static std::string ToU8(const std::u32string& s) { + std::wstring_convert, char32_t> conv; + return conv.to_bytes(s); +} @implementation KeyHandler { + std::shared_ptr _emptySharedPtr; + // the reading buffer that takes user input Formosa::Mandarin::BopomofoReadingBuffer *_bpmfReadingBuffer; @@ -77,11 +62,8 @@ @implementation KeyHandler { // user override model McBopomofo::UserOverrideModel *_userOverrideModel; - // the grid (lattice) builder for the unigrams - Formosa::Gramambular::BlockReadingBuilder *_builder; - - // latest walked path (trellis) using the Viterbi algorithm - std::vector _walkedNodes; + Formosa::Gramambular2::ReadingGrid *_grid; + Formosa::Gramambular2::ReadingGrid::WalkResult _latestWalk; NSString *_inputMode; } @@ -115,10 +97,16 @@ - (void)setInputMode:(NSString *)value _inputMode = newInputMode; _languageModel = newLanguageModel; - if (_builder) { - delete _builder; - _builder = new Formosa::Gramambular::BlockReadingBuilder(_languageModel); - _builder->setJoinSeparator("-"); + if (_grid == nullptr) { + NSLog(@"used after release????"); + } + + if (_grid != nullptr) { + delete _grid; + // This returns a shared_ptr that in turn points to an unmanaged object. 
+ std::shared_ptr lm(_emptySharedPtr, _languageModel); + _grid = new Formosa::Gramambular2::ReadingGrid(lm); + _grid->setReadingSeparator("-"); } if (!_bpmfReadingBuffer->isEmpty()) { @@ -129,14 +117,8 @@ - (void)setInputMode:(NSString *)value - (void)dealloc { - // clean up everything - if (_bpmfReadingBuffer) { - delete _bpmfReadingBuffer; - } - - if (_builder) { - delete _builder; - } + delete _bpmfReadingBuffer; + delete _grid; } - (instancetype)init @@ -150,10 +132,11 @@ - (instancetype)init _languageModel->setPhraseReplacementEnabled(Preferences.phraseReplacementEnabled); _userOverrideModel = [LanguageModelManager userOverrideModel]; - _builder = new Formosa::Gramambular::BlockReadingBuilder(_languageModel); + // This returns a shared_ptr that in turn points to an unmanaged object. + std::shared_ptr lm(_emptySharedPtr, _languageModel); + _grid = new Formosa::Gramambular2::ReadingGrid(lm); + _grid->setReadingSeparator("-"); - // each Mandarin syllable is separated by a hyphen - _builder->setJoinSeparator("-"); _inputMode = InputModeBopomofo; } return self; @@ -190,48 +173,38 @@ - (void)syncWithPreferences - (void)fixNodeWithValue:(NSString *)value useMoveCursorAfterSelectionSetting:(BOOL)flag { - size_t cursorIndex = [self _actualCandidateCursorIndex]; - std::string stringValue(value.UTF8String); - Formosa::Gramambular::NodeAnchor selectedNode = _builder->grid().fixNodeSelectedCandidate(cursorIndex, stringValue); - if (_inputMode != InputModePlainBopomofo) { - // If the length of the readings and the characters do not match, - // it often means it is a special symbol and it should not be stored - // in the user override model. 
- BOOL addToOverrideModel = YES; - if (selectedNode.spanningLength != [value count]) { - addToOverrideModel = NO; - } - if (addToOverrideModel) { - double score = selectedNode.node->scoreForCandidate(stringValue); - if (score <= -8) { - addToOverrideModel = NO; - } - } - if (addToOverrideModel) { - _userOverrideModel->observe(_walkedNodes, cursorIndex, stringValue, [[NSDate date] timeIntervalSince1970]); - } + size_t actualCursor = [self _actualCandidateCursorIndex]; + std::string candidateValue(std::string(value.UTF8String)); + if (!_grid->overrideCandidate(actualCursor, candidateValue)) { + return; } [self _walk]; - if (flag && Preferences.moveCursorAfterSelectingCandidate) { - size_t nextPosition = 0; - for (auto node : _walkedNodes) { - if (nextPosition >= cursorIndex) { - break; - } - nextPosition += node.spanningLength; - } - if (nextPosition <= _builder->length()) { - _builder->setCursorIndex(nextPosition); + // Update the user override model if warranted. + size_t accumulatedCursor = 0; + Formosa::Gramambular2::ReadingGrid::NodePtr currentNode; + for (const auto& node : _latestWalk.nodes) { + accumulatedCursor += node->spanningLength(); + if (accumulatedCursor > actualCursor) { + currentNode = node; + break; } } + + if (currentNode != nullptr && currentNode->currentUnigram().score() > -8) { + _userOverrideModel->observe(_latestWalk.nodes, actualCursor, candidateValue, [[NSDate date] timeIntervalSince1970]); + } + + if (currentNode != nullptr && flag && Preferences.moveCursorAfterSelectingCandidate) { + _grid->setCursor(accumulatedCursor); + } } - (void)clear { _bpmfReadingBuffer->clear(); - _builder->clear(); - _walkedNodes.clear(); + _grid->clear(); + _latestWalk = Formosa::Gramambular2::ReadingGrid::WalkResult{}; } - (std::string)_currentLayout @@ -351,10 +324,10 @@ - (BOOL)handleInput:(KeyHandlerInput *)input state:(InputState *)inState stateCa std::string reading = _bpmfReadingBuffer->syllable().composedString(); // see if we have a unigram for this - 
if (!_languageModel->hasUnigramsForKey(reading)) { + if (!_languageModel->hasUnigrams(reading)) { errorCallback(); _bpmfReadingBuffer->clear(); - if (!_builder->length()) { + if (!_grid->length()) { stateCallback([[InputStateEmptyIgnoringPreviousState alloc] init]); } else { stateCallback([self buildInputtingState]); @@ -362,29 +335,20 @@ - (BOOL)handleInput:(KeyHandlerInput *)input state:(InputState *)inState stateCa return YES; } - // and insert it into the lattice - _builder->insertReadingAtCursor(reading); - - // then walk the lattice - NSString *poppedText = [self _popOverflowComposingTextAndWalk]; + _grid->insertReading(reading); + [self _walk]; // get user override model suggestion - std::string overrideValue = (_inputMode == InputModePlainBopomofo) ? "" : _userOverrideModel->suggest(_walkedNodes, _builder->cursorIndex(), [[NSDate date] timeIntervalSince1970]); + std::string overrideValue = (_inputMode == InputModePlainBopomofo) ? "" : _userOverrideModel->suggest(_latestWalk.nodes, [self _actualCandidateCursorIndex], [[NSDate date] timeIntervalSince1970]); if (!overrideValue.empty()) { - size_t cursorIndex = [self _actualCandidateCursorIndex]; - std::vector nodes = _builder->grid().nodesCrossingOrEndingAt(cursorIndex); - double highestScore = FindHighestScore(nodes, kEpsilon); - _builder->grid().overrideNodeScoreForSelectedCandidate(cursorIndex, overrideValue, static_cast(highestScore)); + _grid->overrideCandidate([self _actualCandidateCursorIndex], overrideValue, Formosa::Gramambular2::ReadingGrid::Node::OverrideType::kOverrideValueWithScoreFromTopUnigram); } - [self fixNodesIfRequired]; - // then update the text _bpmfReadingBuffer->clear(); InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; - inputting.poppedText = poppedText; stateCallback(inputting); if (_inputMode == InputModePlainBopomofo) { @@ -433,7 +397,7 @@ - (BOOL)handleInput:(KeyHandlerInput *)input state:(InputState *)inState stateCa if (charCode == 32) { // if 
the spacebar is NOT set to be a selection key if ([input isShiftHold] || !Preferences.chooseCandidateUsingSpace) { - if (_builder->cursorIndex() >= _builder->length()) { + if (_grid->cursor() >= _grid->length()) { NSString *composingBuffer = [(InputStateNotEmpty *)state composingBuffer]; if (composingBuffer.length) { InputStateCommitting *committing = [[InputStateCommitting alloc] initWithPoppedText:composingBuffer]; @@ -444,11 +408,10 @@ - (BOOL)handleInput:(KeyHandlerInput *)input state:(InputState *)inState stateCa stateCallback(committing); InputStateEmpty *empty = [[InputStateEmpty alloc] init]; stateCallback(empty); - } else if (_languageModel->hasUnigramsForKey(" ")) { - _builder->insertReadingAtCursor(" "); - NSString *poppedText = [self _popOverflowComposingTextAndWalk]; + } else if (_languageModel->hasUnigrams(" ")) { + _grid->insertReading(" "); + [self _walk]; InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; - inputting.poppedText = poppedText; stateCallback(inputting); } return YES; @@ -514,12 +477,11 @@ - (BOOL)handleInput:(KeyHandlerInput *)input state:(InputState *)inState stateCa // MARK: Punctuation list if ((char)charCode == '`') { - if (_languageModel->hasUnigramsForKey("_punctuation_list")) { + if (_languageModel->hasUnigrams("_punctuation_list")) { if (_bpmfReadingBuffer->isEmpty()) { - _builder->insertReadingAtCursor("_punctuation_list"); - NSString *poppedText = [self _popOverflowComposingTextAndWalk]; + _grid->insertReading("_punctuation_list"); + [self _walk]; InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; - inputting.poppedText = poppedText; stateCallback(inputting); InputStateChoosingCandidate *choosingCandidate = [self _buildCandidateState:inputting useVerticalMode:input.useVerticalMode]; stateCallback(choosingCandidate); @@ -584,7 +546,7 @@ - (BOOL)handleInput:(KeyHandlerInput *)input state:(InputState *)inState stateCa - (BOOL)_handleTabState:(InputState *)state 
shiftIsHold:(BOOL)shiftIsHold stateCallback:(void (^)(InputState *))stateCallback errorCallback:(void (^)(void))errorCallback { - if (!_builder->length()) { + if (!_grid->length()) { return NO; } @@ -599,50 +561,53 @@ - (BOOL)_handleTabState:(InputState *)state shiftIsHold:(BOOL)shiftIsHold stateC } NSArray *candidates = [[self _buildCandidateState:(InputStateInputting *)state useVerticalMode:NO] candidates]; - if ([candidates count] == 0) { + if (candidates.count == 0) { errorCallback(); return YES; } + size_t cursorIndex = [self _actualCandidateCursorIndex]; size_t length = 0; - Formosa::Gramambular::NodeAnchor currentNode; + Formosa::Gramambular2::ReadingGrid::NodePtr currentNode; - for (auto node : _walkedNodes) { - length += node.spanningLength; - if (length >= cursorIndex) { + for (const auto& node : _latestWalk.nodes) { + length += node->spanningLength(); + if (length > cursorIndex) { currentNode = node; break; } } - NSString *currentValue = [[NSString alloc] initWithUTF8String:currentNode.node->currentKeyValue().value.c_str()]; + if (currentNode == nullptr) { + // Shouldn't happen. + errorCallback(); + return true; + } size_t currentIndex = 0; - if (currentNode.node->score() < - Formosa::Gramambular::kSelectedCandidateScore) { - // Once the user never select a candidate for the node, we start from the + if (!currentNode->isOverridden()) { + // If the user never selects a candidate for the node, we start from the // first candidate, so the user has a chance to use the unigram with two or // more characters when type the tab key for the first time. // // In other words, if a user type two BPMF readings, but the score of seeing // them as two unigrams is higher than a phrase with two characters, the // user can just use the longer phrase by typing the tab key. - if ([candidates[0] isEqualToString:currentValue]) { + if (currentNode->value() == [candidates[0] UTF8String]) { // If the first candidate is the value of the current node, we use next // one. 
if (shiftIsHold) { - currentIndex = candidates.count - 1; + currentIndex = [candidates count] - 1; } else { currentIndex = 1; } } } else { - for (NSString * candidate in candidates) { - if ([candidate isEqualToString:currentValue]) { + for (NSString* candidate : candidates) { + if (currentNode->value() == candidate.UTF8String) { if (shiftIsHold) { - currentIndex == 0 ? currentIndex = candidates.count - 1 - : currentIndex--; + currentIndex == 0 ? currentIndex = candidates.count - 1 : currentIndex--; } else { currentIndex++; } @@ -656,9 +621,7 @@ - (BOOL)_handleTabState:(InputState *)state shiftIsHold:(BOOL)shiftIsHold stateC currentIndex = 0; } - NSString *candidate = candidates[currentIndex]; - [self fixNodeWithValue:candidate useMoveCursorAfterSelectionSetting:NO]; - + [self fixNodeWithValue:candidates[currentIndex] useMoveCursorAfterSelectionSetting:NO]; InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; stateCallback(inputting); return YES; @@ -687,7 +650,7 @@ - (BOOL)_handleEscWithState:(InputState *)state stateCallback:(void (^)(InputSta if (!_bpmfReadingBuffer->isEmpty()) { _bpmfReadingBuffer->clear(); - if (!_builder->length()) { + if (!_grid->length()) { InputStateEmptyIgnoringPreviousState *empty = [[InputStateEmptyIgnoringPreviousState alloc] init]; stateCallback(empty); } else { @@ -725,8 +688,8 @@ - (BOOL)_handleBackwardWithState:(InputState *)state input:(KeyHandlerInput *)in stateCallback(state); } } else { - if (_builder->cursorIndex() > 0) { - _builder->setCursorIndex(_builder->cursorIndex() - 1); + if (_grid->cursor() > 0) { + _grid->setCursor(_grid->cursor() - 1); InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; stateCallback(inputting); } else { @@ -763,8 +726,8 @@ - (BOOL)_handleForwardWithState:(InputState *)state input:(KeyHandlerInput *)inp stateCallback(state); } } else { - if (_builder->cursorIndex() < _builder->length()) { - _builder->setCursorIndex(_builder->cursorIndex() 
+ 1); + if (_grid->cursor() < _grid->length()) { + _grid->setCursor(_grid->cursor() + 1); InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; stateCallback(inputting); } else { @@ -788,8 +751,8 @@ - (BOOL)_handleHomeWithState:(InputState *)state stateCallback:(void (^)(InputSt return YES; } - if (_builder->cursorIndex()) { - _builder->setCursorIndex(0); + if (_grid->cursor()) { + _grid->setCursor(0); InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; stateCallback(inputting); } else { @@ -812,8 +775,8 @@ - (BOOL)_handleEndWithState:(InputState *)state stateCallback:(void (^)(InputSta return YES; } - if (_builder->cursorIndex() != _builder->length()) { - _builder->setCursorIndex(_builder->length()); + if (_grid->cursor() != _grid->length()) { + _grid->setCursor(_grid->length()); InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; stateCallback(inputting); } else { @@ -847,8 +810,8 @@ - (BOOL)_handleBackspaceWithState:(InputState *)state stateCallback:(void (^)(In _bpmfReadingBuffer->clear(); } else if (_bpmfReadingBuffer->isEmpty()) { - if (_builder->cursorIndex()) { - _builder->deleteReadingBeforeCursor(); + if (_grid->cursor()) { + _grid->deleteReadingBeforeCursor(); [self _walk]; } else { errorCallback(); @@ -859,7 +822,7 @@ - (BOOL)_handleBackspaceWithState:(InputState *)state stateCallback:(void (^)(In _bpmfReadingBuffer->backspace(); } - if (_bpmfReadingBuffer->isEmpty() && !_builder->length()) { + if (_bpmfReadingBuffer->isEmpty() && !_grid->length()) { InputStateEmptyIgnoringPreviousState *empty = [[InputStateEmptyIgnoringPreviousState alloc] init]; stateCallback(empty); } else { @@ -876,8 +839,8 @@ - (BOOL)_handleDeleteWithState:(InputState *)state stateCallback:(void (^)(Input } if (_bpmfReadingBuffer->isEmpty()) { - if (_builder->cursorIndex() != _builder->length()) { - _builder->deleteReadingAfterCursor(); + if (_grid->cursor() != _grid->length()) { + 
_grid->deleteReadingAfterCursor(); [self _walk]; InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; if (!inputting.composingBuffer.length) { @@ -935,14 +898,13 @@ - (BOOL)_handleEnterWithState:(InputState *)state stateCallback:(void (^)(InputS - (BOOL)_handlePunctuation:(std::string)customPunctuation state:(InputState *)state usingVerticalMode:(BOOL)useVerticalMode stateCallback:(void (^)(InputState *))stateCallback errorCallback:(void (^)(void))errorCallback { - if (!_languageModel->hasUnigramsForKey(customPunctuation)) { + if (!_languageModel->hasUnigrams(customPunctuation)) { return NO; } - NSString *poppedText; if (_bpmfReadingBuffer->isEmpty()) { - _builder->insertReadingAtCursor(customPunctuation); - poppedText = [self _popOverflowComposingTextAndWalk]; + _grid->insertReading(customPunctuation); + [self _walk]; } else { // If there is still unfinished bpmf reading, ignore the punctuation errorCallback(); stateCallback(state); @@ -950,7 +912,6 @@ - (BOOL)_handlePunctuation:(std::string)customPunctuation state:(InputState *)st } InputStateInputting *inputting = (InputStateInputting *)[self buildInputtingState]; - inputting.poppedText = poppedText; stateCallback(inputting); if (_inputMode == InputModePlainBopomofo && _bpmfReadingBuffer->isEmpty()) { @@ -1246,11 +1207,11 @@ - (BOOL)_handleCandidateState:(InputState *)state std::string customPunctuation = punctuationNamePrefix + layout + std::string(1, (char)charCode); std::string punctuation = punctuationNamePrefix + std::string(1, (char)charCode); - BOOL shouldAutoSelectCandidate = _bpmfReadingBuffer->isValidKey((char)charCode) || _languageModel->hasUnigramsForKey(customPunctuation) || _languageModel->hasUnigramsForKey(punctuation); + BOOL shouldAutoSelectCandidate = _bpmfReadingBuffer->isValidKey((char)charCode) || _languageModel->hasUnigrams(customPunctuation) || _languageModel->hasUnigrams(punctuation); if (!shouldAutoSelectCandidate && (char)charCode >= 'A' && (char)charCode 
<= 'Z') { std::string letter = std::string("_letter_") + std::string(1, (char)charCode); - if (_languageModel->hasUnigramsForKey(letter)) { + if (_languageModel->hasUnigrams(letter)) { shouldAutoSelectCandidate = YES; } } @@ -1276,77 +1237,79 @@ - (BOOL)_handleCandidateState:(InputState *)state - (InputStateInputting *)buildInputtingState { - // "updating the composing buffer" means to request the client to "refresh" the text input buffer - // with our "composing text" - NSMutableString *composingBuffer = [[NSMutableString alloc] init]; - NSInteger composedStringCursorIndex = 0; + // To construct an Inputting state, we need to first retrieve the entire + // composing buffer from the current grid, then split the composed string + // into head and tail, so that we can insert the current reading (if + // not-empty) between them. + // + // We'll also need to compute the UTF-8 cursor index. The idea here is we + // use a "running" index that will eventually catch the cursor index in the + // builder. The tricky part is that if the spanning length of the node that + // the cursor is at does not agree with the actual codepoint count of the + // node's value, we'll need to move the cursor at the end of the node to + // avoid confusions. + size_t runningCursor = 0; // spanning-length-based, like the builder cursor + + std::string composed; + size_t builderCursor = _grid->cursor(); + size_t composedCursor = 0; // UTF-8 (so "byte") cursor per fcitx5 requirement. + NSString *tooltip = @""; - size_t readingCursorIndex = 0; - size_t builderCursorIndex = _builder->cursorIndex(); + for (const auto& node : _latestWalk.nodes) { + std::string value = node->value(); + composed += value; - NSString *tooltip = @""; + // No work if runningCursor has already caught up with builderCursor. 
+ if (runningCursor == builderCursor) { + continue; + } + size_t readingLength = node->spanningLength(); - // we must do some Unicode codepoint counting to find the actual cursor location for the client - // i.e. we need to take UTF-16 into consideration, for which a surrogate pair takes 2 UniChars - // locations - for (std::vector::iterator wi = _walkedNodes.begin(), we = _walkedNodes.end(); wi != we; ++wi) { - if ((*wi).node) { - std::string nodeStr = (*wi).node->currentKeyValue().value; - NSString *valueString = [NSString stringWithUTF8String:nodeStr.c_str()]; - [composingBuffer appendString:valueString]; - - NSArray *splited = [valueString split]; - NSInteger codepointCount = splited.count; - - // this re-aligns the cursor index in the composed string - // (the actual cursor on the screen) with the builder's logical - // cursor (reading) cursor; each built node has a "spanning length" - // (e.g. two reading blocks has a spanning length of 2), and we - // accumulate those lengths to calculate the displayed cursor - // index - size_t spanningLength = (*wi).spanningLength; - if (readingCursorIndex + spanningLength <= builderCursorIndex) { - composedStringCursorIndex += [valueString length]; - readingCursorIndex += spanningLength; - } else { - if (codepointCount == spanningLength) { - for (size_t i = 0; i < codepointCount && readingCursorIndex < builderCursorIndex; i++) { - composedStringCursorIndex += [splited[i] length]; - readingCursorIndex++; - } - } else { - if (readingCursorIndex < builderCursorIndex) { - composedStringCursorIndex += [valueString length]; - readingCursorIndex += spanningLength; - if (readingCursorIndex > builderCursorIndex) { - readingCursorIndex = builderCursorIndex; - } - if (builderCursorIndex == 0) { - tooltip = [NSString stringWithFormat:NSLocalizedString(@"Cursor is before \"%@\".", @""), - [NSString stringWithUTF8String:_builder->readings()[builderCursorIndex].c_str()]]; - } else if (builderCursorIndex >= _builder->readings().size()) { 
- tooltip = [NSString stringWithFormat:NSLocalizedString(@"Cursor is after \"%@\".", @""), - [NSString stringWithUTF8String:_builder->readings()[_builder->readings().size() - 1].c_str()]]; - } else { - tooltip = [NSString stringWithFormat:NSLocalizedString(@"Cursor is between \"%@\" and \"%@\".", @""), - [NSString stringWithUTF8String:_builder->readings()[builderCursorIndex - 1].c_str()], - [NSString stringWithUTF8String:_builder->readings()[builderCursorIndex].c_str()]]; - } - } - } - } + // Simple case: if the running cursor is behind, add the spanning length. + if (runningCursor + readingLength <= builderCursor) { + composedCursor += value.length(); + runningCursor += readingLength; + continue; + } + + // The builder cursor is in the middle of the node. + size_t distance = builderCursor - runningCursor; + std::u32string u32Value = ToU32(value); + + // The actual partial value's code point length is the shorter of the + // distance and the value's code point count. + size_t cpLen = std::min(distance, u32Value.length()); + std::u32string actualU32Value(u32Value.begin(), u32Value.begin() + static_cast(cpLen)); + std::string actualValue = ToU8(actualU32Value); + composedCursor += actualValue.length(); + runningCursor += distance; + + // Create a tooltip to warn the user that their cursor is between two + // readings (syllables) even if the cursor is not in the middle of a + // composed string due to its being shorter than the number of readings. + if (u32Value.length() < readingLength) { + // builderCursor is guaranteed to be > 0. If it was 0, we wouldn't even + // reach here due to runningCursor having already "caught up" with + // builderCursor. It is also guaranteed to be less than the size of the + // builder's readings for the same reason: runningCursor would have + // already caught up. 
+ const std::string& prevReading = _grid->readings()[builderCursor - 1]; + const std::string& nextReading = _grid->readings()[builderCursor]; + + tooltip = [NSString stringWithFormat:NSLocalizedString(@"Cursor is between \"%@\" and \"%@\".", @""), + [NSString stringWithUTF8String:prevReading.c_str()], + [NSString stringWithUTF8String:nextReading.c_str()]]; } } - // now we gather all the info, we separate the composing buffer to two parts, head and tail, - // and insert the reading text (the Mandarin syllable) in between them; - // the reading text is what the user is typing - NSString *head = [composingBuffer substringToIndex:composedStringCursorIndex]; + std::string headStr = composed.substr(0, composedCursor); + std::string tailStr =composed.substr(composedCursor, composed.length() - composedCursor); + + NSString *head = [NSString stringWithUTF8String:headStr.c_str()]; NSString *reading = [NSString stringWithUTF8String:_bpmfReadingBuffer->composedString().c_str()]; - NSString *tail = [composingBuffer substringFromIndex:composedStringCursorIndex]; + NSString *tail = [NSString stringWithUTF8String:tailStr.c_str()]; NSString *composedText = [head stringByAppendingString:[reading stringByAppendingString:tail]]; - NSInteger cursorIndex = composedStringCursorIndex + [reading length]; - + NSInteger cursorIndex = head.length + reading.length; InputStateInputting *newState = [[InputStateInputting alloc] initWithComposingBuffer:composedText cursorIndex:cursorIndex]; newState.tooltip = tooltip; return newState; @@ -1354,84 +1317,15 @@ - (InputStateInputting *)buildInputtingState - (void)_walk { - // retrieve the most likely trellis, i.e. 
a Maximum Likelihood Estimation - // of the best possible Mandarain characters given the input syllables, - // using the Viterbi algorithm implemented in the Gramambular library - Formosa::Gramambular::Walker walker(&_builder->grid()); - - // the walker traces the trellis from the end - _walkedNodes = walker.walk(0); - - // if DEBUG is defined, a GraphViz file is written to kGraphVizOutputfile -#if 0 - std::string dotDump = _builder->grid().dumpDOT(); - NSString *dotStr = [NSString stringWithUTF8String:dotDump.c_str()]; - NSError *error = nil; - - BOOL __unused success = [dotStr writeToFile:kGraphVizOutputfile atomically:YES encoding:NSUTF8StringEncoding error:&error]; -#endif -} - -- (NSString *)_popOverflowComposingTextAndWalk -{ - // in an ideal world, we can as well let the user type forever, - // but because the Viterbi algorithm has a complexity of O(N^2), - // the walk will become slower as the number of nodes increase, - // therefore we need to "pop out" overflown text -- they usually - // lose their influence over the whole MLE anyway -- so that when - // the user type along, the already composed text at front will - // be popped out - - NSString *poppedText = @""; - NSInteger composingBufferSize = Preferences.composingBufferSize; - - if (_builder->grid().width() > (size_t)composingBufferSize) { - if (_walkedNodes.size() > 0) { - Formosa::Gramambular::NodeAnchor &anchor = _walkedNodes[0]; - poppedText = [NSString stringWithUTF8String:anchor.node->currentKeyValue().value.c_str()]; - _builder->removeHeadReadings(anchor.spanningLength); - } - } - - [self _walk]; - return poppedText; -} - -- (void)fixNodesIfRequired -{ - size_t width = _builder->grid().width(); - if (width > kMaxComposingBufferNeedsToWalkSize) { - size_t index = 0; - for (auto node : _walkedNodes) { - if (index >= width - kMaxComposingBufferNeedsToWalkSize) { - break; - } - if (node.node->score() < Formosa::Gramambular::kSelectedCandidateScore) { - auto candidate = 
node.node->currentKeyValue().value; - _builder->grid().fixNodeSelectedCandidate(index + node.spanningLength, candidate); - } - index += node.spanningLength; - } - } + _latestWalk = _grid->walk(); } - - (InputStateChoosingCandidate *)_buildCandidateState:(InputStateNotEmpty *)currentState useVerticalMode:(BOOL)useVerticalMode { + auto candidates = _grid->candidatesAt([self _actualCandidateCursorIndex]); NSMutableArray *candidatesArray = [[NSMutableArray alloc] init]; - - size_t cursorIndex = [self _actualCandidateCursorIndex]; - std::vector nodes = _builder->grid().nodesCrossingOrEndingAt(cursorIndex); - - // sort the nodes, so that longer nodes (representing longer phrases) are placed at the top of the candidate list - stable_sort(nodes.begin(), nodes.end(), NodeAnchorDescendingSorter()); - - // then use the C++ trick to retrieve the candidates for each node at/crossing the cursor - for (std::vector::iterator ni = nodes.begin(), ne = nodes.end(); ni != ne; ++ni) { - const std::vector &candidates = (*ni).node->candidates(); - for (std::vector::const_iterator ci = candidates.begin(), ce = candidates.end(); ci != ce; ++ci) { - [candidatesArray addObject:[NSString stringWithUTF8String:(*ci).value.c_str()]]; - } + for (const auto& c : candidates) { + [candidatesArray addObject:[NSString stringWithUTF8String:c.value.c_str()]]; } InputStateChoosingCandidate *state = [[InputStateChoosingCandidate alloc] initWithComposingBuffer:currentState.composingBuffer cursorIndex:currentState.cursorIndex candidates:candidatesArray useVerticalMode:useVerticalMode]; @@ -1440,27 +1334,33 @@ - (InputStateChoosingCandidate *)_buildCandidateState:(InputStateNotEmpty *)curr - (size_t)_actualCandidateCursorIndex { - size_t cursorIndex = _builder->cursorIndex(); - if (Preferences.selectPhraseAfterCursorAsCandidate) { - // MS Phonetics IME style, phrase is *after* the cursor, i.e. 
cursor is always *before* the phrase - if (cursorIndex < _builder->length()) { - ++cursorIndex; - } - } else { - if (!cursorIndex) { - ++cursorIndex; - } + size_t cursor = _grid->cursor(); + + // If the cursor is at the end, always return cursor - 1. Even though + // ReadingGrid already handles this edge case, we want to use this value + // consistently. UserOverrideModel also requires the cursor to be this + // correct value. + if (cursor == _grid->length() && cursor > 0) { + return cursor - 1; + } + + // ReadingGrid already makes the assumption that the cursor is always *at* + // the reading location, and when selectPhraseAfterCursorAsCandidate is true + // we don't need to do anything. Rather, it's when the flag is false (the + // default value), that we want to decrement the cursor by one. + if (!Preferences.selectPhraseAfterCursorAsCandidate && cursor > 0) { + return cursor - 1; } - return cursorIndex; + return cursor; } - (NSArray *)_currentReadings { NSMutableArray *readingsArray = [[NSMutableArray alloc] init]; - std::vector v = _builder->readings(); - for (std::vector::iterator it_i = v.begin(); it_i != v.end(); ++it_i) { - [readingsArray addObject:[NSString stringWithUTF8String:it_i->c_str()]]; + std::vector v = _grid->readings(); + for (const auto& reading : _grid->readings()) { + [readingsArray addObject:[NSString stringWithUTF8String:reading.c_str()]]; } return readingsArray; } diff --git a/Source/LanguageModelManager.mm b/Source/LanguageModelManager.mm index dbfe4c7d0..d79ec9d04 100644 --- a/Source/LanguageModelManager.mm +++ b/Source/LanguageModelManager.mm @@ -192,10 +192,10 @@ + (BOOL)checkIfUserLanguageModelFilesExist + (BOOL)checkIfUserPhraseExist:(NSString *)userPhrase key:(NSString *)key NS_SWIFT_NAME(checkIfExist(userPhrase:key:)) { std::string unigramKey(key.UTF8String); - std::vector unigrams = gLanguageModelMcBopomofo.unigramsForKey(unigramKey); + auto unigrams = gLanguageModelMcBopomofo.getUnigrams(unigramKey); std::string 
userPhraseString(userPhrase.UTF8String); - for (auto unigram : unigrams) { - if (unigram.keyValue.value == userPhraseString) { + for (const auto& unigram : unigrams) { + if (unigram.value() == userPhraseString) { return YES; } } From 7084bda99b355ca170c4ec2c6783a976e76e0994 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Mon, 11 Jul 2022 22:15:43 -0700 Subject: [PATCH 2/4] Remove the composing buffer size setting --- McBopomofoTests/PreferencesTests.swift | 16 ----- Source/Base.lproj/preferences.xib | 59 ++------------- Source/Preferences.swift | 44 ------------ Source/PreferencesWindowController.swift | 9 --- Source/en.lproj/Localizable.strings | 4 -- Source/zh-Hant.lproj/Localizable.strings | 4 -- Source/zh-Hant.lproj/preferences.xib | 91 ++++++------------------ 7 files changed, 30 insertions(+), 197 deletions(-) diff --git a/McBopomofoTests/PreferencesTests.swift b/McBopomofoTests/PreferencesTests.swift index 78dffaffd..7687b1a62 100644 --- a/McBopomofoTests/PreferencesTests.swift +++ b/McBopomofoTests/PreferencesTests.swift @@ -123,22 +123,6 @@ class PreferencesTests: XCTestCase { XCTAssert(Preferences.useHorizontalCandidateList == true) } - func testComposingBufferSize() { - XCTAssert(Preferences.composingBufferSize == 10) - Preferences.composingBufferSize = 4 - XCTAssert(Preferences.composingBufferSize == 4) - Preferences.composingBufferSize = 20 - XCTAssert(Preferences.composingBufferSize == 20) - Preferences.composingBufferSize = 3 - XCTAssert(Preferences.composingBufferSize == 4) - Preferences.composingBufferSize = 101 - XCTAssert(Preferences.composingBufferSize == 100) - Preferences.composingBufferSize = 5 - XCTAssert(Preferences.composingBufferSize == 5) - Preferences.composingBufferSize = 19 - XCTAssert(Preferences.composingBufferSize == 19) - } - func testChooseCandidateUsingSpace() { XCTAssert(Preferences.chooseCandidateUsingSpace == true) Preferences.chooseCandidateUsingSpace = false diff --git a/Source/Base.lproj/preferences.xib 
b/Source/Base.lproj/preferences.xib index 11b8f397d..7e67e8f94 100644 --- a/Source/Base.lproj/preferences.xib +++ b/Source/Base.lproj/preferences.xib @@ -13,7 +13,6 @@ - @@ -29,7 +28,7 @@ - + @@ -109,7 +108,7 @@ - + @@ -223,7 +222,7 @@ - + @@ -324,7 +323,7 @@ - + @@ -368,7 +367,7 @@ - + @@ -386,7 +385,7 @@ - + @@ -430,14 +429,6 @@ - - - - - - - - @@ -463,55 +454,19 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + diff --git a/Source/Preferences.swift b/Source/Preferences.swift index 42aa1a29c..d6ca3cc4b 100644 --- a/Source/Preferences.swift +++ b/Source/Preferences.swift @@ -34,7 +34,6 @@ private let kCandidateListTextSizeKey = "CandidateListTextSize" private let kSelectPhraseAfterCursorAsCandidateKey = "SelectPhraseAfterCursorAsCandidate" private let kMoveCursorAfterSelectingCandidateKey = "MoveCursorAfterSelectingCandidate" private let kUseHorizontalCandidateListPreferenceKey = "UseHorizontalCandidateList" -private let kComposingBufferSizePreferenceKey = "ComposingBufferSize" private let kChooseCandidateUsingSpaceKey = "ChooseCandidateUsingSpaceKey" private let kChineseConversionEnabledKey = "ChineseConversionEnabled" private let kHalfWidthPunctuationEnabledKey = "HalfWidthPunctuationEnable" @@ -56,15 +55,6 @@ private let kDefaultCandidateListTextSize: CGFloat = 16 private let kMinCandidateListTextSize: CGFloat = 12 private let kMaxCandidateListTextSize: CGFloat = 196 -// default, min and max composing buffer size (in codepoints) -// modern Macs can usually work up to 16 codepoints when the builder still -// walks the grid with good performance; slower Macs (like old PowerBooks) -// will start to sputter beyond 12; such is the algorithmatic complexity -// of the Viterbi algorithm used in the builder library (at O(N^2)) -private let kDefaultComposingBufferSize = 10 -private let kMinComposingBufferSize = 4 -private let kMaxComposingBufferSize = 100 - private let kDefaultKeys = "123456789" private let kDefaultAssociatedPhrasesKeys = 
"!@#$%^&*(" @@ -135,36 +125,6 @@ struct CandidateListTextSize { } } -@propertyWrapper -struct ComposingBufferSize { - let key: String - let defaultValue: Int = kDefaultComposingBufferSize - lazy var container: UserDefault = { - UserDefault(key: key, defaultValue: defaultValue) - }() - - var wrappedValue: Int { - mutating get { - let currentValue = container.wrappedValue - if currentValue < kMinComposingBufferSize { - return kMinComposingBufferSize - } else if currentValue > kMaxComposingBufferSize { - return kMaxComposingBufferSize - } - return currentValue - } - set { - var value = newValue - if value < kMinComposingBufferSize { - value = kMinComposingBufferSize - } else if value > kMaxComposingBufferSize { - value = kMaxComposingBufferSize - } - container.wrappedValue = value - } - } -} - // MARK: - @objc enum KeyboardLayout: Int { @@ -232,7 +192,6 @@ class Preferences: NSObject { kCandidateListTextSizeKey, kSelectPhraseAfterCursorAsCandidateKey, kUseHorizontalCandidateListPreferenceKey, - kComposingBufferSizePreferenceKey, kChooseCandidateUsingSpaceKey, kChineseConversionEnabledKey, kHalfWidthPunctuationEnabledKey, @@ -278,9 +237,6 @@ class Preferences: NSObject { @UserDefault(key: kUseHorizontalCandidateListPreferenceKey, defaultValue: false) @objc static var useHorizontalCandidateList: Bool - @ComposingBufferSize(key: kComposingBufferSizePreferenceKey) - @objc static var composingBufferSize: Int - @UserDefault(key: kChooseCandidateUsingSpaceKey, defaultValue: true) @objc static var chooseCandidateUsingSpace: Bool diff --git a/Source/PreferencesWindowController.swift b/Source/PreferencesWindowController.swift index cab75d028..316c7de4b 100644 --- a/Source/PreferencesWindowController.swift +++ b/Source/PreferencesWindowController.swift @@ -51,7 +51,6 @@ fileprivate let kWindowTitleHeight: CGFloat = 78 @IBOutlet weak var advancedSettingsView: NSView! @IBOutlet weak var addPhraseHookPathField: NSTextField! 
- @IBOutlet weak var composingBufferSizeTextField: NSTextField! override func awakeFromNib() { let toolbar = NSToolbar(identifier: "preference toolbar") @@ -167,14 +166,6 @@ fileprivate let kWindowTitleHeight: CGFloat = 78 customUserPhraseLocationEnabledButton.selectItem(at: index) updateUserPhraseLocation() addPhraseHookPathField.stringValue = Preferences.addPhraseHookPath - let composingBufferSizeFormatter = NumberFormatter() - composingBufferSizeFormatter.numberStyle = .decimal - composingBufferSizeFormatter.generatesDecimalNumbers = true - composingBufferSizeFormatter.maximumIntegerDigits = 3 - composingBufferSizeFormatter.maximumFractionDigits = 0 - composingBufferSizeFormatter.maximum = 100 - composingBufferSizeFormatter.minimum = 4 - composingBufferSizeTextField.cell?.formatter = composingBufferSizeFormatter } @IBAction func updateBasisKeyboardLayoutAction(_ sender: Any) { diff --git a/Source/en.lproj/Localizable.strings b/Source/en.lproj/Localizable.strings index 02beeb589..062536504 100644 --- a/Source/en.lproj/Localizable.strings +++ b/Source/en.lproj/Localizable.strings @@ -85,10 +85,6 @@ "Certain Unicode symbols or characters not supported as user phrases." = "Certain Unicode symbols or characters not supported as user phrases."; -"Cursor is before \"%@\"." = "Cursor is before \"%@\"."; - -"Cursor is after \"%@\"." = "Cursor is after \"%@\"."; - "Cursor is between \"%@\" and \"%@\"." = "Cursor is between \"%@\" and \"%@\"."; "The phrase being marked \"%@\" already exists." = "The phrase being marked \"%@\" already exists."; diff --git a/Source/zh-Hant.lproj/Localizable.strings b/Source/zh-Hant.lproj/Localizable.strings index bba1e0944..7639d9435 100644 --- a/Source/zh-Hant.lproj/Localizable.strings +++ b/Source/zh-Hant.lproj/Localizable.strings @@ -85,10 +85,6 @@ "Certain Unicode symbols or characters not supported as user phrases." = "您輸入了特殊符號,我們還無法支援在這種狀況下手動加詞。"; -"Cursor is before \"%@\"." = "游標正在「%@」前方"; - -"Cursor is after \"%@\"." 
= "游標正在「%@」後方"; - "Cursor is between \"%@\" and \"%@\"." = "游標正在「%@」與「%@」之間"; "The phrase being marked \"%@\" already exists." = "您目前選擇了「%@」,這個詞彙已經存在了"; diff --git a/Source/zh-Hant.lproj/preferences.xib b/Source/zh-Hant.lproj/preferences.xib index 633a09101..e0c794478 100644 --- a/Source/zh-Hant.lproj/preferences.xib +++ b/Source/zh-Hant.lproj/preferences.xib @@ -13,7 +13,6 @@ - @@ -29,7 +28,7 @@ - + @@ -42,7 +41,7 @@ - + @@ -64,7 +63,7 @@ - + @@ -72,7 +71,7 @@ - + @@ -83,7 +82,7 @@ - + @@ -91,7 +90,7 @@ - + @@ -99,7 +98,7 @@ - + @@ -154,7 +153,7 @@ - + @@ -180,7 +179,7 @@ - + @@ -199,7 +198,7 @@ - + @@ -207,7 +206,7 @@