diff --git a/docs/userguide/boundaryanalysis/break-rules.md b/docs/userguide/boundaryanalysis/break-rules.md index afc28829133a..02d8d9597737 100644 --- a/docs/userguide/boundaryanalysis/break-rules.md +++ b/docs/userguide/boundaryanalysis/break-rules.md @@ -270,6 +270,39 @@ See, for example, this snippet from the [line break rules](https://github.com/un $dictionary = [$SA]; ``` +The status value of dictionary breaks is determined as follows: +* the status value of the final break of the rule-based segment refined by the + dictionary breaks, if the largest status value defined in the rules is greater + than 100 (in that case, the rules are called *word-like*). +* 0 otherwise. + +> **Note:** In practice, only word segmentation is word-like. +> The need for a distinct behaviour was realized long after status values were +> introduced: prior to ICU 79, all rules were considered word-like. Using the +> largest status value as a heuristic allows rules that are customized versions +> of the word breaking rules to behave like word segmentation should, without +> needing to introduce a new syntax to select the status of dictionary breaks. + +> **Example:** +> With the rules +> ``` +> $dictionary = [A-Z]; +> $ {100}; +> [A-Z] [A-Z]; +> ``` +> The string `ARMAVIRUMQUECANO` has a final break with status 100. +> These rules are not word-like, so if dictionary breaking finds breaks between +> `ARMA`, `VIRUMQUE`, and `CANO`, +> these will have a status value of 0. +> If, however, the following rule is added, +> ``` +> . [?] {200}; +> ``` +> the rules become word-like, and breaks within `ARMAVIRUMQUECANO` all get a +> status value of 100. Any dictionary breaks within +> `QUOUSQUETANDEMABUTERECATILINAPATIENTIANOSTRA?` would get a status value of +> 200. 
+ ## Rule Options | Option | Description | diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp index 3ef030cb9195..da4121210d66 100644 --- a/icu4c/source/common/rbbi_cache.cpp +++ b/icu4c/source/common/rbbi_cache.cpp @@ -40,6 +40,11 @@ void RuleBasedBreakIterator::DictionaryCache::reset() { fFirstRuleStatusIndex = 0; fOtherRuleStatusIndex = 0; fBreaks.removeAllElements(); + int32_t maxStatus = 0; + for (int32_t i = 0; i < fBI->fData->fStatusMaxIdx; ++i) { + maxStatus = std::max(maxStatus, fBI->fData->fRuleStatusTable[i]); + } + isWordLike = maxStatus > 100; } UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) { @@ -60,7 +65,7 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_ r = fBreaks.elementAti(fPositionInCache); U_ASSERT(r > fromPos); *result = r; - *statusIndex = fOtherRuleStatusIndex; + *statusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0; return true; } @@ -70,7 +75,7 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_ r= fBreaks.elementAti(fPositionInCache); if (r > fromPos) { *result = r; - *statusIndex = fOtherRuleStatusIndex; + *statusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0; return true; } } @@ -97,7 +102,9 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_ r = fBreaks.elementAti(fPositionInCache); U_ASSERT(r < fromPos); *result = r; - *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + *statusIndex = (r == fStart) ? fFirstRuleStatusIndex + : isWordLike ? fOtherRuleStatusIndex + : 0; return true; } @@ -110,7 +117,9 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_ r = fBreaks.elementAti(fPositionInCache); if (r < fromPos) { *result = r; - *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + *statusIndex = (r == fStart) ? 
fFirstRuleStatusIndex + : isWordLike ? fOtherRuleStatusIndex + : 0; return true; } } diff --git a/icu4c/source/common/rbbi_cache.h b/icu4c/source/common/rbbi_cache.h index 597312e85c45..8ce8b4f21a67 100644 --- a/icu4c/source/common/rbbi_cache.h +++ b/icu4c/source/common/rbbi_cache.h @@ -64,6 +64,13 @@ class RuleBasedBreakIterator::DictionaryCache: public UMemory { // text segment being handled by the dictionary. int32_t fFirstRuleStatusIndex; // Rule status info for first boundary. int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. + + // If `this->isWordLike`, the status of dictionary breaks is equal to the status of the final + // break of the rule-based segment they refine (fOtherRuleStatusIndex); otherwise, dictionary + // breaks have status 0. + // For compatibility, this property is determined by the largest status value used by `*fBI`: + // rules that have a largest status greater than 100 are considered word-like. + bool isWordLike; }; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 781ce068be7b..0775fc3bd584 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1268,6 +1268,9 @@ between •Mae •Hong •Son •and •the •Salween •River, •the •Thano the •Khun •Tan •Range •(ดอย•ขุน•ตาน), •the •Phi •Pan •Nam •Range •(ทิว•เขา•ผี•ปัน•น้ำ), •as •well •as •the •western •\ part •of •the •Luang •Prabang •Range •(ทิว•เขา•หลวง•พระ•บาง).• +•บทความ•แนะนำ +<100> + # Breaking around numbers that begin with a decimal point. 
# Bug ICU-12017 diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java index bc494dd96452..8c1ac1a84506 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -1067,6 +1067,11 @@ void reset() { fFirstRuleStatusIndex = 0; fOtherRuleStatusIndex = 0; fBreaks.removeAllElements(); + int maxStatus = 0; + for (int status : fRData.fStatusTable) { + maxStatus = Math.max(status, maxStatus); + } + isWordLike = maxStatus > 100; } ; @@ -1090,7 +1095,7 @@ boolean following(int fromPos) { r = fBreaks.elementAt(fPositionInCache); assert (r > fromPos); fBoundary = r; - fStatusIndex = fOtherRuleStatusIndex; + fStatusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0; return true; } @@ -1100,7 +1105,7 @@ boolean following(int fromPos) { r = fBreaks.elementAt(fPositionInCache); if (r > fromPos) { fBoundary = r; - fStatusIndex = fOtherRuleStatusIndex; + fStatusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0; return true; } } @@ -1133,7 +1138,10 @@ boolean preceding(int fromPos) { r = fBreaks.elementAt(fPositionInCache); assert (r < fromPos); fBoundary = r; - fStatusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + fStatusIndex = + (r == fStart) + ? fFirstRuleStatusIndex + : isWordLike ? fOtherRuleStatusIndex : 0; return true; } @@ -1146,7 +1154,10 @@ boolean preceding(int fromPos) { r = fBreaks.elementAt(fPositionInCache); if (r < fromPos) { fBoundary = r; - fStatusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + fStatusIndex = + (r == fStart) + ? fFirstRuleStatusIndex + : isWordLike ? fOtherRuleStatusIndex : 0; return true; } } @@ -1287,6 +1298,15 @@ void populateDictionary( int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. int fBoundary; // Current boundary. 
Set by preceding(), following(). int fStatusIndex; // Current rule status index. Set by preceding, following(). + + /** + * If `this.isWordLike`, the status of dictionary breaks is equal to the status of the final + * break of the rule-based segment they refine (fOtherRuleStatusIndex); otherwise, + * dictionary breaks have status 0. For compatibility, this property is determined by the + * largest status value used by the rules: rules that have a largest status greater than 100 + * are considered word-like. + */ + private boolean isWordLike; } ; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 781ce068be7b..0775fc3bd584 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1268,6 +1268,9 @@ between •Mae •Hong •Son •and •the •Salween •River, •the •Thano the •Khun •Tan •Range •(ดอย•ขุน•ตาน), •the •Phi •Pan •Nam •Range •(ทิว•เขา•ผี•ปัน•น้ำ), •as •well •as •the •western •\ part •of •the •Luang •Prabang •Range •(ทิว•เขา•หลวง•พระ•บาง).• +•บทความ•แนะนำ +<100> + # Breaking around numbers that begin with a decimal point. # Bug ICU-12017