1*b1cdbd2cSJim Jagielski#
2*b1cdbd2cSJim Jagielski#   Copyright (C) 2002-2006, International Business Machines Corporation and others.
3*b1cdbd2cSJim Jagielski#       All Rights Reserved.
4*b1cdbd2cSJim Jagielski#
5*b1cdbd2cSJim Jagielski#   file:  sent.txt
6*b1cdbd2cSJim Jagielski#
7*b1cdbd2cSJim Jagielski#   ICU Sentence Break Rules
8*b1cdbd2cSJim Jagielski#      See Unicode Standard Annex #29.
9*b1cdbd2cSJim Jagielski#      These rules are based on UAX #29 version 5.0.0
10*b1cdbd2cSJim Jagielski#      Includes post 5.0 changes to treat Japanese half width voicing marks
11*b1cdbd2cSJim Jagielski#        as Grapheme Extend.
12*b1cdbd2cSJim Jagielski#
13*b1cdbd2cSJim Jagielski
14*b1cdbd2cSJim Jagielski
# Helper sets used to adjust the standard sentence-break categories.
#   $VoiceMarks - U+FF9E and U+FF9F, the Japanese half width voicing marks
#                 mentioned in the file header.  They are subtracted from
#                 $OLetter and added to $Extend further down.
#   $Thai       - all characters whose Script property is Thai; referenced by
#                 the Thai-specific forward rules.
15*b1cdbd2cSJim Jagielski$VoiceMarks   = [\uff9e\uff9f];
16*b1cdbd2cSJim Jagielski$Thai         = [:Script = Thai:];
17*b1cdbd2cSJim Jagielski
18*b1cdbd2cSJim Jagielski#
19*b1cdbd2cSJim Jagielski# Character categories as defined in TR 29
#   Each set is selected by the Sentence_Break property value of the same name.
#   The one exception is $OLetter, which also removes $VoiceMarks so that those
#   two characters can be treated as $Extend instead.
20*b1cdbd2cSJim Jagielski#
21*b1cdbd2cSJim Jagielski$Sep       = [\p{Sentence_Break = Sep}];
22*b1cdbd2cSJim Jagielski$Format    = [\p{Sentence_Break = Format}];
23*b1cdbd2cSJim Jagielski$Sp        = [\p{Sentence_Break = Sp}];
24*b1cdbd2cSJim Jagielski$Lower     = [\p{Sentence_Break = Lower}];
25*b1cdbd2cSJim Jagielski$Upper     = [\p{Sentence_Break = Upper}];
26*b1cdbd2cSJim Jagielski$OLetter   = [\p{Sentence_Break = OLetter}-$VoiceMarks];
27*b1cdbd2cSJim Jagielski$Numeric   = [\p{Sentence_Break = Numeric}];
28*b1cdbd2cSJim Jagielski$ATerm     = [\p{Sentence_Break = ATerm}];
29*b1cdbd2cSJim Jagielski$STerm     = [\p{Sentence_Break = STerm}];
30*b1cdbd2cSJim Jagielski$Close     = [\p{Sentence_Break = Close}];
31*b1cdbd2cSJim Jagielski
32*b1cdbd2cSJim Jagielski#
33*b1cdbd2cSJim Jagielski# Define extended forms of the character classes,
34*b1cdbd2cSJim Jagielski#   incorporate grapheme cluster + format chars.
35*b1cdbd2cSJim Jagielski#   Rules 4 and 5.
#   NOTE(review): the forward rule section labels "Ignore $Format and $Extend"
#   as Rule 5 and "Break after $Sep" as Rule 4, so "Rules 4 and 5" here may be
#   left over from an older rule numbering - confirm against TR 29.
#
#   Each ...Ex variable is one character of the base category followed by zero
#   or more $Extend or $Format characters.
36*b1cdbd2cSJim Jagielski
37*b1cdbd2cSJim Jagielski
# Carriage return and line feed, named so that the CR LF pair can be matched as a unit.
38*b1cdbd2cSJim Jagielski$CR         = \u000d;
39*b1cdbd2cSJim Jagielski$LF         = \u000a;
# Grapheme_Extend characters plus the two half width voicing marks in $VoiceMarks.
40*b1cdbd2cSJim Jagielski$Extend     = [[:Grapheme_Extend = TRUE:]$VoiceMarks];
41*b1cdbd2cSJim Jagielski
42*b1cdbd2cSJim Jagielski$SpEx       = $Sp      ($Extend | $Format)*;
43*b1cdbd2cSJim Jagielski$LowerEx    = $Lower   ($Extend | $Format)*;
44*b1cdbd2cSJim Jagielski$UpperEx    = $Upper   ($Extend | $Format)*;
45*b1cdbd2cSJim Jagielski$OLetterEx  = $OLetter ($Extend | $Format)*;
46*b1cdbd2cSJim Jagielski$NumericEx  = $Numeric ($Extend | $Format)*;
47*b1cdbd2cSJim Jagielski$ATermEx    = $ATerm   ($Extend | $Format)*;
48*b1cdbd2cSJim Jagielski$STermEx    = $STerm   ($Extend | $Format)*;
49*b1cdbd2cSJim Jagielski$CloseEx    = $Close   ($Extend | $Format)*;
50*b1cdbd2cSJim Jagielski
51*b1cdbd2cSJim Jagielski
52*b1cdbd2cSJim Jagielski## -------------------------------------------------
53*b1cdbd2cSJim Jagielski
# Start of the forward rules.  !!chain turns on rule chaining for this rule set;
# !!forward marks the rules that follow as forward rules (a !!reverse section
# comes later in the file).
54*b1cdbd2cSJim Jagielski!!chain;
55*b1cdbd2cSJim Jagielski!!forward;
56*b1cdbd2cSJim Jagielski
57*b1cdbd2cSJim Jagielski# Rule 3 - Keep CR/LF together.  (The break after separators is Rule 4.)
58*b1cdbd2cSJim Jagielski#
59*b1cdbd2cSJim Jagielski$CR $LF;
60*b1cdbd2cSJim Jagielski
# Thai run rule.  Not one of the numbered rules in this file.
#   $LettersEx is one letter, digit, $Close or $STerm character plus any trailing
#   $Extend / $Format characters.  The rule matches zero or more $LettersEx, one
#   $Thai character, zero or more $LettersEx, and then any number of $ATermEx or
#   $SpEx, so such a run is matched as one unit.
#   NOTE(review): presumably a local addition to keep Thai text from being split
#   by the generic rules - confirm the intent with the rule's author.
61*b1cdbd2cSJim Jagielski$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*;
62*b1cdbd2cSJim Jagielski$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*;
63*b1cdbd2cSJim Jagielski
64*b1cdbd2cSJim Jagielski# Rule 4 - Break after $Sep.
65*b1cdbd2cSJim Jagielski# Rule 5 - Ignore $Format and $Extend
#   The rule matches an optional single character that is not a $Sep, followed
#   by any number of $Extend or $Format characters.  A $Sep is never consumed by
#   this rule, and trailing $Extend / $Format characters stay attached to the
#   character they follow.
66*b1cdbd2cSJim Jagielski#
67*b1cdbd2cSJim Jagielski[^$Sep]? ($Extend | $Format)*;
68*b1cdbd2cSJim Jagielski
69*b1cdbd2cSJim Jagielski
70*b1cdbd2cSJim Jagielski# Rule 6
#   An $ATerm immediately followed by a $Numeric is matched as one unit.
71*b1cdbd2cSJim Jagielski$ATermEx $NumericEx;
72*b1cdbd2cSJim Jagielski
73*b1cdbd2cSJim Jagielski# Rule 7
#   $Upper, $ATerm, $Upper in sequence is matched as one unit.
74*b1cdbd2cSJim Jagielski$UpperEx $ATermEx $UpperEx;
75*b1cdbd2cSJim Jagielski
76*b1cdbd2cSJim Jagielski#Rule 8
77*b1cdbd2cSJim Jagielski#  Note:  follows errata for Unicode 5.0 boundary rules.
#   $NotLettersEx is any character outside $OLetter, $Upper, $Lower, $Sep, $ATerm
#   and $STerm, plus trailing $Extend / $Format characters.  The rule matches an
#   $ATerm, optional $Close and $Sp runs, any number of $NotLettersEx, and then
#   a $Lower character.
78*b1cdbd2cSJim Jagielski$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*;
79*b1cdbd2cSJim Jagielski$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
80*b1cdbd2cSJim Jagielski
81*b1cdbd2cSJim Jagielski# Rule 8a
#   A terminator ($STerm or $ATerm), optional $Close and $Sp runs, then another
#   terminator.
82*b1cdbd2cSJim Jagielski($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx);
83*b1cdbd2cSJim Jagielski
84*b1cdbd2cSJim Jagielski#Rule 9, 10, 11
#   A terminator, optional $Close and $Sp runs, and an optional single $Sep.
85*b1cdbd2cSJim Jagielski($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?;
86*b1cdbd2cSJim Jagielski
87*b1cdbd2cSJim Jagielski#Rule 12
#   Both rules start at the beginning of text ({bof}) or at a character that is
#   not $STerm, $ATerm, $Close, $Sp, $Sep, $Format or $Extend, and then skip any
#   run of $Extend, $Format, $Close or $Sp characters.
#
#   First rule:  the starting character must also be non-Thai, and the run must
#                be followed by one non-Thai character.
#   Second rule: the run must be followed by a $Sep, the end of text ({eof}) or
#                a CR LF pair.  A match is tagged with rule status value 100.
#                NOTE(review): what the value 100 means to callers is not
#                visible in this file - confirm.
88*b1cdbd2cSJim Jagielski[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend | $Format | $Close | $Sp)* [^$Thai];
89*b1cdbd2cSJim Jagielski[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100};
90*b1cdbd2cSJim Jagielski
91*b1cdbd2cSJim Jagielski## -------------------------------------------------
92*b1cdbd2cSJim Jagielski
# Start of the reverse rules, which are used when backing up through the text.
93*b1cdbd2cSJim Jagielski!!reverse;
94*b1cdbd2cSJim Jagielski
# Reverse counterparts of $SpEx, $ATermEx, $STermEx and $CloseEx.  The run of
# $Extend / $Format characters is listed before the base character here, the
# opposite order from the forward ...Ex definitions.
95*b1cdbd2cSJim Jagielski$SpEx_R       = ($Extend | $Format)* $Sp;
96*b1cdbd2cSJim Jagielski$ATermEx_R    = ($Extend | $Format)* $ATerm;
97*b1cdbd2cSJim Jagielski$STermEx_R    = ($Extend | $Format)* $STerm;
98*b1cdbd2cSJim Jagielski$CloseEx_R    = ($Extend | $Format)* $Close;
99*b1cdbd2cSJim Jagielski
100*b1cdbd2cSJim Jagielski#
101*b1cdbd2cSJim Jagielski#  Reverse rules.
102*b1cdbd2cSJim Jagielski#     For now, use the old style inexact reverse rules, which are easier
103*b1cdbd2cSJim Jagielski#     to write, but less efficient.
104*b1cdbd2cSJim Jagielski#     TODO:  exact reverse rules.  It appears that exact reverse rules
105*b1cdbd2cSJim Jagielski#            may require improving support for look-ahead breaks in the
106*b1cdbd2cSJim Jagielski#            builder.  Needs more investigation.
107*b1cdbd2cSJim Jagielski#
108*b1cdbd2cSJim Jagielski
# The single reverse rule.  Its parts, in the order written:
#     [{bof}]            anchor that inhibits rule chaining
#     (.? | $LF $CR)     the separator at which the backward scan probably begins
#     [^$Sep]*           the non-$Sep characters back to the preceding separator
#     [$Sep {eof}]       that preceding separator (or {eof})
#     (...)*             any immediately preceding STerm / ATerm sequences, each
#                        optionally followed in the text by $Close and $Sp runs
109*b1cdbd2cSJim Jagielski[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
110*b1cdbd2cSJim Jagielski#.*;
111*b1cdbd2cSJim Jagielski
112*b1cdbd2cSJim Jagielski# Explanation for the reverse rule above (the one beginning with [{bof}]):
113*b1cdbd2cSJim Jagielski#
114*b1cdbd2cSJim Jagielski#    It needs to back over
115*b1cdbd2cSJim Jagielski#        The $Sep at which we probably begin
116*b1cdbd2cSJim Jagielski#        All of the non $Sep chars leading to the preceding $Sep
117*b1cdbd2cSJim Jagielski#        The preceding $Sep, which will be the second one that the rule matches.
118*b1cdbd2cSJim Jagielski#        Any immediately preceding STerm or ATerm sequences.  We need to see these
119*b1cdbd2cSJim Jagielski#              to get the correct rule status when moving forwards again.
120*b1cdbd2cSJim Jagielski#
121*b1cdbd2cSJim Jagielski# [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
122*b1cdbd2cSJim Jagielski#                   the entire string.
123*b1cdbd2cSJim Jagielski#
124*b1cdbd2cSJim Jagielski# (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
125*b1cdbd2cSJim Jagielski#                   at the beginning of the string at this point, and we don't want to fail.
126*b1cdbd2cSJim Jagielski#                   Can only use {eof} once, and it is used later.
127*b1cdbd2cSJim Jagielski#
128*b1cdbd2cSJim Jagielski
129