#
#   Copyright (C) 2002-2009, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  char.txt
#
#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
#      See Unicode Standard Annex #29.
#      These rules are based on TR29 Revision 13, for Unicode Version 5.1
#

#
#  Character Class Definitions.
#
#  Each statement binds a $variable to a set of code points; the rule
#  sections of this file refer to the sets by these names.
#

#  Classes taken directly from the Unicode Grapheme_Cluster_Break property.
$CR          = [\p{Grapheme_Cluster_Break = CR}];
$LF          = [\p{Grapheme_Cluster_Break = LF}];
$Control     = [\p{Grapheme_Cluster_Break = Control}];
$Prepend     = [\p{Grapheme_Cluster_Break = Prepend}];
$Extend      = [\p{Grapheme_Cluster_Break = Extend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];

#  Per-script Indic classes.  These are not derived from a Unicode property:
#  every script gets a hand-listed set of letter code point ranges plus the
#  single code point of that script's virama sign.  The letter/virama rules
#  use them in pairs (letter set + virama of the same script).
$BengaliLetter        = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1];
$BengaliSignVirama    = \u09CD;
$GujaratiLetter       = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1];
$GujaratiSignVirama   = \u0ACD;
$DevanagariLetter     = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F];
$DevanagariSignVirama = \u094D;
$KannadaLetter        = [\u0C85-\u0CB9 \u0CDE-\u0CE1];
$KannadaSignVirama    = \u0CCD;
$MalayalamLetter      = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F];
$MalayalamSignVirama  = \u0D4D;
$OriyaLetter          = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71];
$OriyaSignVirama      = \u0B4D;
$GurmukhiLetter       = [\u0A05-\u0A39 \u0A59-\u0A5E];
$GurmukhiSignVirama   = \u0A4D;
$TamilLetter          = [\u0B85-\u0BB9];
$TamilSignVirama      = \u0BCD;
$TeluguLetter         = [\u0C05-\u0C39 \u0C58-\u0C61];
$TeluguSignVirama     = \u0C4D;

#
# Korean Syllable Definitions
#
# Each class is the set of code points whose Grapheme_Cluster_Break property
# has the value of the same name.  Per UAX #29: L, V and T are the conjoining
# jamo (leading consonant, vowel, trailing consonant); LV and LVT are the
# precomposed Hangul syllables of those shapes.
#
$L       = [\p{Grapheme_Cluster_Break = L}];
$V       = [\p{Grapheme_Cluster_Break = V}];
$T       = [\p{Grapheme_Cluster_Break = T}];

$LV      = [\p{Grapheme_Cluster_Break = LV}];
$LVT     = [\p{Grapheme_Cluster_Break = LVT}];


## -------------------------------------------------
#
# Forward rules.
#
# Every rule names a sequence that must NOT be split by a boundary.
# "!!chain" turns on rule chaining: a match may continue from one rule into
# another when the last character matched by the first rule is also the
# first character of the next.  That is why most rules below are written as
# simple two-element pairs -- longer clusters are built by chaining them.
#
!!chain;

!!forward;

# UAX #29 GB3: keep CR LF together.
$CR $LF;

# Indic extension (not part of UAX #29): a letter followed by one or more
# "virama [letter]" groups of the same script stays in one cluster.
$BengaliLetter ($BengaliSignVirama $BengaliLetter?)+;
$GujaratiLetter ($GujaratiSignVirama $GujaratiLetter?)+;
$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+;
$KannadaLetter ($KannadaSignVirama $KannadaLetter?)+;
$MalayalamLetter ($MalayalamSignVirama $MalayalamLetter?)+;
$OriyaLetter ($OriyaSignVirama $OriyaLetter?)+;
$GurmukhiLetter ($GurmukhiSignVirama $GurmukhiLetter?)+;
$TamilLetter ($TamilSignVirama $TamilLetter?)+;
$TeluguLetter ($TeluguSignVirama $TeluguLetter?)+;

# UAX #29 GB6, GB7, GB8: do not break inside Hangul syllable sequences.
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;

# UAX #29 GB9: no break before Extend.  The leading set excludes Control,
# CR and LF, so those characters never pick up a following mark.
[^$Control $CR $LF] $Extend;

# UAX #29 GB9a / GB9b: no break before SpacingMark, none after Prepend,
# again never across Control, CR or LF.
[^$Control $CR $LF] $SpacingMark;
$Prepend [^$Control $CR $LF];


## -------------------------------------------------
#
# Reverse rules.
#
# These match the text while moving backwards, so each rule is the
# corresponding forward rule with its elements written in the opposite
# order.  Keep the two sections in step: any change to a forward rule needs
# the mirrored change here.
#

!!reverse;

# Mirror of "$CR $LF".
$LF $CR;

# Mirrors of the per-script letter/virama rules.
($BengaliLetter? $BengaliSignVirama)+ $BengaliLetter;
($GujaratiLetter? $GujaratiSignVirama)+ $GujaratiLetter;
($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter;
($KannadaLetter? $KannadaSignVirama)+ $KannadaLetter;
($MalayalamLetter? $MalayalamSignVirama)+ $MalayalamLetter;
($OriyaLetter? $OriyaSignVirama)+ $OriyaLetter;
($GurmukhiLetter? $GurmukhiSignVirama)+ $GurmukhiLetter;
($TamilLetter? $TamilSignVirama)+ $TamilLetter;
($TeluguLetter? $TeluguSignVirama)+ $TeluguLetter;

# Mirrors of the three Hangul syllable rules.
($L | $V | $LV | $LVT) $L;
($V | $T) ($LV | $V);
$T ($LVT | $T);

# Mirrors of the Extend, SpacingMark and Prepend rules.
$Extend      [^$Control $CR $LF];
$SpacingMark [^$Control $CR $LF];
[^$Control $CR $LF] $Prepend;


## -------------------------------------------------
#
# Safe-point sections.  Both section headers are present but no rules are
# listed under either of them in this file.
# NOTE(review): the behaviour with empty safe sections depends on the ICU
# version compiling these rules -- confirm against the ICU release in use.
#

!!safe_reverse;


## -------------------------------------------------

!!safe_forward;
