#
# Copyright (C) 2002-2009, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: char.txt
#
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
# See Unicode Standard Annex #29.
# These rules are based on TR29 Revision 13, for Unicode Version 5.1
#

#
# Character Class Definitions.
#
$CR          = [\p{Grapheme_Cluster_Break = CR}];
$LF          = [\p{Grapheme_Cluster_Break = LF}];
$Control     = [\p{Grapheme_Cluster_Break = Control}];
$Prepend     = [\p{Grapheme_Cluster_Break = Prepend}];
$Extend      = [\p{Grapheme_Cluster_Break = Extend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
# Per-script letter sets and virama signs.
# Each <Script>Letter / <Script>SignVirama pair is referenced by one
# rule in the !!forward section and one mirrored rule in the !!reverse
# section below.
#
$BengaliLetter        = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1];
$BengaliSignVirama    = \u09CD;
$GujaratiLetter       = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1];
$GujaratiSignVirama   = \u0ACD;
$DevanagariLetter     = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F];
$DevanagariSignVirama = \u094D;
$KannadaLetter        = [\u0C85-\u0CB9 \u0CDE-\u0CE1];
$KannadaSignVirama    = \u0CCD;
$MalayalamLetter      = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F];
$MalayalamSignVirama  = \u0D4D;
$OriyaLetter          = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71];
$OriyaSignVirama      = \u0B4D;
$GurmukhiLetter       = [\u0A05-\u0A39 \u0A59-\u0A5E];
$GurmukhiSignVirama   = \u0A4D;
$TamilLetter          = [\u0B85-\u0BB9];
$TamilSignVirama      = \u0BCD;
$TeluguLetter         = [\u0C05-\u0C39 \u0C58-\u0C61];
$TeluguSignVirama     = \u0C4D;

#
# Korean Syllable Definitions
#
$L   = [\p{Grapheme_Cluster_Break = L}];
$V   = [\p{Grapheme_Cluster_Break = V}];
$T   = [\p{Grapheme_Cluster_Break = T}];

$LV  = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];


## -------------------------------------------------
!!chain;

#
# Forward rules. Each rule describes a sequence that must not be split
# by a character boundary.
#
!!forward;

# CR LF stays together.
$CR $LF;

# A letter followed by one or more (virama, optional letter) groups,
# one rule per script.
$BengaliLetter    ($BengaliSignVirama    $BengaliLetter?)+;
$GujaratiLetter   ($GujaratiSignVirama   $GujaratiLetter?)+;
$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+;
$KannadaLetter    ($KannadaSignVirama    $KannadaLetter?)+;
$MalayalamLetter  ($MalayalamSignVirama  $MalayalamLetter?)+;
$OriyaLetter      ($OriyaSignVirama      $OriyaLetter?)+;
$GurmukhiLetter   ($GurmukhiSignVirama   $GurmukhiLetter?)+;
$TamilLetter      ($TamilSignVirama      $TamilLetter?)+;
$TeluguLetter     ($TeluguSignVirama     $TeluguLetter?)+;

# Korean syllable sequences.
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;

# Extend, SpacingMark and Prepend attach to any neighbour that is not
# Control, CR or LF.
[^$Control $CR $LF] $Extend;

[^$Control $CR $LF] $SpacingMark;
$Prepend [^$Control $CR $LF];


## -------------------------------------------------

#
# Reverse rules. These are the forward rules with the order of their
# elements mirrored.
#
!!reverse;
$LF $CR;
($BengaliLetter?    $BengaliSignVirama)+    $BengaliLetter;
($GujaratiLetter?   $GujaratiSignVirama)+   $GujaratiLetter;
($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter;
($KannadaLetter?    $KannadaSignVirama)+    $KannadaLetter;
($MalayalamLetter?  $MalayalamSignVirama)+  $MalayalamLetter;
($OriyaLetter?      $OriyaSignVirama)+      $OriyaLetter;
($GurmukhiLetter?   $GurmukhiSignVirama)+   $GurmukhiLetter;
($TamilLetter?      $TamilSignVirama)+      $TamilLetter;
($TeluguLetter?     $TeluguSignVirama)+     $TeluguLetter;
($L | $V | $LV | $LVT) $L;
($V | $T) ($LV | $V);
$T ($LVT | $T);

$Extend      [^$Control $CR $LF];
$SpacingMark [^$Control $CR $LF];
[^$Control $CR $LF] $Prepend;


## -------------------------------------------------

# No rules are defined in this section.
!!safe_reverse;


## -------------------------------------------------

# No rules are defined in this section.
!!safe_forward;
