#
#   Copyright (C) 2002-2009, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#   file:  char.txt
#
#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
#      See Unicode Standard Annex #29.
#      These rules are based on TR29 Revision 13, for Unicode Version 5.1
#
#   In addition to the property-based TR29 rules, this file defines
#   per-script "letter" and "virama" sets for nine Indic scripts and adds
#   rules that keep letter/virama sequences together (see the forward and
#   reverse sections below).
#

#
#  Character Class Definitions.
#
#  The first six sets are taken directly from the Unicode
#  Grapheme_Cluster_Break property.
#
$CR          = [\p{Grapheme_Cluster_Break = CR}];
$LF          = [\p{Grapheme_Cluster_Break = LF}];
$Control     = [\p{Grapheme_Cluster_Break = Control}];
$Prepend     = [\p{Grapheme_Cluster_Break = Prepend}];
$Extend      = [\p{Grapheme_Cluster_Break = Extend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
#  Indic script definitions.
#
#  Each script gets a hand-listed set of letter code points and the single
#  code point of its virama sign.  These are explicit ranges, not
#  property-based sets, so they do not track later Unicode additions.
#
$BengaliLetter        = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1];
$BengaliSignVirama    = \u09CD;
$GujaratiLetter       = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1];
$GujaratiSignVirama   = \u0ACD;
$DevanagariLetter     = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F];
$DevanagariSignVirama = \u094D;
$KannadaLetter        = [\u0C85-\u0CB9 \u0CDE-\u0CE1];
$KannadaSignVirama    = \u0CCD;
$MalayalamLetter      = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F];
$MalayalamSignVirama  = \u0D4D;
$OriyaLetter          = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71];
$OriyaSignVirama      = \u0B4D;
$GurmukhiLetter       = [\u0A05-\u0A39 \u0A59-\u0A5E];
$GurmukhiSignVirama   = \u0A4D;
$TamilLetter          = [\u0B85-\u0BB9];
$TamilSignVirama      = \u0BCD;
$TeluguLetter         = [\u0C05-\u0C39 \u0C58-\u0C61];
$TeluguSignVirama     = \u0C4D;

#
# Korean Syllable Definitions
#
$L   = [\p{Grapheme_Cluster_Break = L}];
$V   = [\p{Grapheme_Cluster_Break = V}];
$T   = [\p{Grapheme_Cluster_Break = T}];

$LV  = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];


## -------------------------------------------------
#
#  Rule options.  !!chain applies to the rules that follow, so the short
#  two-element rules below can link up into longer matches.
#
!!chain;

#
#  Forward rules.  Each rule describes a sequence that must NOT be split
#  by a boundary.
#
!!forward;

# CR LF stays together.
$CR $LF;

# Indic conjuncts: a letter followed by one or more (virama, optional
# letter) groups of the same script is kept as one unit.
$BengaliLetter ($BengaliSignVirama $BengaliLetter?)+;
$GujaratiLetter ($GujaratiSignVirama $GujaratiLetter?)+;
$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+;
$KannadaLetter ($KannadaSignVirama $KannadaLetter?)+;
$MalayalamLetter ($MalayalamSignVirama $MalayalamLetter?)+;
$OriyaLetter ($OriyaSignVirama $OriyaLetter?)+;
$GurmukhiLetter ($GurmukhiSignVirama $GurmukhiLetter?)+;
$TamilLetter ($TamilSignVirama $TamilLetter?)+;
$TeluguLetter ($TeluguSignVirama $TeluguLetter?)+;

# Hangul syllable sequences.
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;

# Extend attaches to any preceding character except Control, CR and LF.
[^$Control $CR $LF] $Extend;

# SpacingMark attaches to the preceding character; Prepend attaches to the
# following one.  Control, CR and LF are excluded in both cases.
[^$Control $CR $LF] $SpacingMark;
$Prepend [^$Control $CR $LF];


## -------------------------------------------------
#
#  Reverse rules.  Each one is the corresponding forward rule written in
#  the order the characters are seen when moving backwards through text.
#
!!reverse;
$LF $CR;
($BengaliLetter? $BengaliSignVirama)+ $BengaliLetter;
($GujaratiLetter? $GujaratiSignVirama)+ $GujaratiLetter;
($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter;
($KannadaLetter? $KannadaSignVirama)+ $KannadaLetter;
($MalayalamLetter? $MalayalamSignVirama)+ $MalayalamLetter;
($OriyaLetter? $OriyaSignVirama)+ $OriyaLetter;
($GurmukhiLetter? $GurmukhiSignVirama)+ $GurmukhiLetter;
($TamilLetter? $TamilSignVirama)+ $TamilLetter;
($TeluguLetter? $TeluguSignVirama)+ $TeluguLetter;
($L | $V | $LV | $LVT) $L;
($V | $T) ($LV | $V);
$T ($LVT | $T);

$Extend [^$Control $CR $LF];
$SpacingMark [^$Control $CR $LF];
[^$Control $CR $LF] $Prepend;


## -------------------------------------------------
#
#  No safe-reverse rules are given in this file.
#  NOTE(review): how an empty section is handled depends on the ICU rule
#  compiler version in use - confirm before relying on it.
#
!!safe_reverse;


## -------------------------------------------------
#
#  No safe-forward rules are given in this file (same caveat as above).
#
!!safe_forward;
