#
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: dict_word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on Version 4.0.0, dated 2003-04-17
#



####################################################################################
#
# Character class definitions from TR 29
#
####################################################################################
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
             [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
             [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
             [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];

$Ideographic = [:Ideographic:];
$Hangul = [:Script = HANGUL:];

# Letters, widened with a number of signs, dashes and the ASCII digits so that
# they are treated as word characters. Scripts that have their own rules below
# (ideographs, Katakana, Hangul, Hiragana) and Thai/Lao are subtracted out.
$ALetter = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
            [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
            [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:]
            [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
            [:name = DIGIT ZERO:]
            [:name = DIGIT ONE:]
            [:name = DIGIT TWO:]
            [:name = DIGIT THREE:]
            [:name = DIGIT FOUR:]
            [:name = DIGIT FIVE:]
            [:name = DIGIT SIX:]
            [:name = DIGIT SEVEN:]
            [:name = DIGIT EIGHT:]
            [:name = DIGIT NINE:]
            - $Ideographic
            - $Katakana
            - $Hangul
            - [:Script = Thai:]
            - [:Script = Lao:]
            - [:Script = Hiragana:]];

# Characters allowed between two letters without ending the word.
# NOTE(review): PERCENT SIGN is listed twice below; the repeat looks redundant.
$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:]
              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]
              [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
              [:name = EN DASH:] [:name = EM DASH:]
              [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]];

# A single optional trailing character accepted at the end of a word (rule {200}).
$SufixLetter = [:name= FULL STOP:];

# Characters allowed between two digits without ending the number.
$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
           [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
           [:name = PRIME:]];
$Numeric = [:LineBreak = Numeric:];


$TheZWSP = \u200b;

#
# Character Class Definitions.
# The names are those from TR29.
#
$CR = \u000d;
$LF = \u000a;
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
$Extend = [[:Grapheme_Extend = TRUE:]];




####################################################################################
#
# Word Break Rules. Definitions and Rules specific to word break begin here.
#
####################################################################################

$Format = [[:Cf:] - $TheZWSP];



# Rule 3: Treat a grapheme cluster as if it were a single character.
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
# because we don't need to find the boundaries between adjacent syllables -
# they won't be word boundaries.
#


#
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
#
$ALetterEx = $ALetter $Extend*;
$NumericEx = $Numeric $Extend*;
$MidNumEx = $MidNum $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$SufixLetterEx= $SufixLetter $Extend*;
$KatakanaEx = $Katakana $Extend*;
$IdeographicEx= $Ideographic $Extend*;
$HangulEx = $Hangul $Extend*;
$FormatEx = $Format $Extend*;


#
# Numbers. Rules 8, 11, 12 from the TR.
#
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
$NumberSequence {100};

#
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
# - must include at least one letter.
# - may include both letters and numbers.
# - may include MidLetter, MidNumber punctuation.
#
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};

# Runs of punctuation and symbols, with no tag.
[[:P:][:S:]]*;

#
# Do not break between Katakana. Rule #13.
#
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
[:Hiragana:] $Extend* {300};

#
# Ideographic Characters. Stand by themselves as words.
# Separated from the "Everything Else" rule, below, only so that they
# can be tagged with a return value. TODO: is this what we want?
#
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
$HangulEx ($FormatEx* $HangulEx)* {400};

#
# Everything Else, with no tag.
# Non-Control chars combine with $Extend (combining) chars.
# Controls do not.
#
[^$Control [:Ideographic:]] $Extend*;
$CR $LF;

#
# Reverse Rules. Back up over any of the chars that can group together.
# (Reverse rules do not need to be exact; they can back up too far,
# but must back up at least enough, and must stop on a boundary.)
#

# NonStarters are the set of all characters that can appear at the 2nd - nth position of
# a word. (They may also be the first.) The reverse rule skips over these, until it
# reaches something that can only be the start (and probably only) char in a "word".
# A space or punctuation meets the test.
#
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];

#!.*;
! ($NonStarters* | \n \r) .;