#
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: count_word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on Version 4.0.0, dated 2003-04-17
#



####################################################################################
#
# Character class definitions from TR 29
#
####################################################################################

# Katakana script characters plus the prolonged / voiced / semi-voiced sound marks.
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
             [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
             [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
             [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];


# HYPHEN-MINUS. Kept out of $ALetter so that the forward rules below can
# handle dashes explicitly.
$dash = \u002d;

# "Letters" for the purpose of these rules: alphabetic characters, punctuation,
# symbols and line-break-numeric characters, minus the dash and minus the scripts
# that are handled by their own rules (or not at all) below.
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
            [:P:] [:S:] [:LineBreak = Numeric:]
            - $dash
            - $Katakana
            - [:Script = Thai:]
            - [:Script = Lao:]
            - [:Script = Hiragana:]];

# ZERO WIDTH SPACE. Excluded from $Control and $Format below.
$TheZWSP = \u200b;

#
#  Character Class Definitions.
#    The names are those from TR29.
#
$CR      = \u000d;
$LF      = \u000a;
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
$Extend  = [[:Grapheme_Extend = TRUE:]];




####################################################################################
#
#  Word Break Rules.    Definitions and Rules specific to word break begin Here.
#
####################################################################################

$Format = [[:Cf:] - $TheZWSP];



# Rule 3:  Treat a grapheme cluster as if it were a single character.
#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
#          because we don't need to find the boundaries between adjacent syllables -
#          they won't be word boundaries.
#


#
#  "Extended" definitions.  Grapheme Cluster + Format Chars, treated like the base char.
#
$ALetterEx  = $ALetter  $Extend*;
$KatakanaEx = $Katakana $Extend*;
$FormatEx   = $Format   $Extend*;

#
#  Numbers.  Rules 8, 11, 12 from the TR.
#  (No separate number rules are defined in this file; $ALetter already
#   includes [:LineBreak = Numeric:], so digits are matched by the word rules.)
#

#
#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
#     - must include at least one letter.
#     - may include both letters and numbers.
#     - may include MidLetter, MidNumber punctuation.
#
$LetterSequence = $ALetterEx ($FormatEx* $ALetterEx)*;     # rules #6, #7
$LetterSequence {200};

# Dash handling: a (possibly empty) letter run followed by one or more dashes,
# and letter sequences joined by single dashes, each count as one unit.
$ALetterEx* $dash+ {200};
$ALetterEx* ($dash $LetterSequence)+ $dash* {200};

#
#  Do not break between Katakana.   Rule #13.
#  Each Hiragana character (with its $Extend chars) is matched on its own.
#
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
[:Hiragana:] $Extend* {300};

#
#  Ideographic Characters.  Stand by themselves as words.
#                           Separated from the "Everything Else" rule, below, only so that they
#                           can be tagged with a return value.   TODO:  is this what we want?
#  The tagged rule below is currently disabled (commented out).
#
# [:IDEOGRAPHIC:] $Extend* {400};

#
#  Everything Else, with no tag.
#                   Non-Control chars combine with $Extend (combining) chars.
#                   Controls do not.
#                   Ideographic characters are excluded from this rule.
#
[^$Control [:Ideographic:]] $Extend*;
$CR $LF;

#
#  Reverse Rules.   Back up over any of the chars that can group together.
#                   (Reverse rules do not need to be exact; they can back up too far,
#                   but must back up at least enough, and must stop on a boundary.)
#

# NonStarters are the set of all characters that can appear at the 2nd - nth position of
#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
#    reaches something that can only be the start (and probably only) char in a "word".
#    A space or punctuation meets the test.
#
# NOTE(review): $dash is not part of $NonStarters, although the forward rules above
#    allow $dash inside and at the end of a match - TODO confirm the reverse rule
#    still always stops on a real boundary for dash-joined text.
#
$NonStarters = [$ALetter $Katakana $Extend $Format];

#!.*;
! ($NonStarters* | \n \r) .;