1*449ab281SAndrew Rist /**************************************************************
2cdf0e10cSrcweir *
3*449ab281SAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one
4*449ab281SAndrew Rist * or more contributor license agreements. See the NOTICE file
5*449ab281SAndrew Rist * distributed with this work for additional information
6*449ab281SAndrew Rist * regarding copyright ownership. The ASF licenses this file
7*449ab281SAndrew Rist * to you under the Apache License, Version 2.0 (the
8*449ab281SAndrew Rist * "License"); you may not use this file except in compliance
9*449ab281SAndrew Rist * with the License. You may obtain a copy of the License at
10*449ab281SAndrew Rist *
11*449ab281SAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0
12*449ab281SAndrew Rist *
13*449ab281SAndrew Rist * Unless required by applicable law or agreed to in writing,
14*449ab281SAndrew Rist * software distributed under the License is distributed on an
15*449ab281SAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*449ab281SAndrew Rist * KIND, either express or implied. See the License for the
17*449ab281SAndrew Rist * specific language governing permissions and limitations
18*449ab281SAndrew Rist * under the License.
19*449ab281SAndrew Rist *
20*449ab281SAndrew Rist *************************************************************/
21*449ab281SAndrew Rist
22*449ab281SAndrew Rist
23cdf0e10cSrcweir
24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
25cdf0e10cSrcweir #include "precompiled_i18npool.hxx"
26cdf0e10cSrcweir
27cdf0e10cSrcweir #include <breakiteratorImpl.hxx>
28cdf0e10cSrcweir #include <unicode/uchar.h>
29cdf0e10cSrcweir #include <rtl/ustrbuf.hxx>
30cdf0e10cSrcweir
31cdf0e10cSrcweir using namespace ::com::sun::star::uno;
32cdf0e10cSrcweir using namespace ::com::sun::star::lang;
33cdf0e10cSrcweir using namespace ::rtl;
34cdf0e10cSrcweir
35cdf0e10cSrcweir namespace com { namespace sun { namespace star { namespace i18n {
36cdf0e10cSrcweir
BreakIteratorImpl(const Reference<XMultiServiceFactory> & rxMSF)37cdf0e10cSrcweir BreakIteratorImpl::BreakIteratorImpl( const Reference < XMultiServiceFactory >& rxMSF ) : xMSF( rxMSF )
38cdf0e10cSrcweir {
39cdf0e10cSrcweir }
40cdf0e10cSrcweir
BreakIteratorImpl()41cdf0e10cSrcweir BreakIteratorImpl::BreakIteratorImpl()
42cdf0e10cSrcweir {
43cdf0e10cSrcweir }
44cdf0e10cSrcweir
~BreakIteratorImpl()45cdf0e10cSrcweir BreakIteratorImpl::~BreakIteratorImpl()
46cdf0e10cSrcweir {
47cdf0e10cSrcweir // Clear lookuptable
48cdf0e10cSrcweir for (size_t l = 0; l < lookupTable.size(); l++)
49cdf0e10cSrcweir delete lookupTable[l];
50cdf0e10cSrcweir lookupTable.clear();
51cdf0e10cSrcweir }
52cdf0e10cSrcweir
// Shorthand for the forwarding methods below: yields the break iterator for
// the variable named rLocale, which must be in scope where the macro is used.
#define LBI (getLocaleSpecificBreakIterator(rLocale))
54cdf0e10cSrcweir
nextCharacters(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)55cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
56cdf0e10cSrcweir const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
57cdf0e10cSrcweir throw(RuntimeException)
58cdf0e10cSrcweir {
59cdf0e10cSrcweir if (nCount < 0) throw RuntimeException();
60cdf0e10cSrcweir
61cdf0e10cSrcweir return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
62cdf0e10cSrcweir }
63cdf0e10cSrcweir
previousCharacters(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)64cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
65cdf0e10cSrcweir const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
66cdf0e10cSrcweir throw(RuntimeException)
67cdf0e10cSrcweir {
68cdf0e10cSrcweir if (nCount < 0) throw RuntimeException();
69cdf0e10cSrcweir
70cdf0e10cSrcweir return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
71cdf0e10cSrcweir }
72cdf0e10cSrcweir
// True when the code point c is U+200B (ZERO WIDTH SPACE).
// The argument is now actually used (and parenthesized); previously the macro
// ignored it and tested whatever variable named `ch` was in scope.
#define isZWSP(c) ((c) == 0x200B)
74cdf0e10cSrcweir
skipSpace(const OUString & Text,sal_Int32 nPos,sal_Int32 len,sal_Int16 rWordType,sal_Bool bDirection)75cdf0e10cSrcweir static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, sal_Bool bDirection)
76cdf0e10cSrcweir {
77cdf0e10cSrcweir sal_uInt32 ch=0;
78cdf0e10cSrcweir sal_Int32 pos=nPos;
79cdf0e10cSrcweir switch (rWordType) {
80cdf0e10cSrcweir case WordType::ANYWORD_IGNOREWHITESPACES:
81cdf0e10cSrcweir if (bDirection)
82cdf0e10cSrcweir while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
83cdf0e10cSrcweir else
84cdf0e10cSrcweir while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
85cdf0e10cSrcweir break;
86cdf0e10cSrcweir case WordType::DICTIONARY_WORD:
87cdf0e10cSrcweir if (bDirection)
88cdf0e10cSrcweir while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch) ||
89cdf0e10cSrcweir ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
90cdf0e10cSrcweir else
91cdf0e10cSrcweir while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch) ||
92cdf0e10cSrcweir ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
93cdf0e10cSrcweir break;
94cdf0e10cSrcweir case WordType::WORD_COUNT:
95cdf0e10cSrcweir if (bDirection)
96cdf0e10cSrcweir while (nPos < len && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
97cdf0e10cSrcweir else
98cdf0e10cSrcweir while (nPos > 0 && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
99cdf0e10cSrcweir break;
100cdf0e10cSrcweir }
101cdf0e10cSrcweir return nPos;
102cdf0e10cSrcweir }
103cdf0e10cSrcweir
nextWord(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int16 rWordType)104cdf0e10cSrcweir Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
105cdf0e10cSrcweir const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
106cdf0e10cSrcweir {
107cdf0e10cSrcweir sal_Int32 len = Text.getLength();
108cdf0e10cSrcweir if( nStartPos < 0 || len == 0 )
109cdf0e10cSrcweir result.endPos = result.startPos = 0;
110cdf0e10cSrcweir else if (nStartPos >= len)
111cdf0e10cSrcweir result.endPos = result.startPos = len;
112cdf0e10cSrcweir else {
113cdf0e10cSrcweir result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
114cdf0e10cSrcweir
115cdf0e10cSrcweir nStartPos = skipSpace(Text, result.startPos, len, rWordType, sal_True);
116cdf0e10cSrcweir
117cdf0e10cSrcweir if ( nStartPos != result.startPos) {
118cdf0e10cSrcweir if( nStartPos >= len )
119cdf0e10cSrcweir result.startPos = result.endPos = len;
120cdf0e10cSrcweir else {
121cdf0e10cSrcweir result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, sal_True);
122cdf0e10cSrcweir // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
123cdf0e10cSrcweir if (result.startPos < nStartPos) result.startPos = nStartPos;
124cdf0e10cSrcweir }
125cdf0e10cSrcweir }
126cdf0e10cSrcweir }
127cdf0e10cSrcweir return result;
128cdf0e10cSrcweir }
129cdf0e10cSrcweir
isCJK(const Locale & rLocale)130cdf0e10cSrcweir static inline sal_Bool SAL_CALL isCJK( const Locale& rLocale ) {
131cdf0e10cSrcweir return rLocale.Language.equalsAscii("zh") || rLocale.Language.equalsAscii("ja") || rLocale.Language.equalsAscii("ko");
132cdf0e10cSrcweir }
133cdf0e10cSrcweir
previousWord(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int16 rWordType)134cdf0e10cSrcweir Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
135cdf0e10cSrcweir const Locale& rLocale, sal_Int16 rWordType) throw(RuntimeException)
136cdf0e10cSrcweir {
137cdf0e10cSrcweir sal_Int32 len = Text.getLength();
138cdf0e10cSrcweir if( nStartPos <= 0 || len == 0 ) {
139cdf0e10cSrcweir result.endPos = result.startPos = 0;
140cdf0e10cSrcweir return result;
141cdf0e10cSrcweir } else if (nStartPos > len) {
142cdf0e10cSrcweir result.endPos = result.startPos = len;
143cdf0e10cSrcweir return result;
144cdf0e10cSrcweir }
145cdf0e10cSrcweir
146cdf0e10cSrcweir sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, sal_False);
147cdf0e10cSrcweir
148cdf0e10cSrcweir // if some spaces are skiped, and the script type is Asian with no CJK rLocale, we have to return
149cdf0e10cSrcweir // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
150cdf0e10cSrcweir result.startPos = nPos;
151cdf0e10cSrcweir if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
152cdf0e10cSrcweir result.endPos = -1;
153cdf0e10cSrcweir return result;
154cdf0e10cSrcweir }
155cdf0e10cSrcweir
156cdf0e10cSrcweir return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
157cdf0e10cSrcweir }
158cdf0e10cSrcweir
159cdf0e10cSrcweir
getWordBoundary(const OUString & Text,sal_Int32 nPos,const Locale & rLocale,sal_Int16 rWordType,sal_Bool bDirection)160cdf0e10cSrcweir Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
161cdf0e10cSrcweir sal_Int16 rWordType, sal_Bool bDirection ) throw(RuntimeException)
162cdf0e10cSrcweir {
163cdf0e10cSrcweir sal_Int32 len = Text.getLength();
164cdf0e10cSrcweir if( nPos < 0 || len == 0 )
165cdf0e10cSrcweir result.endPos = result.startPos = 0;
166cdf0e10cSrcweir else if (nPos > len)
167cdf0e10cSrcweir result.endPos = result.startPos = len;
168cdf0e10cSrcweir else {
169cdf0e10cSrcweir sal_Int32 next, prev;
170cdf0e10cSrcweir next = skipSpace(Text, nPos, len, rWordType, sal_True);
171cdf0e10cSrcweir prev = skipSpace(Text, nPos, len, rWordType, sal_False);
172cdf0e10cSrcweir if (prev == 0 && next == len) {
173cdf0e10cSrcweir result.endPos = result.startPos = nPos;
174cdf0e10cSrcweir } else if (prev == 0 && ! bDirection) {
175cdf0e10cSrcweir result.endPos = result.startPos = 0;
176cdf0e10cSrcweir } else if (next == len && bDirection) {
177cdf0e10cSrcweir result.endPos = result.startPos = len;
178cdf0e10cSrcweir } else {
179cdf0e10cSrcweir if (next != prev) {
180cdf0e10cSrcweir if (next == nPos && next != len)
181cdf0e10cSrcweir bDirection = sal_True;
182cdf0e10cSrcweir else if (prev == nPos && prev != 0)
183cdf0e10cSrcweir bDirection = sal_False;
184cdf0e10cSrcweir else
185cdf0e10cSrcweir nPos = bDirection ? next : prev;
186cdf0e10cSrcweir }
187cdf0e10cSrcweir result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
188cdf0e10cSrcweir }
189cdf0e10cSrcweir }
190cdf0e10cSrcweir return result;
191cdf0e10cSrcweir }
192cdf0e10cSrcweir
isBeginWord(const OUString & Text,sal_Int32 nPos,const Locale & rLocale,sal_Int16 rWordType)193cdf0e10cSrcweir sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
194cdf0e10cSrcweir const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
195cdf0e10cSrcweir {
196cdf0e10cSrcweir sal_Int32 len = Text.getLength();
197cdf0e10cSrcweir
198cdf0e10cSrcweir if (nPos < 0 || nPos >= len) return sal_False;
199cdf0e10cSrcweir
200cdf0e10cSrcweir sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_True);
201cdf0e10cSrcweir
202cdf0e10cSrcweir if (tmp != nPos) return sal_False;
203cdf0e10cSrcweir
204cdf0e10cSrcweir result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_True);
205cdf0e10cSrcweir
206cdf0e10cSrcweir return result.startPos == nPos;
207cdf0e10cSrcweir }
208cdf0e10cSrcweir
isEndWord(const OUString & Text,sal_Int32 nPos,const Locale & rLocale,sal_Int16 rWordType)209cdf0e10cSrcweir sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
210cdf0e10cSrcweir const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
211cdf0e10cSrcweir {
212cdf0e10cSrcweir sal_Int32 len = Text.getLength();
213cdf0e10cSrcweir
214cdf0e10cSrcweir if (nPos <= 0 || nPos > len) return sal_False;
215cdf0e10cSrcweir
216cdf0e10cSrcweir sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_False);
217cdf0e10cSrcweir
218cdf0e10cSrcweir if (tmp != nPos) return sal_False;
219cdf0e10cSrcweir
220cdf0e10cSrcweir result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_False);
221cdf0e10cSrcweir
222cdf0e10cSrcweir return result.endPos == nPos;
223cdf0e10cSrcweir }
224cdf0e10cSrcweir
beginOfSentence(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale)225cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
226cdf0e10cSrcweir const Locale &rLocale ) throw(RuntimeException)
227cdf0e10cSrcweir {
228cdf0e10cSrcweir if (nStartPos < 0 || nStartPos > Text.getLength())
229cdf0e10cSrcweir return -1;
230cdf0e10cSrcweir if (Text.getLength() == 0) return 0;
231cdf0e10cSrcweir return LBI->beginOfSentence(Text, nStartPos, rLocale);
232cdf0e10cSrcweir }
233cdf0e10cSrcweir
endOfSentence(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale)234cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
235cdf0e10cSrcweir const Locale &rLocale ) throw(RuntimeException)
236cdf0e10cSrcweir {
237cdf0e10cSrcweir if (nStartPos < 0 || nStartPos > Text.getLength())
238cdf0e10cSrcweir return -1;
239cdf0e10cSrcweir if (Text.getLength() == 0) return 0;
240cdf0e10cSrcweir return LBI->endOfSentence(Text, nStartPos, rLocale);
241cdf0e10cSrcweir }
242cdf0e10cSrcweir
getLineBreak(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int32 nMinBreakPos,const LineBreakHyphenationOptions & hOptions,const LineBreakUserOptions & bOptions)243cdf0e10cSrcweir LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
244cdf0e10cSrcweir const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
245cdf0e10cSrcweir const LineBreakUserOptions& bOptions ) throw(RuntimeException)
246cdf0e10cSrcweir {
247cdf0e10cSrcweir return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
248cdf0e10cSrcweir }
249cdf0e10cSrcweir
getScriptType(const OUString & Text,sal_Int32 nPos)250cdf0e10cSrcweir sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
251cdf0e10cSrcweir throw(RuntimeException)
252cdf0e10cSrcweir {
253cdf0e10cSrcweir return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
254cdf0e10cSrcweir getScriptClass(Text.iterateCodePoints(&nPos, 0));
255cdf0e10cSrcweir }
256cdf0e10cSrcweir
257cdf0e10cSrcweir
258cdf0e10cSrcweir /** Increments/decrements position first, then obtains character.
259cdf0e10cSrcweir @return current position, may be -1 or text length if string was consumed.
260cdf0e10cSrcweir */
iterateCodePoints(const OUString & Text,sal_Int32 & nStartPos,sal_Int32 inc,sal_uInt32 & ch)261cdf0e10cSrcweir static sal_Int32 SAL_CALL iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
262cdf0e10cSrcweir sal_Int32 nLen = Text.getLength();
263cdf0e10cSrcweir if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
264cdf0e10cSrcweir ch = 0;
265cdf0e10cSrcweir nStartPos = nStartPos + inc < 0 ? -1 : nLen;
266cdf0e10cSrcweir } else {
267cdf0e10cSrcweir ch = Text.iterateCodePoints(&nStartPos, inc);
268cdf0e10cSrcweir // Fix for #i80436#.
269cdf0e10cSrcweir // erAck: 2009-06-30T21:52+0200 This logic looks somewhat
270cdf0e10cSrcweir // suspicious as if it cures a symptom.. anyway, had to add
271cdf0e10cSrcweir // nStartPos < Text.getLength() to silence the (correct) assertion
272cdf0e10cSrcweir // in rtl_uString_iterateCodePoints() if Text was one character
273cdf0e10cSrcweir // (codepoint) only, made up of a surrogate pair.
274cdf0e10cSrcweir //if (inc > 0 && nStartPos < Text.getLength())
275cdf0e10cSrcweir // ch = Text.iterateCodePoints(&nStartPos, 0);
276cdf0e10cSrcweir // With surrogates, nStartPos may actually point behind string
277cdf0e10cSrcweir // now, even if inc is only +1
278cdf0e10cSrcweir if (inc > 0)
279cdf0e10cSrcweir ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
280cdf0e10cSrcweir }
281cdf0e10cSrcweir return nStartPos;
282cdf0e10cSrcweir }
283cdf0e10cSrcweir
284cdf0e10cSrcweir
beginOfScript(const OUString & Text,sal_Int32 nStartPos,sal_Int16 ScriptType)285cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
286cdf0e10cSrcweir sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
287cdf0e10cSrcweir {
288cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength())
289cdf0e10cSrcweir return -1;
290cdf0e10cSrcweir
291cdf0e10cSrcweir if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
292cdf0e10cSrcweir return -1;
293cdf0e10cSrcweir
294cdf0e10cSrcweir if (nStartPos == 0) return 0;
295cdf0e10cSrcweir sal_uInt32 ch=0;
296cdf0e10cSrcweir while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
297cdf0e10cSrcweir if (nStartPos == 0) return 0;
298cdf0e10cSrcweir }
299cdf0e10cSrcweir
300cdf0e10cSrcweir return iterateCodePoints(Text, nStartPos, 1, ch);
301cdf0e10cSrcweir }
302cdf0e10cSrcweir
endOfScript(const OUString & Text,sal_Int32 nStartPos,sal_Int16 ScriptType)303cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
304cdf0e10cSrcweir sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
305cdf0e10cSrcweir {
306cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength())
307cdf0e10cSrcweir return -1;
308cdf0e10cSrcweir
309cdf0e10cSrcweir if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
310cdf0e10cSrcweir return -1;
311cdf0e10cSrcweir
312cdf0e10cSrcweir sal_Int32 strLen = Text.getLength();
313cdf0e10cSrcweir sal_uInt32 ch=0;
314cdf0e10cSrcweir while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
315cdf0e10cSrcweir sal_Int16 currentCharScriptType = getScriptClass(ch);
316cdf0e10cSrcweir if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
317cdf0e10cSrcweir break;
318cdf0e10cSrcweir }
319cdf0e10cSrcweir return nStartPos;
320cdf0e10cSrcweir }
321cdf0e10cSrcweir
previousScript(const OUString & Text,sal_Int32 nStartPos,sal_Int16 ScriptType)322cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
323cdf0e10cSrcweir sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
324cdf0e10cSrcweir {
325cdf0e10cSrcweir if (nStartPos < 0)
326cdf0e10cSrcweir return -1;
327cdf0e10cSrcweir if (nStartPos > Text.getLength())
328cdf0e10cSrcweir nStartPos = Text.getLength();
329cdf0e10cSrcweir
330cdf0e10cSrcweir sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
331cdf0e10cSrcweir
332cdf0e10cSrcweir sal_uInt32 ch=0;
333cdf0e10cSrcweir while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
334cdf0e10cSrcweir if ((((numberOfChange % 2) == 0) ^ (ScriptType != getScriptClass(ch))))
335cdf0e10cSrcweir numberOfChange--;
336cdf0e10cSrcweir else if (nStartPos == 0) {
337cdf0e10cSrcweir if (numberOfChange > 0)
338cdf0e10cSrcweir numberOfChange--;
339cdf0e10cSrcweir if (nStartPos > 0)
340cdf0e10cSrcweir Text.iterateCodePoints(&nStartPos, -1);
341cdf0e10cSrcweir else
342cdf0e10cSrcweir return -1;
343cdf0e10cSrcweir }
344cdf0e10cSrcweir }
345cdf0e10cSrcweir return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
346cdf0e10cSrcweir }
347cdf0e10cSrcweir
nextScript(const OUString & Text,sal_Int32 nStartPos,sal_Int16 ScriptType)348cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
349cdf0e10cSrcweir sal_Int16 ScriptType ) throw(RuntimeException)
350cdf0e10cSrcweir
351cdf0e10cSrcweir {
352cdf0e10cSrcweir if (nStartPos < 0)
353cdf0e10cSrcweir nStartPos = 0;
354cdf0e10cSrcweir sal_Int32 strLen = Text.getLength();
355cdf0e10cSrcweir if (nStartPos > strLen)
356cdf0e10cSrcweir return -1;
357cdf0e10cSrcweir
358cdf0e10cSrcweir sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
359cdf0e10cSrcweir
360cdf0e10cSrcweir sal_uInt32 ch=0;
361cdf0e10cSrcweir while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
362cdf0e10cSrcweir sal_Int16 currentCharScriptType = getScriptClass(ch);
363cdf0e10cSrcweir if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
364cdf0e10cSrcweir (ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
365cdf0e10cSrcweir numberOfChange--;
366cdf0e10cSrcweir }
367cdf0e10cSrcweir return numberOfChange == 0 ? nStartPos : -1;
368cdf0e10cSrcweir }
369cdf0e10cSrcweir
beginOfCharBlock(const OUString & Text,sal_Int32 nStartPos,const Locale &,sal_Int16 CharType)370cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
371cdf0e10cSrcweir const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
372cdf0e10cSrcweir {
373cdf0e10cSrcweir if (CharType == CharType::ANY_CHAR) return 0;
374cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
375cdf0e10cSrcweir if (CharType != (sal_Int16)u_charType( Text.iterateCodePoints(&nStartPos, 0))) return -1;
376cdf0e10cSrcweir
377cdf0e10cSrcweir sal_Int32 nPos=nStartPos;
378cdf0e10cSrcweir while(nStartPos > 0 && CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nPos, -1))) { nStartPos=nPos; }
379cdf0e10cSrcweir return nStartPos; // begin of char block is inclusive
380cdf0e10cSrcweir }
381cdf0e10cSrcweir
endOfCharBlock(const OUString & Text,sal_Int32 nStartPos,const Locale &,sal_Int16 CharType)382cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
383cdf0e10cSrcweir const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
384cdf0e10cSrcweir {
385cdf0e10cSrcweir sal_Int32 strLen = Text.getLength();
386cdf0e10cSrcweir
387cdf0e10cSrcweir if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
388cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= strLen) return -1;
389cdf0e10cSrcweir if (CharType != (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) return -1;
390cdf0e10cSrcweir
391cdf0e10cSrcweir sal_uInt32 ch=0;
392cdf0e10cSrcweir while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == (sal_Int16)u_charType(ch)) {}
393cdf0e10cSrcweir return nStartPos; // end of char block is exclusive
394cdf0e10cSrcweir }
395cdf0e10cSrcweir
nextCharBlock(const OUString & Text,sal_Int32 nStartPos,const Locale &,sal_Int16 CharType)396cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
397cdf0e10cSrcweir const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
398cdf0e10cSrcweir {
399cdf0e10cSrcweir if (CharType == CharType::ANY_CHAR) return -1;
400cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
401cdf0e10cSrcweir
402cdf0e10cSrcweir sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
403cdf0e10cSrcweir sal_Int32 strLen = Text.getLength();
404cdf0e10cSrcweir
405cdf0e10cSrcweir sal_uInt32 ch=0;
406cdf0e10cSrcweir while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
407cdf0e10cSrcweir if ((CharType != (sal_Int16)u_charType(ch)) ^ (numberOfChange == 1))
408cdf0e10cSrcweir numberOfChange--;
409cdf0e10cSrcweir }
410cdf0e10cSrcweir return numberOfChange == 0 ? nStartPos : -1;
411cdf0e10cSrcweir }
412cdf0e10cSrcweir
previousCharBlock(const OUString & Text,sal_Int32 nStartPos,const Locale &,sal_Int16 CharType)413cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
414cdf0e10cSrcweir const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
415cdf0e10cSrcweir {
416cdf0e10cSrcweir if(CharType == CharType::ANY_CHAR) return -1;
417cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
418cdf0e10cSrcweir
419cdf0e10cSrcweir sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
420cdf0e10cSrcweir
421cdf0e10cSrcweir sal_uInt32 ch=0;
422cdf0e10cSrcweir while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
423cdf0e10cSrcweir if (((numberOfChange % 2) == 0) ^ (CharType != (sal_Int16)u_charType(ch)))
424cdf0e10cSrcweir numberOfChange--;
425cdf0e10cSrcweir if (nStartPos == 0 && numberOfChange > 0) {
426cdf0e10cSrcweir numberOfChange--;
427cdf0e10cSrcweir if (numberOfChange == 0) return nStartPos;
428cdf0e10cSrcweir }
429cdf0e10cSrcweir }
430cdf0e10cSrcweir return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
431cdf0e10cSrcweir }
432cdf0e10cSrcweir
433cdf0e10cSrcweir
434cdf0e10cSrcweir
getWordType(const OUString &,sal_Int32,const Locale &)435cdf0e10cSrcweir sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
436cdf0e10cSrcweir sal_Int32 /*nPos*/, const Locale& /*rLocale*/ ) throw(RuntimeException)
437cdf0e10cSrcweir {
438cdf0e10cSrcweir return 0;
439cdf0e10cSrcweir }
440cdf0e10cSrcweir
441cdf0e10cSrcweir typedef struct {
442cdf0e10cSrcweir UBlockCode from;
443cdf0e10cSrcweir UBlockCode to;
444cdf0e10cSrcweir sal_Int16 script;
445cdf0e10cSrcweir } UBlock2Script;
446cdf0e10cSrcweir
447cdf0e10cSrcweir // for a list of the UBLOCK_... values see:
448cdf0e10cSrcweir // http://icu-project.org/apiref/icu4c/uchar_8h.html
449cdf0e10cSrcweir // where enum UBlockCode is defined.
450cdf0e10cSrcweir // See also http://www.unicode.org/charts/ for general reference
451cdf0e10cSrcweir static UBlock2Script scriptList[] = {
452cdf0e10cSrcweir {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
453cdf0e10cSrcweir {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN},
454cdf0e10cSrcweir {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
455cdf0e10cSrcweir {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
456cdf0e10cSrcweir {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
457cdf0e10cSrcweir {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
458cdf0e10cSrcweir {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
459cdf0e10cSrcweir {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
460cdf0e10cSrcweir {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
461cdf0e10cSrcweir {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
462cdf0e10cSrcweir {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
463cdf0e10cSrcweir {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
464cdf0e10cSrcweir {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
465cdf0e10cSrcweir {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
466cdf0e10cSrcweir {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
467cdf0e10cSrcweir {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
468cdf0e10cSrcweir {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
469cdf0e10cSrcweir {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
470cdf0e10cSrcweir };
471cdf0e10cSrcweir
472cdf0e10cSrcweir #define scriptListCount sizeof (scriptList) / sizeof (UBlock2Script)
473cdf0e10cSrcweir
getScriptClass(sal_uInt32 currentChar)474cdf0e10cSrcweir sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
475cdf0e10cSrcweir {
476cdf0e10cSrcweir static sal_uInt32 lastChar = 0;
477cdf0e10cSrcweir static sal_Int16 nRet = 0;
478cdf0e10cSrcweir
479cdf0e10cSrcweir if (currentChar != lastChar) {
480cdf0e10cSrcweir lastChar = currentChar;
481cdf0e10cSrcweir
482cdf0e10cSrcweir //JP 21.9.2001: handle specific characters - always as weak
483cdf0e10cSrcweir // definition of 1 - this breaks a word
484cdf0e10cSrcweir // 2 - this can be inside a word
485cdf0e10cSrcweir // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
486cdf0e10cSrcweir if( 1 == currentChar || 2 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
487cdf0e10cSrcweir nRet = ScriptType::WEAK;
488cdf0e10cSrcweir // workaround for Coptic
489cdf0e10cSrcweir else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
490cdf0e10cSrcweir nRet = ScriptType::LATIN;
491cdf0e10cSrcweir // work-around for ligatures (see http://www.unicode.org/charts/PDF/UFB00.pdf)
492cdf0e10cSrcweir else if ((0xFB00 <= currentChar && currentChar <= 0xFB06) ||
493cdf0e10cSrcweir (0xFB13 <= currentChar && currentChar <= 0xFB17))
494cdf0e10cSrcweir nRet = ScriptType::LATIN;
495cdf0e10cSrcweir else {
496cdf0e10cSrcweir UBlockCode block=ublock_getCode(currentChar);
497cdf0e10cSrcweir sal_uInt16 i;
498cdf0e10cSrcweir for ( i = 0; i < scriptListCount; i++) {
499cdf0e10cSrcweir if (block <= scriptList[i].to) break;
500cdf0e10cSrcweir }
501cdf0e10cSrcweir nRet=(i < scriptListCount && block >= scriptList[i].from) ? scriptList[i].script : ScriptType::WEAK;
502cdf0e10cSrcweir }
503cdf0e10cSrcweir }
504cdf0e10cSrcweir return nRet;
505cdf0e10cSrcweir }
506cdf0e10cSrcweir
operator ==(const Locale & l1,const Locale & l2)507cdf0e10cSrcweir static inline sal_Bool operator == (const Locale& l1, const Locale& l2) {
508cdf0e10cSrcweir return l1.Language == l2.Language && l1.Country == l2.Country && l1.Variant == l2.Variant;
509cdf0e10cSrcweir }
510cdf0e10cSrcweir
createLocaleSpecificBreakIterator(const OUString & aLocaleName)511cdf0e10cSrcweir sal_Bool SAL_CALL BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUString& aLocaleName) throw( RuntimeException )
512cdf0e10cSrcweir {
513cdf0e10cSrcweir // to share service between same Language but different Country code, like zh_CN and zh_TW
514cdf0e10cSrcweir for (size_t l = 0; l < lookupTable.size(); l++) {
515cdf0e10cSrcweir lookupTableItem *listItem = lookupTable[l];
516cdf0e10cSrcweir if (aLocaleName == listItem->aLocale.Language) {
517cdf0e10cSrcweir xBI = listItem->xBI;
518cdf0e10cSrcweir return sal_True;
519cdf0e10cSrcweir }
520cdf0e10cSrcweir }
521cdf0e10cSrcweir
522cdf0e10cSrcweir Reference < uno::XInterface > xI = xMSF->createInstance(
523cdf0e10cSrcweir OUString::createFromAscii("com.sun.star.i18n.BreakIterator_") + aLocaleName);
524cdf0e10cSrcweir
525cdf0e10cSrcweir if ( xI.is() ) {
526cdf0e10cSrcweir xI->queryInterface( getCppuType((const Reference< XBreakIterator>*)0) ) >>= xBI;
527cdf0e10cSrcweir if (xBI.is()) {
528cdf0e10cSrcweir lookupTable.push_back(new lookupTableItem(Locale(aLocaleName, aLocaleName, aLocaleName), xBI));
529cdf0e10cSrcweir return sal_True;
530cdf0e10cSrcweir }
531cdf0e10cSrcweir }
532cdf0e10cSrcweir return sal_False;
533cdf0e10cSrcweir }
534cdf0e10cSrcweir
535cdf0e10cSrcweir Reference < XBreakIterator > SAL_CALL
getLocaleSpecificBreakIterator(const Locale & rLocale)536cdf0e10cSrcweir BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale) throw (RuntimeException)
537cdf0e10cSrcweir {
538cdf0e10cSrcweir if (xBI.is() && rLocale == aLocale)
539cdf0e10cSrcweir return xBI;
540cdf0e10cSrcweir else if (xMSF.is()) {
541cdf0e10cSrcweir aLocale = rLocale;
542cdf0e10cSrcweir
543cdf0e10cSrcweir for (size_t i = 0; i < lookupTable.size(); i++) {
544cdf0e10cSrcweir lookupTableItem *listItem = lookupTable[i];
545cdf0e10cSrcweir if (rLocale == listItem->aLocale)
546cdf0e10cSrcweir return xBI = listItem->xBI;
547cdf0e10cSrcweir }
548cdf0e10cSrcweir
549cdf0e10cSrcweir sal_Unicode under = (sal_Unicode)'_';
550cdf0e10cSrcweir
551cdf0e10cSrcweir sal_Int32 l = rLocale.Language.getLength();
552cdf0e10cSrcweir sal_Int32 c = rLocale.Country.getLength();
553cdf0e10cSrcweir sal_Int32 v = rLocale.Variant.getLength();
554cdf0e10cSrcweir OUStringBuffer aBuf(l+c+v+3);
555cdf0e10cSrcweir
556cdf0e10cSrcweir if ((l > 0 && c > 0 && v > 0 &&
557cdf0e10cSrcweir // load service with name <base>_<lang>_<country>_<varian>
558cdf0e10cSrcweir createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
559cdf0e10cSrcweir rLocale.Country).append(under).append(rLocale.Variant).makeStringAndClear())) ||
560cdf0e10cSrcweir (l > 0 && c > 0 &&
561cdf0e10cSrcweir // load service with name <base>_<lang>_<country>
562cdf0e10cSrcweir createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
563cdf0e10cSrcweir rLocale.Country).makeStringAndClear())) ||
564cdf0e10cSrcweir (l > 0 && c > 0 && rLocale.Language.compareToAscii("zh") == 0 &&
565cdf0e10cSrcweir (rLocale.Country.compareToAscii("HK") == 0 ||
566cdf0e10cSrcweir rLocale.Country.compareToAscii("MO") == 0) &&
567cdf0e10cSrcweir // if the country code is HK or MO, one more step to try TW.
568cdf0e10cSrcweir createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).appendAscii(
569cdf0e10cSrcweir "TW").makeStringAndClear())) ||
570cdf0e10cSrcweir (l > 0 &&
571cdf0e10cSrcweir // load service with name <base>_<lang>
572cdf0e10cSrcweir createLocaleSpecificBreakIterator(rLocale.Language)) ||
573cdf0e10cSrcweir // load default service with name <base>_Unicode
574cdf0e10cSrcweir createLocaleSpecificBreakIterator(OUString::createFromAscii("Unicode"))) {
575cdf0e10cSrcweir lookupTable.push_back( new lookupTableItem(aLocale, xBI) );
576cdf0e10cSrcweir return xBI;
577cdf0e10cSrcweir }
578cdf0e10cSrcweir }
579cdf0e10cSrcweir throw RuntimeException();
580cdf0e10cSrcweir }
581cdf0e10cSrcweir
582cdf0e10cSrcweir const sal_Char cBreakIterator[] = "com.sun.star.i18n.BreakIterator";
583cdf0e10cSrcweir
584cdf0e10cSrcweir OUString SAL_CALL
getImplementationName(void)585cdf0e10cSrcweir BreakIteratorImpl::getImplementationName(void) throw( RuntimeException )
586cdf0e10cSrcweir {
587cdf0e10cSrcweir return OUString::createFromAscii(cBreakIterator);
588cdf0e10cSrcweir }
589cdf0e10cSrcweir
590cdf0e10cSrcweir sal_Bool SAL_CALL
supportsService(const OUString & rServiceName)591cdf0e10cSrcweir BreakIteratorImpl::supportsService(const OUString& rServiceName) throw( RuntimeException )
592cdf0e10cSrcweir {
593cdf0e10cSrcweir return !rServiceName.compareToAscii(cBreakIterator);
594cdf0e10cSrcweir }
595cdf0e10cSrcweir
596cdf0e10cSrcweir Sequence< OUString > SAL_CALL
getSupportedServiceNames(void)597cdf0e10cSrcweir BreakIteratorImpl::getSupportedServiceNames(void) throw( RuntimeException )
598cdf0e10cSrcweir {
599cdf0e10cSrcweir Sequence< OUString > aRet(1);
600cdf0e10cSrcweir aRet[0] = OUString::createFromAscii(cBreakIterator);
601cdf0e10cSrcweir return aRet;
602cdf0e10cSrcweir }
603cdf0e10cSrcweir
604cdf0e10cSrcweir } } } }
605cdf0e10cSrcweir
606