1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26 #include <breakiterator_unicode.hxx>
27 #include <localedata.hxx>
28 #include <unicode/uchar.h>
29 #include <unicode/locid.h>
30 #include <unicode/rbbi.h>
31 #include <unicode/udata.h>
32 #include <rtl/strbuf.hxx>
33 #include <rtl/ustring.hxx>
34
35 U_CDECL_BEGIN
36 extern const char OpenOffice_dat[];
37 U_CDECL_END
38
39 using namespace ::com::sun::star;
40 using namespace ::com::sun::star::lang;
41 using namespace ::rtl;
42
43 namespace com { namespace sun { namespace star { namespace i18n {
44
45 #define ERROR ::com::sun::star::uno::RuntimeException()
46
47 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
48
49
BreakIterator_Unicode()50 BreakIterator_Unicode::BreakIterator_Unicode() :
51 cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
52 wordRule( "word" ),
53 lineRule( "line" ),
54 result(),
55 character(),
56 word(),
57 sentence(),
58 line(),
59 icuBI( NULL ),
60 aLocale(),
61 aBreakType(),
62 aWordType()
63 {
64 }
65
66
~BreakIterator_Unicode()67 BreakIterator_Unicode::~BreakIterator_Unicode()
68 {
69 if (icuBI && icuBI->aBreakIterator) {
70 delete icuBI->aBreakIterator;
71 icuBI->aBreakIterator=NULL;
72 }
73 if (character.aBreakIterator) delete character.aBreakIterator;
74 if (word.aBreakIterator) delete word.aBreakIterator;
75 if (sentence.aBreakIterator) delete sentence.aBreakIterator;
76 if (line.aBreakIterator) delete line.aBreakIterator;
77 }
78
79 /*
80 Wrapper class to provide public access to the RuleBasedBreakIterator's
81 setbreakType method.
82 */
83 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
84 public:
publicSetBreakType(int32_t type)85 inline void publicSetBreakType(int32_t type) {
86 setBreakType(type);
87 };
OOoRuleBasedBreakIterator(UDataMemory * image,UErrorCode & status)88 OOoRuleBasedBreakIterator(UDataMemory* image,
89 UErrorCode &status) :
90 RuleBasedBreakIterator(image, status) { };
91
92 };
93
94 // loading ICU breakiterator on demand.
loadICUBreakIterator(const com::sun::star::lang::Locale & rLocale,sal_Int16 rBreakType,sal_Int16 rWordType,const sal_Char * rule,const OUString & rText)95 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
96 sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
97 {
98 sal_Bool newBreak = sal_False;
99 UErrorCode status = U_ZERO_ERROR;
100 sal_Int16 breakType = 0;
101 switch (rBreakType) {
102 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
103 case LOAD_WORD_BREAKITERATOR: icuBI=&word;
104 switch (rWordType) {
105 case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
106 case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
107 case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
108 }
109 break;
110 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
111 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
112 }
113 if (!icuBI->aBreakIterator || rWordType != aWordType ||
114 rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
115 rLocale.Variant != aLocale.Variant) {
116 if (icuBI->aBreakIterator) {
117 delete icuBI->aBreakIterator;
118 icuBI->aBreakIterator=NULL;
119 }
120 if (rule) {
121 uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
122
123 status = U_ZERO_ERROR;
124 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
125 if ( !U_SUCCESS(status) ) throw ERROR;
126
127 OOoRuleBasedBreakIterator *rbi = NULL;
128
129 if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) {
130 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
131 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
132 } else {
133 status = U_ZERO_ERROR;
134 OStringBuffer aUDName(64);
135 aUDName.append(rule);
136 aUDName.append('_');
137 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
138 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
139 if( U_SUCCESS(status) )
140 rbi = new OOoRuleBasedBreakIterator( pUData, status);
141 if (!U_SUCCESS(status) ) {
142 status = U_ZERO_ERROR;
143 pUData = udata_open("OpenOffice", "brk", rule, &status);
144 if( U_SUCCESS(status) )
145 rbi = new OOoRuleBasedBreakIterator( pUData, status);
146 if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
147 }
148 }
149 if (rbi) {
150 switch (rBreakType) {
151 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
152 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
153 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
154 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
155 }
156 icuBI->aBreakIterator = rbi;
157 }
158 }
159
160 if (!icuBI->aBreakIterator) {
161 icu::Locale icuLocale(
162 OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
163 OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
164 OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
165
166 status = U_ZERO_ERROR;
167 switch (rBreakType) {
168 case LOAD_CHARACTER_BREAKITERATOR:
169 icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status);
170 break;
171 case LOAD_WORD_BREAKITERATOR:
172 icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status);
173 break;
174 case LOAD_SENTENCE_BREAKITERATOR:
175 icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
176 break;
177 case LOAD_LINE_BREAKITERATOR:
178 icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
179 break;
180 }
181 if ( !U_SUCCESS(status) ) {
182 icuBI->aBreakIterator=NULL;
183 throw ERROR;
184 }
185 }
186 if (icuBI->aBreakIterator) {
187 aLocale=rLocale;
188 aWordType=rWordType;
189 aBreakType=rBreakType;
190 newBreak=sal_True;
191 } else {
192 throw ERROR;
193 }
194 }
195
196 if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) { // UChar != sal_Unicode in MinGW
197 icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());
198 icuBI->aBreakIterator->setText(icuBI->aICUText);
199 }
200 }
201
202
nextCharacters(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)203 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
204 sal_Int32 nStartPos, const lang::Locale &rLocale,
205 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
206 throw(uno::RuntimeException)
207 {
208 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
209 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
210 for (nDone = 0; nDone < nCount; nDone++) {
211 nStartPos = character.aBreakIterator->following(nStartPos);
212 if (nStartPos == BreakIterator::DONE)
213 return Text.getLength();
214 }
215 } else { // for CHARACTER mode
216 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
217 Text.iterateCodePoints(&nStartPos, 1);
218 }
219 return nStartPos;
220 }
221
previousCharacters(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)222 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
223 sal_Int32 nStartPos, const lang::Locale& rLocale,
224 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
225 throw(uno::RuntimeException)
226 {
227 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
228 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
229 for (nDone = 0; nDone < nCount; nDone++) {
230 nStartPos = character.aBreakIterator->preceding(nStartPos);
231 if (nStartPos == BreakIterator::DONE)
232 return 0;
233 }
234 } else { // for BS to delete one char and CHARACTER mode.
235 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
236 Text.iterateCodePoints(&nStartPos, -1);
237 }
238 return nStartPos;
239 }
240
241
nextWord(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 rWordType)242 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
243 const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
244 {
245 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
246
247 result.startPos = word.aBreakIterator->following(nStartPos);
248 if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
249 result.endPos = result.startPos;
250 else {
251 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
252 rWordType == WordType::DICTIONARY_WORD ) &&
253 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
254 result.startPos = word.aBreakIterator->following(result.startPos);
255
256 result.endPos = word.aBreakIterator->following(result.startPos);
257 if(result.endPos == BreakIterator::DONE)
258 result.endPos = result.startPos;
259 }
260 return result;
261 }
262
263
previousWord(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 rWordType)264 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
265 const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
266 {
267 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
268
269 result.startPos = word.aBreakIterator->preceding(nStartPos);
270 if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
271 result.endPos = result.startPos;
272 else {
273 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
274 rWordType == WordType::DICTIONARY_WORD) &&
275 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
276 result.startPos = word.aBreakIterator->preceding(result.startPos);
277
278 result.endPos = word.aBreakIterator->following(result.startPos);
279 if(result.endPos == BreakIterator::DONE)
280 result.endPos = result.startPos;
281 }
282 return result;
283 }
284
285
getWordBoundary(const OUString & Text,sal_Int32 nPos,const lang::Locale & rLocale,sal_Int16 rWordType,sal_Bool bDirection)286 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
287 sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
288 {
289 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
290 sal_Int32 len = Text.getLength();
291
292 if(word.aBreakIterator->isBoundary(nPos)) {
293 result.startPos = result.endPos = nPos;
294 if((bDirection || nPos == 0) && nPos < len) //forward
295 result.endPos = word.aBreakIterator->following(nPos);
296 else
297 result.startPos = word.aBreakIterator->preceding(nPos);
298 } else {
299 if(nPos <= 0) {
300 result.startPos = 0;
301 result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0;
302 } else if(nPos >= len) {
303 result.startPos = word.aBreakIterator->preceding(len);
304 result.endPos = len;
305 } else {
306 result.startPos = word.aBreakIterator->preceding(nPos);
307 result.endPos = word.aBreakIterator->following(nPos);
308 }
309 }
310 if (result.startPos == BreakIterator::DONE)
311 result.startPos = result.endPos;
312 else if (result.endPos == BreakIterator::DONE)
313 result.endPos = result.startPos;
314
315 return result;
316 }
317
318
beginOfSentence(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale)319 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
320 const lang::Locale &rLocale ) throw(uno::RuntimeException)
321 {
322 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
323
324 sal_Int32 len = Text.getLength();
325 if (len > 0 && nStartPos == len)
326 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
327 if (!sentence.aBreakIterator->isBoundary(nStartPos))
328 nStartPos = sentence.aBreakIterator->preceding(nStartPos);
329
330 // skip preceding space.
331 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
332 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
333 Text.iterateCodePoints(&nStartPos, -1);
334
335 return nStartPos;
336 }
337
endOfSentence(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale)338 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
339 const lang::Locale &rLocale ) throw(uno::RuntimeException)
340 {
341 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
342
343 sal_Int32 len = Text.getLength();
344 if (len > 0 && nStartPos == len)
345 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
346 nStartPos = sentence.aBreakIterator->following(nStartPos);
347
348 sal_Int32 nPos=nStartPos;
349 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
350
351 return nStartPos;
352 }
353
getLineBreak(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int32 nMinBreakPos,const LineBreakHyphenationOptions & hOptions,const LineBreakUserOptions &)354 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
355 const OUString& Text, sal_Int32 nStartPos,
356 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
357 const LineBreakHyphenationOptions& hOptions,
358 const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
359 {
360 LineBreakResults lbr;
361
362 if (nStartPos >= Text.getLength()) {
363 lbr.breakIndex = Text.getLength();
364 lbr.breakType = BreakType::WORDBOUNDARY;
365 return lbr;
366 }
367
368 loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
369
370 sal_Bool GlueSpace=sal_True;
371 while (GlueSpace) {
372 if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
373 lbr.breakIndex = nStartPos;
374 lbr.breakType = BreakType::WORDBOUNDARY;
375 } else if (hOptions.rHyphenator.is()) { //Hyphenation break
376 Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
377 WordType::DICTIONARY_WORD, false);
378 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
379 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
380 wBoundary.endPos - wBoundary.startPos), rLocale,
381 (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
382 if (aHyphenatedWord.is()) {
383 lbr.rHyphenatedWord = aHyphenatedWord;
384 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
385 lbr.breakIndex = -1;
386 else
387 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
388 lbr.breakType = BreakType::HYPHENATION;
389 } else {
390 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
391 lbr.breakType = BreakType::WORDBOUNDARY;;
392 }
393 } else { //word boundary break
394 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
395 lbr.breakType = BreakType::WORDBOUNDARY;
396 }
397
398 #define WJ 0x2060 // Word Joiner
399 GlueSpace=sal_False;
400 if (lbr.breakType == BreakType::WORDBOUNDARY) {
401 nStartPos = lbr.breakIndex;
402 if (Text[nStartPos--] == WJ)
403 GlueSpace=sal_True;
404 while (nStartPos >= 0 &&
405 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
406 if (Text[nStartPos--] == WJ)
407 GlueSpace=sal_True;
408 }
409 if (GlueSpace && nStartPos < 0) {
410 lbr.breakIndex = 0;
411 break;
412 }
413 }
414 }
415
416 return lbr;
417 }
418
419
420
421 OUString SAL_CALL
getImplementationName(void)422 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
423 {
424 return OUString::createFromAscii(cBreakIterator);
425 }
426
427 sal_Bool SAL_CALL
supportsService(const OUString & rServiceName)428 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
429 {
430 return !rServiceName.compareToAscii(cBreakIterator);
431 }
432
433 uno::Sequence< OUString > SAL_CALL
getSupportedServiceNames(void)434 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
435 {
436 uno::Sequence< OUString > aRet(1);
437 aRet[0] = OUString::createFromAscii(cBreakIterator);
438 return aRet;
439 }
440
441 } } } }
442