1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26 #include <breakiterator_unicode.hxx>
27 #include <localedata.hxx>
28 #include <unicode/uchar.h>
29 #include <unicode/locid.h>
30 #include <unicode/rbbi.h>
31 #include <unicode/udata.h>
32 #include <rtl/strbuf.hxx>
33 #include <rtl/ustring.hxx>
34 
// ICU application data blob, defined outside this file with C linkage.
// loadICUBreakIterator() registers it with ICU under the package name
// "OpenOffice" (udata_setAppData) before opening any "brk" rule data.
U_CDECL_BEGIN
extern const char OpenOffice_dat[];
U_CDECL_END
38 
39 using namespace ::com::sun::star;
40 using namespace ::com::sun::star::lang;
41 using namespace ::rtl;
42 
43 namespace com { namespace sun { namespace star { namespace i18n {
44 
// Exception value thrown by this file ("throw ERROR;") when ICU data or a
// break iterator cannot be set up.
#define ERROR ::com::sun::star::uno::RuntimeException()
46 
47 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
48 
49 
/**
 * Creates the break iterator service object without loading any ICU iterator.
 *
 * Sets the implementation name, the default rule names ("word" and "line"),
 * value-initializes the cached result, the four iterator slots and the cache
 * keys (aLocale, aBreakType, aWordType), and leaves icuBI NULL. The ICU
 * iterators themselves are created on demand by loadICUBreakIterator().
 */
BreakIterator_Unicode::BreakIterator_Unicode() :
    cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ),    // implementation name
    wordRule( "word" ),
    lineRule( "line" ),
    result(),
    character(),
    word(),
    sentence(),
    line(),
    icuBI( NULL ),      // no slot selected until the first loadICUBreakIterator() call
    aLocale(),
    aBreakType(),
    aWordType()
{
}
65 
66 
~BreakIterator_Unicode()67 BreakIterator_Unicode::~BreakIterator_Unicode()
68 {
69         if (icuBI && icuBI->aBreakIterator) {
70             delete icuBI->aBreakIterator;
71             icuBI->aBreakIterator=NULL;
72         }
73         if (character.aBreakIterator) delete character.aBreakIterator;
74         if (word.aBreakIterator) delete word.aBreakIterator;
75         if (sentence.aBreakIterator) delete sentence.aBreakIterator;
76         if (line.aBreakIterator) delete line.aBreakIterator;
77 }
78 
/*
	Wrapper class that provides public access to RuleBasedBreakIterator's
	setBreakType() method.
*/
83 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
84 	public:
publicSetBreakType(int32_t type)85 		inline void publicSetBreakType(int32_t type) {
86 			setBreakType(type);
87 		};
OOoRuleBasedBreakIterator(UDataMemory * image,UErrorCode & status)88 		OOoRuleBasedBreakIterator(UDataMemory* image,
89 				UErrorCode &status) :
90 			RuleBasedBreakIterator(image, status) { };
91 
92 };
93 
94 // loading ICU breakiterator on demand.
loadICUBreakIterator(const com::sun::star::lang::Locale & rLocale,sal_Int16 rBreakType,sal_Int16 rWordType,const sal_Char * rule,const OUString & rText)95 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
96         sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
97 {
98     sal_Bool newBreak = sal_False;
99     UErrorCode status = U_ZERO_ERROR;
100     sal_Int16 breakType = 0;
101     switch (rBreakType) {
102         case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
103         case LOAD_WORD_BREAKITERATOR: icuBI=&word;
104             switch (rWordType) {
105                 case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
106                 case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
107                 case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
108             }
109             break;
110         case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
111         case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
112     }
113     if (!icuBI->aBreakIterator || rWordType != aWordType ||
114             rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
115             rLocale.Variant != aLocale.Variant) {
116         if (icuBI->aBreakIterator) {
117             delete icuBI->aBreakIterator;
118             icuBI->aBreakIterator=NULL;
119         }
120         if (rule) {
121             uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
122 
123             status = U_ZERO_ERROR;
124             udata_setAppData("OpenOffice", OpenOffice_dat, &status);
125             if ( !U_SUCCESS(status) ) throw ERROR;
126 
127             OOoRuleBasedBreakIterator *rbi = NULL;
128 
129             if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) {
130                 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
131                     OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
132             } else {
133                 status = U_ZERO_ERROR;
134                 OStringBuffer aUDName(64);
135                 aUDName.append(rule);
136                 aUDName.append('_');
137                 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
138                 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
139                 if( U_SUCCESS(status) )
140                     rbi = new OOoRuleBasedBreakIterator( pUData, status);
141                 if (!U_SUCCESS(status) ) {
142                     status = U_ZERO_ERROR;
143                     pUData = udata_open("OpenOffice", "brk", rule, &status);
144                     if( U_SUCCESS(status) )
145                         rbi = new OOoRuleBasedBreakIterator( pUData, status);
146                     if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
147                 }
148             }
149             if (rbi) {
150                 switch (rBreakType) {
151                     case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
152                     case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
153                     case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
154                     case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
155                 }
156                 icuBI->aBreakIterator = rbi;
157             }
158         }
159 
160         if (!icuBI->aBreakIterator) {
161             icu::Locale icuLocale(
162                     OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
163                     OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
164                     OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
165 
166             status = U_ZERO_ERROR;
167             switch (rBreakType) {
168                 case LOAD_CHARACTER_BREAKITERATOR:
169                     icuBI->aBreakIterator =  icu::BreakIterator::createCharacterInstance(icuLocale, status);
170                     break;
171                 case LOAD_WORD_BREAKITERATOR:
172                     icuBI->aBreakIterator =  icu::BreakIterator::createWordInstance(icuLocale, status);
173                     break;
174                 case LOAD_SENTENCE_BREAKITERATOR:
175                     icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
176                     break;
177                 case LOAD_LINE_BREAKITERATOR:
178                     icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
179                     break;
180             }
181             if ( !U_SUCCESS(status) ) {
182                 icuBI->aBreakIterator=NULL;
183                 throw ERROR;
184             }
185         }
186         if (icuBI->aBreakIterator) {
187             aLocale=rLocale;
188             aWordType=rWordType;
189             aBreakType=rBreakType;
190             newBreak=sal_True;
191         } else {
192             throw ERROR;
193         }
194     }
195 
196     if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) {	// UChar != sal_Unicode in MinGW
197         icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());
198         icuBI->aBreakIterator->setText(icuBI->aICUText);
199     }
200 }
201 
202 
nextCharacters(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)203 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
204         sal_Int32 nStartPos, const lang::Locale &rLocale,
205         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
206         throw(uno::RuntimeException)
207 {
208         if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
209             loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
210             for (nDone = 0; nDone < nCount; nDone++) {
211                 nStartPos = character.aBreakIterator->following(nStartPos);
212                 if (nStartPos == BreakIterator::DONE)
213                     return Text.getLength();
214             }
215         } else { // for CHARACTER mode
216             for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
217                 Text.iterateCodePoints(&nStartPos, 1);
218         }
219         return nStartPos;
220 }
221 
previousCharacters(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)222 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
223         sal_Int32 nStartPos, const lang::Locale& rLocale,
224         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
225         throw(uno::RuntimeException)
226 {
227         if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
228             loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
229             for (nDone = 0; nDone < nCount; nDone++) {
230                 nStartPos = character.aBreakIterator->preceding(nStartPos);
231                 if (nStartPos == BreakIterator::DONE)
232                     return 0;
233             }
234         } else { // for BS to delete one char and CHARACTER mode.
235             for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
236                 Text.iterateCodePoints(&nStartPos, -1);
237         }
238         return nStartPos;
239 }
240 
241 
nextWord(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 rWordType)242 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
243     const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
244 {
245         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
246 
247         result.startPos = word.aBreakIterator->following(nStartPos);
248         if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
249             result.endPos = result.startPos;
250         else {
251             if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
252                     rWordType == WordType::DICTIONARY_WORD ) &&
253                         u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
254                 result.startPos = word.aBreakIterator->following(result.startPos);
255 
256             result.endPos = word.aBreakIterator->following(result.startPos);
257             if(result.endPos == BreakIterator::DONE)
258                 result.endPos = result.startPos;
259         }
260         return result;
261 }
262 
263 
previousWord(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 rWordType)264 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
265         const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
266 {
267         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
268 
269         result.startPos = word.aBreakIterator->preceding(nStartPos);
270         if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
271             result.endPos = result.startPos;
272         else {
273             if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
274                     rWordType == WordType::DICTIONARY_WORD) &&
275                         u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
276                 result.startPos = word.aBreakIterator->preceding(result.startPos);
277 
278             result.endPos = word.aBreakIterator->following(result.startPos);
279             if(result.endPos == BreakIterator::DONE)
280                 result.endPos = result.startPos;
281         }
282         return result;
283 }
284 
285 
getWordBoundary(const OUString & Text,sal_Int32 nPos,const lang::Locale & rLocale,sal_Int16 rWordType,sal_Bool bDirection)286 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
287         sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
288 {
289         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
290         sal_Int32 len = Text.getLength();
291 
292         if(word.aBreakIterator->isBoundary(nPos)) {
293             result.startPos = result.endPos = nPos;
294             if((bDirection || nPos == 0) && nPos < len) //forward
295                 result.endPos = word.aBreakIterator->following(nPos);
296             else
297                 result.startPos = word.aBreakIterator->preceding(nPos);
298         } else {
299             if(nPos <= 0) {
300                 result.startPos = 0;
301                 result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0;
302             } else if(nPos >= len) {
303                 result.startPos = word.aBreakIterator->preceding(len);
304                 result.endPos = len;
305             } else {
306                 result.startPos = word.aBreakIterator->preceding(nPos);
307                 result.endPos = word.aBreakIterator->following(nPos);
308             }
309         }
310         if (result.startPos == BreakIterator::DONE)
311             result.startPos = result.endPos;
312         else if (result.endPos == BreakIterator::DONE)
313             result.endPos = result.startPos;
314 
315         return result;
316 }
317 
318 
beginOfSentence(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale)319 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
320         const lang::Locale &rLocale ) throw(uno::RuntimeException)
321 {
322         loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
323 
324         sal_Int32 len = Text.getLength();
325         if (len > 0 && nStartPos == len)
326             Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
327         if (!sentence.aBreakIterator->isBoundary(nStartPos))
328             nStartPos = sentence.aBreakIterator->preceding(nStartPos);
329 
330         // skip preceding space.
331         sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
332         while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
333 		Text.iterateCodePoints(&nStartPos, -1);
334 
335         return nStartPos;
336 }
337 
endOfSentence(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale)338 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
339         const lang::Locale &rLocale ) throw(uno::RuntimeException)
340 {
341         loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
342 
343         sal_Int32 len = Text.getLength();
344         if (len > 0 && nStartPos == len)
345             Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
346         nStartPos = sentence.aBreakIterator->following(nStartPos);
347 
348         sal_Int32 nPos=nStartPos;
349         while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
350 
351         return nStartPos;
352 }
353 
getLineBreak(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int32 nMinBreakPos,const LineBreakHyphenationOptions & hOptions,const LineBreakUserOptions &)354 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
355         const OUString& Text, sal_Int32 nStartPos,
356         const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
357         const LineBreakHyphenationOptions& hOptions,
358         const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
359 {
360         LineBreakResults lbr;
361 
362         if (nStartPos >= Text.getLength()) {
363             lbr.breakIndex = Text.getLength();
364             lbr.breakType = BreakType::WORDBOUNDARY;
365             return lbr;
366         }
367 
368         loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
369 
370         sal_Bool GlueSpace=sal_True;
371         while (GlueSpace) {
372             if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
373                 lbr.breakIndex = nStartPos;
374                 lbr.breakType = BreakType::WORDBOUNDARY;
375             } else if (hOptions.rHyphenator.is()) { //Hyphenation break
376                 Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
377                                                 WordType::DICTIONARY_WORD, false);
378                 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
379                 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
380                     wBoundary.endPos - wBoundary.startPos), rLocale,
381                     (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
382                 if (aHyphenatedWord.is()) {
383                     lbr.rHyphenatedWord = aHyphenatedWord;
384                     if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
385                         lbr.breakIndex = -1;
386                     else
387                         lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
388                     lbr.breakType = BreakType::HYPHENATION;
389                 } else {
390                     lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
391                     lbr.breakType = BreakType::WORDBOUNDARY;;
392                 }
393             } else { //word boundary break
394                 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
395                 lbr.breakType = BreakType::WORDBOUNDARY;
396             }
397 
398 #define WJ 0x2060   // Word Joiner
399             GlueSpace=sal_False;
400             if (lbr.breakType == BreakType::WORDBOUNDARY) {
401                 nStartPos = lbr.breakIndex;
402                 if (Text[nStartPos--] == WJ)
403                     GlueSpace=sal_True;
404                 while (nStartPos >= 0 &&
405                     (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
406                     if (Text[nStartPos--] == WJ)
407                         GlueSpace=sal_True;
408                 }
409                 if (GlueSpace && nStartPos < 0)  {
410                     lbr.breakIndex = 0;
411                     break;
412                 }
413             }
414         }
415 
416         return lbr;
417 }
418 
419 
420 
421 OUString SAL_CALL
getImplementationName(void)422 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
423 {
424         return OUString::createFromAscii(cBreakIterator);
425 }
426 
427 sal_Bool SAL_CALL
supportsService(const OUString & rServiceName)428 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
429 {
430         return !rServiceName.compareToAscii(cBreakIterator);
431 }
432 
433 uno::Sequence< OUString > SAL_CALL
getSupportedServiceNames(void)434 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
435 {
436         uno::Sequence< OUString > aRet(1);
437         aRet[0] = OUString::createFromAscii(cBreakIterator);
438         return aRet;
439 }
440 
441 } } } }
442