1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 // MARKER(update_precomp.py): autogen include statement, do not remove 29 #include "precompiled_i18npool.hxx" 30 31 // xdictionary.cpp: implementation of the xdictionary class. 32 // 33 ////////////////////////////////////////////////////////////////////// 34 35 36 #include <rtl/ustrbuf.hxx> 37 38 #include <com/sun/star/i18n/WordType.hpp> 39 #include <xdictionary.hxx> 40 #include <unicode/uchar.h> 41 #include <string.h> 42 #include <breakiteratorImpl.hxx> 43 44 ////////////////////////////////////////////////////////////////////// 45 // Construction/Destruction 46 ////////////////////////////////////////////////////////////////////// 47 48 using namespace rtl; 49 50 namespace com { namespace sun { namespace star { namespace i18n { 51 52 extern "C" { static void SAL_CALL thisModule() {} } 53 54 xdictionary::xdictionary(const sal_Char *lang) : 55 existMark( NULL ), 56 index1( NULL ), 57 index2( NULL ), 58 lenArray( NULL ), 59 dataArea( NULL ), 60 hModule( NULL ), 61 boundary(), 62 japaneseWordBreak( sal_False ) 63 #if USE_CELL_BOUNDARY_CODE 64 // For CTL breakiterator, where the word boundary should not be inside cell. 65 , 66 useCellBoundary( sal_False ), 67 cellBoundary( NULL ) 68 #endif 69 { 70 index1 = 0; 71 #ifdef SAL_DLLPREFIX 72 OUStringBuffer aBuf( strlen(lang) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh) 73 aBuf.appendAscii( SAL_DLLPREFIX ); 74 #else 75 OUStringBuffer aBuf( strlen(lang) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh) 76 #endif 77 aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION ); 78 hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT ); 79 if( hModule ) { 80 sal_IntPtr (*func)(); 81 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData ); 82 existMark = (sal_uInt8*) (*func)(); 83 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData ); 84 index1 = (sal_Int16*) (*func)(); 85 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData ); 86 index2 = (sal_Int32*) (*func)(); 87 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData ); 88 lenArray = (sal_Int32*) (*func)(); 89 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData ); 90 dataArea = (sal_Unicode*) (*func)(); 91 } 92 else 93 { 94 existMark = NULL; 95 index1 = NULL; 96 index2 = NULL; 97 lenArray = NULL; 98 dataArea = NULL; 99 } 100 101 for (sal_Int32 i = 0; i < CACHE_MAX; i++) 102 cache[i].size = 0; 103 104 #if USE_CELL_BOUNDARY_CODE 105 useCellBoundary = sal_False; 106 cellBoundary = NULL; 107 #endif 108 japaneseWordBreak = sal_False; 109 } 110 111 xdictionary::~xdictionary() { 112 osl_unloadModule(hModule); 113 for (sal_Int32 i = 0; i < CACHE_MAX; i++) { 114 if (cache[i].size > 0) { 115 delete cache[i].contents; 116 delete cache[i].wordboundary; 117 } 118 } 119 } 120 121 void xdictionary::setJapaneseWordBreak() 122 { 123 japaneseWordBreak = sal_True; 124 } 125 126 sal_Bool xdictionary::exists(const sal_uInt32 c) { 127 // 0x1FFF is the hardcoded limit in gendict for existMarks 128 sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False; 129 if (!exist && japaneseWordBreak) 130 return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN; 131 else 132 return exist; 133 } 134 135 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) { 136 137 if ( !index1 ) return 0; 138 139 sal_Int16 idx = index1[str[0] >> 8]; 140 141 if (idx == 0xFF) return 0; 142 143 idx = (idx<<8) | (str[0]&0xff); 144 145 sal_uInt32 begin = index2[idx], end = index2[idx+1]; 146 147 if (begin == 0) return 0; 148 149 str++; sLen--; // first character is not stored in the dictionary 150 for (sal_uInt32 i = end; i > begin; i--) { 151 sal_Int32 len = lenArray[i] - lenArray[i - 1]; 152 if (sLen >= len) { 153 const sal_Unicode *dstr = dataArea + lenArray[i-1]; 154 sal_Int32 pos = 0; 155 156 while (pos < len && dstr[pos] == str[pos]) { pos++; } 157 158 if (pos == len) 159 return len + 1; 160 } 161 } 162 return 0; 163 } 164 165 166 /* 167 * c-tor 168 */ 169 170 WordBreakCache::WordBreakCache() : 171 length( 0 ), 172 contents( NULL ), 173 wordboundary( NULL ), 174 size( 0 ) 175 { 176 } 177 178 /* 179 * Compare two unicode string, 180 */ 181 182 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) { 183 // Different length, different string. 184 if (length != boundary.endPos - boundary.startPos) return sal_False; 185 186 for (sal_Int32 i = 0; i < length; i++) 187 if (contents[i] != str[i + boundary.startPos]) return sal_False; 188 189 return sal_True; 190 } 191 192 193 /* 194 * Retrieve the segment containing the character at pos. 195 * @param pos : Position of the given character. 196 * @return true if CJK. 197 */ 198 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos, 199 Boundary& segBoundary) 200 { 201 sal_Int32 indexUtf16; 202 segBoundary.endPos = segBoundary.startPos = pos; 203 204 indexUtf16 = pos; 205 while (indexUtf16 > 0) 206 { 207 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1); 208 if (u_isWhitespace(ch) || exists(ch)) 209 segBoundary.startPos = indexUtf16; 210 else 211 break; 212 } 213 214 indexUtf16 = pos; 215 while (indexUtf16 < rText.getLength()) 216 { 217 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1); 218 if (u_isWhitespace(ch) || exists(ch)) 219 segBoundary.endPos = indexUtf16; 220 else 221 break; 222 } 223 224 indexUtf16 = segBoundary.startPos; 225 rText.iterateCodePoints(&indexUtf16, 1); 226 return segBoundary.endPos > indexUtf16; 227 } 228 229 #define KANJA 1 230 #define KATAKANA 2 231 #define HIRAKANA 3 232 233 static sal_Int16 JapaneseCharType(sal_Unicode c) 234 { 235 if (0x3041 <= c && c <= 0x309e) 236 return HIRAKANA; 237 if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f)) 238 return KATAKANA; 239 return KANJA; 240 } 241 242 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary) 243 { 244 245 WordBreakCache& aCache = cache[text[0] & 0x1f]; 246 247 if (aCache.size != 0 && aCache.equals(text, wordBoundary)) 248 return aCache; 249 250 sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos; 251 252 if (aCache.size == 0 || len > aCache.size) { 253 if (aCache.size != 0) { 254 delete aCache.contents; 255 delete aCache.wordboundary; 256 aCache.size = len; 257 } 258 else 259 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE; 260 aCache.contents = new sal_Unicode[aCache.size + 1]; 261 aCache.wordboundary = new sal_Int32[aCache.size + 2]; 262 } 263 aCache.length = len; 264 memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode)); 265 *(aCache.contents + len) = 0x0000; 266 // reset the wordboundary in cache 267 memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2)); 268 269 sal_Int32 i = 0; // loop variable 270 while (aCache.wordboundary[i] < aCache.length) { 271 len = 0; 272 // look the continuous white space as one word and cashe it 273 while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len])) 274 len ++; 275 276 if (len == 0) { 277 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i]; 278 sal_Int32 slen = aCache.length - aCache.wordboundary[i]; 279 sal_Int16 type = 0, count = 0; 280 for (;len == 0 && slen > 0; str++, slen--) { 281 len = getLongestMatch(str, slen); 282 if (len == 0) { 283 if (!japaneseWordBreak) { 284 len = 1; 285 } else { 286 if (count == 0) 287 type = JapaneseCharType(*str); 288 else if (type != JapaneseCharType(*str)) 289 break; 290 count++; 291 } 292 } 293 } 294 if (count) { 295 aCache.wordboundary[i+1] = aCache.wordboundary[i] + count; 296 i++; 297 298 #if USE_CELL_BOUNDARY_CODE 299 if (useCellBoundary) { 300 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1]; 301 if (cBoundary > 0) 302 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos; 303 } 304 #endif 305 } 306 } 307 308 if (len) { 309 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len; 310 i++; 311 312 #if USE_CELL_BOUNDARY_CODE 313 if (useCellBoundary) { 314 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1]; 315 if (cBoundary > 0) 316 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos; 317 } 318 #endif 319 } 320 } 321 aCache.wordboundary[i + 1] = aCache.length + 1; 322 323 return aCache; 324 } 325 326 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType) 327 { 328 // looking for the first non-whitespace character from anyPos 329 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1); 330 331 while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1); 332 333 return getWordBoundary(rText, anyPos, wordType, true); 334 } 335 336 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType) 337 { 338 boundary = getWordBoundary(rText, anyPos, wordType, true); 339 anyPos = boundary.endPos; 340 if (anyPos < rText.getLength()) { 341 // looknig for the first non-whitespace character from anyPos 342 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1); 343 while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1); 344 rText.iterateCodePoints(&anyPos, -1); 345 } 346 347 return getWordBoundary(rText, anyPos, wordType, true); 348 } 349 350 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection) 351 { 352 const sal_Unicode *text=rText.getStr(); 353 sal_Int32 len=rText.getLength(); 354 if (anyPos >= len || anyPos < 0) { 355 boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len; 356 } else if (seekSegment(rText, anyPos, boundary)) { // character in dict 357 WordBreakCache& aCache = getCache(text, boundary); 358 sal_Int32 i = 0; 359 360 while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++; 361 362 sal_Int32 startPos = aCache.wordboundary[i - 1]; 363 // if bDirection is false 364 if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos)) 365 { 366 sal_Int32 indexUtf16 = anyPos-1; 367 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1); 368 if (u_isWhitespace(ch)) 369 i--; 370 } 371 boundary.endPos = boundary.startPos; 372 rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]); 373 rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]); 374 } else { 375 boundary.startPos = anyPos; 376 if (anyPos < len) rText.iterateCodePoints(&anyPos, 1); 377 boundary.endPos = anyPos < len ? anyPos : len; 378 } 379 if (wordType == WordType::WORD_COUNT) { 380 // skip punctuation for word count. 381 while (boundary.endPos < len) 382 { 383 sal_Int32 indexUtf16 = boundary.endPos; 384 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1))) 385 boundary.endPos = indexUtf16; 386 else 387 break; 388 } 389 } 390 391 return boundary; 392 } 393 394 #if USE_CELL_BOUNDARY_CODE 395 void xdictionary::setCellBoundary(sal_Int32* cellArray) 396 { 397 useCellBoundary = sal_True; 398 cellBoundary = cellArray; 399 } 400 #endif 401 402 } } } } 403