1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_i18npool.hxx" 26 27 #include "textsearch.hxx" 28 #include "levdis.hxx" 29 #include <com/sun/star/lang/Locale.hpp> 30 #include <com/sun/star/lang/XMultiServiceFactory.hpp> 31 #include <comphelper/processfactory.hxx> 32 #include <com/sun/star/i18n/UnicodeType.hpp> 33 #include <com/sun/star/util/SearchFlags.hpp> 34 #include <com/sun/star/i18n/WordType.hpp> 35 #include <com/sun/star/i18n/ScriptType.hpp> 36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp> 37 #include <com/sun/star/i18n/KCharacterType.hpp> 38 #include <com/sun/star/registry/XRegistryKey.hpp> 39 #include <cppuhelper/factory.hxx> 40 #include <cppuhelper/weak.hxx> 41 42 #ifdef _MSC_VER 43 // get rid of that dumb compiler warning 44 // identifier was truncated to '255' characters in the debug information 45 // for STL template usage, if .pdb files are to be created 46 #pragma warning( disable: 4786 ) 47 #endif 48 49 #include <string.h> 50 51 using namespace ::com::sun::star::util; 52 using namespace ::com::sun::star::uno; 53 using namespace ::com::sun::star::lang; 54 using namespace ::com::sun::star::i18n; 55 using namespace ::rtl; 56 57 static sal_Int32 COMPLEX_TRANS_MASK_TMP = 58 TransliterationModules_ignoreBaFa_ja_JP | 59 TransliterationModules_ignoreIterationMark_ja_JP | 60 TransliterationModules_ignoreTiJi_ja_JP | 61 TransliterationModules_ignoreHyuByu_ja_JP | 62 TransliterationModules_ignoreSeZe_ja_JP | 63 TransliterationModules_ignoreIandEfollowedByYa_ja_JP | 64 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP | 65 TransliterationModules_ignoreProlongedSoundMark_ja_JP; 66 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 67 static const sal_Int32 SIMPLE_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE); 68 // Above 2 transliteration is simple but need to take effect in 69 // complex transliteration 70 71 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) 72 : xMSF( rxMSF ) 73 , pJumpTable( 0 ) 74 , pJumpTable2( 0 ) 75 , pRegexMatcher( NULL ) 76 , pWLD( 0 ) 77 { 78 SearchOptions aOpt; 79 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; 80 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; 81 //aOpt.Locale = ???; 82 setOptions( aOpt ); 83 } 84 85 TextSearch::~TextSearch() 86 { 87 delete pRegexMatcher; 88 delete pWLD; 89 delete pJumpTable; 90 delete pJumpTable2; 91 } 92 93 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException ) 94 { 95 aSrchPara = rOptions; 96 97 delete pRegexMatcher, pRegexMatcher = NULL; 98 delete pWLD, pWLD = 0; 99 delete pJumpTable, pJumpTable = 0; 100 delete pJumpTable2, pJumpTable2 = 0; 101 102 // Create Transliteration class 103 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 104 { 105 if( !xTranslit.is() ) 106 { 107 Reference < XInterface > xI = xMSF->createInstance( 108 OUString::createFromAscii( 109 "com.sun.star.i18n.Transliteration")); 110 if ( xI.is() ) 111 xI->queryInterface( ::getCppuType( 112 (const Reference< XExtendedTransliteration >*)0)) 113 >>= xTranslit; 114 } 115 // Load transliteration module 116 if( xTranslit.is() ) 117 xTranslit->loadModule( 118 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ), 119 aSrchPara.Locale); 120 } 121 else if( xTranslit.is() ) 122 xTranslit = 0; 123 124 // Create Transliteration for 2<->1, 2<->2 transliteration 125 if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 126 { 127 if( !xTranslit2.is() ) 128 { 129 Reference < XInterface > xI = xMSF->createInstance( 130 OUString::createFromAscii( 131 "com.sun.star.i18n.Transliteration")); 132 if ( xI.is() ) 133 xI->queryInterface( ::getCppuType( 134 (const Reference< XExtendedTransliteration >*)0)) 135 >>= xTranslit2; 136 } 137 // Load transliteration module 138 if( xTranslit2.is() ) 139 xTranslit2->loadModule( 140 (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ), 141 aSrchPara.Locale); 142 } 143 144 if ( !xBreak.is() ) 145 { 146 Reference < XInterface > xI = xMSF->createInstance( 147 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator")); 148 if( xI.is() ) 149 xI->queryInterface( ::getCppuType( 150 (const Reference< XBreakIterator >*)0)) 151 >>= xBreak; 152 } 153 154 sSrchStr = aSrchPara.searchString; 155 156 // use transliteration here 157 if ( xTranslit.is() && 158 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 159 sSrchStr = xTranslit->transliterateString2String( 160 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 161 162 if ( xTranslit2.is() && 163 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 164 sSrchStr2 = xTranslit2->transliterateString2String( 165 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 166 167 // When start or end of search string is a complex script type, we need to 168 // make sure the result boundary is not located in the middle of cell. 169 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) == 170 ScriptType::COMPLEX)); 171 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 172 sSrchStr.getLength()-1) == ScriptType::COMPLEX)); 173 174 switch( aSrchPara.algorithmType) 175 { 176 case SearchAlgorithms_REGEXP: 177 fnForward = &TextSearch::RESrchFrwrd; 178 fnBackward = &TextSearch::RESrchBkwrd; 179 RESrchPrepare( aSrchPara); 180 break; 181 182 case SearchAlgorithms_APPROXIMATE: 183 fnForward = &TextSearch::ApproxSrchFrwrd; 184 fnBackward = &TextSearch::ApproxSrchBkwrd; 185 186 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, 187 aSrchPara.insertedChars, aSrchPara.deletedChars, 188 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ); 189 190 nLimit = pWLD->GetLimit(); 191 break; 192 193 default: 194 fnForward = &TextSearch::NSrchFrwrd; 195 fnBackward = &TextSearch::NSrchBkwrd; 196 break; 197 } 198 } 199 200 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) 201 { 202 sal_Int32 nRet = 0, nEnd = rOff.getLength(); 203 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet; 204 return nRet; 205 } 206 207 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos) 208 throw( RuntimeException ) 209 { 210 sal_Int32 nDone; 211 return nPos == xBreak->previousCharacters(searchStr, nPos+1, 212 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone); 213 } 214 215 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 216 throw( RuntimeException ) 217 { 218 SearchResult sres; 219 220 OUString in_str(searchStr); 221 sal_Int32 newStartPos = startPos; 222 sal_Int32 newEndPos = endPos; 223 224 bUsePrimarySrchStr = true; 225 226 if ( xTranslit.is() ) 227 { 228 // apply normal transliteration (1<->1, 1<->0) 229 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 230 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 231 232 // JP 20.6.2001: also the start and end positions must be corrected! 233 if( startPos ) 234 newStartPos = FindPosInSeq_Impl( offset, startPos ); 235 236 if( endPos < searchStr.getLength() ) 237 newEndPos = FindPosInSeq_Impl( offset, endPos ); 238 else 239 newEndPos = in_str.getLength(); 240 241 sres = (this->*fnForward)( in_str, newStartPos, newEndPos ); 242 243 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 244 { 245 if (sres.startOffset[k]) 246 sres.startOffset[k] = offset[sres.startOffset[k]]; 247 // JP 20.6.2001: end is ever exclusive and then don't return 248 // the position of the next character - return the 249 // next position behind the last found character! 250 // "a b c" find "b" must return 2,3 and not 2,4!!! 251 if (sres.endOffset[k]) 252 sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1; 253 } 254 } 255 else 256 { 257 sres = (this->*fnForward)( in_str, startPos, endPos ); 258 } 259 260 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP) 261 { 262 SearchResult sres2; 263 264 in_str = OUString(searchStr); 265 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 266 267 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); 268 269 if( startPos ) 270 startPos = FindPosInSeq_Impl( offset, startPos ); 271 272 if( endPos < searchStr.getLength() ) 273 endPos = FindPosInSeq_Impl( offset, endPos ); 274 else 275 endPos = in_str.getLength(); 276 277 bUsePrimarySrchStr = false; 278 sres2 = (this->*fnForward)( in_str, startPos, endPos ); 279 280 for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) 281 { 282 if (sres2.startOffset[k]) 283 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1; 284 if (sres2.endOffset[k]) 285 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1; 286 } 287 288 // pick first and long one 289 if ( sres.subRegExpressions == 0) 290 return sres2; 291 if ( sres2.subRegExpressions == 1) 292 { 293 if ( sres.startOffset[0] > sres2.startOffset[0]) 294 return sres2; 295 else if ( sres.startOffset[0] == sres2.startOffset[0] && 296 sres.endOffset[0] < sres2.endOffset[0]) 297 return sres2; 298 } 299 } 300 301 return sres; 302 } 303 304 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 305 throw(RuntimeException) 306 { 307 SearchResult sres; 308 309 OUString in_str(searchStr); 310 sal_Int32 newStartPos = startPos; 311 sal_Int32 newEndPos = endPos; 312 313 bUsePrimarySrchStr = true; 314 315 if ( xTranslit.is() ) 316 { 317 // apply only simple 1<->1 transliteration here 318 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 319 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 320 321 // JP 20.6.2001: also the start and end positions must be corrected! 322 if( startPos < searchStr.getLength() ) 323 newStartPos = FindPosInSeq_Impl( offset, startPos ); 324 else 325 newStartPos = in_str.getLength(); 326 327 if( endPos ) 328 newEndPos = FindPosInSeq_Impl( offset, endPos ); 329 330 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos ); 331 332 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 333 { 334 if (sres.startOffset[k]) 335 sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1; 336 // JP 20.6.2001: end is ever exclusive and then don't return 337 // the position of the next character - return the 338 // next position behind the last found character! 339 // "a b c" find "b" must return 2,3 and not 2,4!!! 340 if (sres.endOffset[k]) 341 sres.endOffset[k] = offset[sres.endOffset[k]]; 342 } 343 } 344 else 345 { 346 sres = (this->*fnBackward)( in_str, startPos, endPos ); 347 } 348 349 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP ) 350 { 351 SearchResult sres2; 352 353 in_str = OUString(searchStr); 354 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 355 356 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); 357 358 if( startPos < searchStr.getLength() ) 359 startPos = FindPosInSeq_Impl( offset, startPos ); 360 else 361 startPos = in_str.getLength(); 362 363 if( endPos ) 364 endPos = FindPosInSeq_Impl( offset, endPos ); 365 366 bUsePrimarySrchStr = false; 367 sres2 = (this->*fnBackward)( in_str, startPos, endPos ); 368 369 for( int k = 0; k < sres2.startOffset.getLength(); k++ ) 370 { 371 if (sres2.startOffset[k]) 372 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1; 373 if (sres2.endOffset[k]) 374 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1; 375 } 376 377 // pick last and long one 378 if ( sres.subRegExpressions == 0 ) 379 return sres2; 380 if ( sres2.subRegExpressions == 1 ) 381 { 382 if ( sres.startOffset[0] < sres2.startOffset[0] ) 383 return sres2; 384 if ( sres.startOffset[0] == sres2.startOffset[0] && 385 sres.endOffset[0] > sres2.endOffset[0] ) 386 return sres2; 387 } 388 } 389 390 return sres; 391 } 392 393 //--------------------------------------------------------------------- 394 395 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const 396 { 397 bool bRet = 1; 398 if( '\x7f' != rStr[nPos]) 399 { 400 if ( !xCharClass.is() ) 401 { 402 Reference < XInterface > xI = xMSF->createInstance( 403 OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification")); 404 if( xI.is() ) 405 xI->queryInterface( ::getCppuType( 406 (const Reference< XCharacterClassification >*)0)) 407 >>= xCharClass; 408 } 409 if ( xCharClass.is() ) 410 { 411 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, 412 aSrchPara.Locale ); 413 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | 414 KCharacterType::LETTER ) & nCType ) ) 415 bRet = 0; 416 } 417 } 418 return bRet; 419 } 420 421 // --------- helper methods for Boyer-Moore like text searching ---------- 422 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available 423 424 void TextSearch::MakeForwardTab() 425 { 426 // create the jumptable for the search text 427 if( pJumpTable ) 428 { 429 if( bIsForwardTab ) 430 return ; // the jumpTable is ok 431 delete pJumpTable; 432 } 433 bIsForwardTab = true; 434 435 sal_Int32 n, nLen = sSrchStr.getLength(); 436 pJumpTable = new TextSearchJumpTable; 437 438 for( n = 0; n < nLen - 1; ++n ) 439 { 440 sal_Unicode cCh = sSrchStr[n]; 441 sal_Int32 nDiff = nLen - n - 1; 442 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 443 444 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 445 pJumpTable->insert( aEntry ); 446 if ( !aPair.second ) 447 (*(aPair.first)).second = nDiff; 448 } 449 } 450 451 void TextSearch::MakeForwardTab2() 452 { 453 // create the jumptable for the search text 454 if( pJumpTable2 ) 455 { 456 if( bIsForwardTab ) 457 return ; // the jumpTable is ok 458 delete pJumpTable2; 459 } 460 bIsForwardTab = true; 461 462 sal_Int32 n, nLen = sSrchStr2.getLength(); 463 pJumpTable2 = new TextSearchJumpTable; 464 465 for( n = 0; n < nLen - 1; ++n ) 466 { 467 sal_Unicode cCh = sSrchStr2[n]; 468 sal_Int32 nDiff = nLen - n - 1; 469 470 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 471 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 472 pJumpTable2->insert( aEntry ); 473 if ( !aPair.second ) 474 (*(aPair.first)).second = nDiff; 475 } 476 } 477 478 void TextSearch::MakeBackwardTab() 479 { 480 // create the jumptable for the search text 481 if( pJumpTable ) 482 { 483 if( !bIsForwardTab ) 484 return ; // the jumpTable is ok 485 delete pJumpTable; 486 } 487 bIsForwardTab = false; 488 489 sal_Int32 n, nLen = sSrchStr.getLength(); 490 pJumpTable = new TextSearchJumpTable; 491 492 for( n = nLen-1; n > 0; --n ) 493 { 494 sal_Unicode cCh = sSrchStr[n]; 495 TextSearchJumpTable::value_type aEntry( cCh, n ); 496 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 497 pJumpTable->insert( aEntry ); 498 if ( !aPair.second ) 499 (*(aPair.first)).second = n; 500 } 501 } 502 503 void TextSearch::MakeBackwardTab2() 504 { 505 // create the jumptable for the search text 506 if( pJumpTable2 ) 507 { 508 if( !bIsForwardTab ) 509 return ; // the jumpTable is ok 510 delete pJumpTable2; 511 } 512 bIsForwardTab = false; 513 514 sal_Int32 n, nLen = sSrchStr2.getLength(); 515 pJumpTable2 = new TextSearchJumpTable; 516 517 for( n = nLen-1; n > 0; --n ) 518 { 519 sal_Unicode cCh = sSrchStr2[n]; 520 TextSearchJumpTable::value_type aEntry( cCh, n ); 521 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 522 pJumpTable2->insert( aEntry ); 523 if ( !aPair.second ) 524 (*(aPair.first)).second = n; 525 } 526 } 527 528 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const 529 { 530 TextSearchJumpTable *pJump; 531 OUString sSearchKey; 532 533 if ( bUsePrimarySrchStr ) { 534 pJump = pJumpTable; 535 sSearchKey = sSrchStr; 536 } else { 537 pJump = pJumpTable2; 538 sSearchKey = sSrchStr2; 539 } 540 541 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); 542 if ( iLook == pJump->end() ) 543 return sSearchKey.getLength(); 544 return (*iLook).second; 545 } 546 547 548 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#) 549 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 550 throw(RuntimeException) 551 { 552 SearchResult aRet; 553 aRet.subRegExpressions = 0; 554 555 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 556 557 OUString aStr( searchStr ); 558 sal_Int32 nSuchIdx = aStr.getLength(); 559 sal_Int32 nEnde = endPos; 560 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) 561 return aRet; 562 563 564 if( nEnde < sSearchKey.getLength() ) // position inside the search region ? 565 return aRet; 566 567 nEnde -= sSearchKey.getLength(); 568 569 if (bUsePrimarySrchStr) 570 MakeForwardTab(); // create the jumptable 571 else 572 MakeForwardTab2(); 573 574 for (sal_Int32 nCmpIdx = startPos; // start position for the search 575 nCmpIdx <= nEnde; 576 nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1])) 577 { 578 // if the match would be the completed cells, skip it. 579 if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd 580 && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) ) 581 continue; 582 583 nSuchIdx = sSearchKey.getLength() - 1; 584 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx]) 585 { 586 if( nSuchIdx == 0 ) 587 { 588 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 589 { 590 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); 591 bool bAtStart = !nCmpIdx; 592 bool bAtEnd = nFndEnd == endPos; 593 bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 ); 594 bool bDelimBehind = IsDelimiter( aStr, nFndEnd ); 595 // * 1 -> only one word in the paragraph 596 // * 2 -> at begin of paragraph 597 // * 3 -> at end of paragraph 598 // * 4 -> inside the paragraph 599 if( !( ( bAtStart && bAtEnd ) || // 1 600 ( bAtStart && bDelimBehind ) || // 2 601 ( bAtEnd && bDelimBefore ) || // 3 602 ( bDelimBefore && bDelimBehind ))) // 4 603 break; 604 } 605 606 aRet.subRegExpressions = 1; 607 aRet.startOffset.realloc( 1 ); 608 aRet.startOffset[ 0 ] = nCmpIdx; 609 aRet.endOffset.realloc( 1 ); 610 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength(); 611 612 return aRet; 613 } 614 else 615 nSuchIdx--; 616 } 617 } 618 return aRet; 619 } 620 621 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 622 throw(RuntimeException) 623 { 624 SearchResult aRet; 625 aRet.subRegExpressions = 0; 626 627 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 628 629 OUString aStr( searchStr ); 630 sal_Int32 nSuchIdx = aStr.getLength(); 631 sal_Int32 nEnde = endPos; 632 if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx) 633 return aRet; 634 635 if (bUsePrimarySrchStr) 636 MakeBackwardTab(); // create the jumptable 637 else 638 MakeBackwardTab2(); 639 640 if( nEnde == nSuchIdx ) // end position for the search 641 nEnde = sSearchKey.getLength(); 642 else 643 nEnde += sSearchKey.getLength(); 644 645 sal_Int32 nCmpIdx = startPos; // start position for the search 646 647 while (nCmpIdx >= nEnde) 648 { 649 // if the match would be the completed cells, skip it. 650 if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx - 651 sSearchKey.getLength() )) && (!checkCTLEnd || 652 isCellStart( aStr, nCmpIdx))) 653 { 654 nSuchIdx = 0; 655 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == 656 aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) 657 nSuchIdx++; 658 if( nSuchIdx >= sSearchKey.getLength() ) 659 { 660 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 661 { 662 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); 663 bool bAtStart = !nFndStt; 664 bool bAtEnd = nCmpIdx == startPos; 665 bool bDelimBehind = IsDelimiter( aStr, nCmpIdx ); 666 bool bDelimBefore = bAtStart || // begin of paragraph 667 IsDelimiter( aStr, nFndStt-1 ); 668 // * 1 -> only one word in the paragraph 669 // * 2 -> at begin of paragraph 670 // * 3 -> at end of paragraph 671 // * 4 -> inside the paragraph 672 if( ( bAtStart && bAtEnd ) || // 1 673 ( bAtStart && bDelimBehind ) || // 2 674 ( bAtEnd && bDelimBefore ) || // 3 675 ( bDelimBefore && bDelimBehind )) // 4 676 { 677 aRet.subRegExpressions = 1; 678 aRet.startOffset.realloc( 1 ); 679 aRet.startOffset[ 0 ] = nCmpIdx; 680 aRet.endOffset.realloc( 1 ); 681 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 682 return aRet; 683 } 684 } 685 else 686 { 687 aRet.subRegExpressions = 1; 688 aRet.startOffset.realloc( 1 ); 689 aRet.startOffset[ 0 ] = nCmpIdx; 690 aRet.endOffset.realloc( 1 ); 691 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 692 return aRet; 693 } 694 } 695 } 696 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] ); 697 if( nCmpIdx < nSuchIdx ) 698 return aRet; 699 nCmpIdx -= nSuchIdx; 700 } 701 return aRet; 702 } 703 704 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions) 705 { 706 // select the transliterated pattern string 707 const OUString& rPatternStr = 708 (rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr 709 : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString); 710 711 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability 712 // map com::sun::star::util::SearchFlags to ICU uregex.h flags 713 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE 714 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented 715 // REG_NOSUB is not used anywhere => not implemented 716 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute 717 // LEV_RELAXED is only used for SearchAlgorithm==Approximate 718 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO 719 // probably because the transliteration flag IGNORE_CASE handles it as well. 720 if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0 721 || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0) 722 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; 723 UErrorCode nIcuErr = U_ZERO_ERROR; 724 // assumption: transliteration didn't mangle regexp control chars 725 IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength()); 726 #ifndef DISABLE_WORDBOUND_EMULATION 727 // for conveniance specific syntax elements of the old regex engine are emulated 728 // - by replacing \< with "word-break followed by a look-ahead word-char" 729 static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant); 730 static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant); 731 static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); 732 aChevronMatcherB.reset( aIcuSearchPatStr); 733 aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); 734 aChevronMatcherB.reset(); 735 // - by replacing \> with "look-behind word-char followed by a word-break" 736 static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant); 737 static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant); 738 static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr); 739 aChevronMatcherE.reset( aIcuSearchPatStr); 740 aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); 741 aChevronMatcherE.reset(); 742 #endif 743 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); 744 if( nIcuErr) 745 { delete pRegexMatcher; pRegexMatcher = NULL;} 746 } 747 748 //--------------------------------------------------------------------------- 749 750 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, 751 sal_Int32 startPos, sal_Int32 endPos ) 752 throw(RuntimeException) 753 { 754 SearchResult aRet; 755 aRet.subRegExpressions = 0; 756 if( !pRegexMatcher) 757 return aRet; 758 759 if( endPos > searchStr.getLength()) 760 endPos = searchStr.getLength(); 761 762 // use the ICU RegexMatcher to find the matches 763 UErrorCode nIcuErr = U_ZERO_ERROR; 764 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos); 765 pRegexMatcher->reset( aSearchTargetStr); 766 // search until there is a valid match 767 for(;;) 768 { 769 if( !pRegexMatcher->find( startPos, nIcuErr)) 770 return aRet; 771 772 // #i118887# ignore zero-length matches e.g. "a*" in "bc" 773 int nStartOfs = pRegexMatcher->start( nIcuErr); 774 int nEndOfs = pRegexMatcher->end( nIcuErr); 775 if( nStartOfs < nEndOfs) 776 break; 777 // try at next position if there was a zero-length match 778 if( ++startPos >= endPos) 779 return aRet; 780 } 781 782 // extract the result of the search 783 const int nGroupCount = pRegexMatcher->groupCount(); 784 aRet.subRegExpressions = nGroupCount + 1; 785 aRet.startOffset.realloc( aRet.subRegExpressions); 786 aRet.endOffset.realloc( aRet.subRegExpressions); 787 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr); 788 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr); 789 for( int i = 1; i <= nGroupCount; ++i) { 790 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr); 791 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr); 792 } 793 794 return aRet; 795 } 796 797 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, 798 sal_Int32 startPos, sal_Int32 endPos ) 799 throw(RuntimeException) 800 { 801 // NOTE: for backwards search callers provide startPos/endPos inverted! 802 SearchResult aRet; 803 aRet.subRegExpressions = 0; 804 if( !pRegexMatcher) 805 return aRet; 806 807 if( startPos > searchStr.getLength()) 808 startPos = searchStr.getLength(); 809 810 // use the ICU RegexMatcher to find the matches 811 // TODO: use ICU's backward searching once it becomes available 812 // as its replacement using forward search is not as good as the real thing 813 UErrorCode nIcuErr = U_ZERO_ERROR; 814 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos); 815 pRegexMatcher->reset( aSearchTargetStr); 816 if( !pRegexMatcher->find( endPos, nIcuErr)) 817 return aRet; 818 819 // find the last match 820 int nLastPos = 0; 821 int nFoundEnd = 0; 822 do { 823 nLastPos = pRegexMatcher->start( nIcuErr); 824 nFoundEnd = pRegexMatcher->end( nIcuErr); 825 if( nFoundEnd >= startPos) 826 break; 827 if( nFoundEnd == nLastPos) 828 ++nFoundEnd; 829 } while( pRegexMatcher->find( nFoundEnd, nIcuErr)); 830 831 // find last match again to get its details 832 pRegexMatcher->find( nLastPos, nIcuErr); 833 834 // fill in the details of the last match 835 const int nGroupCount = pRegexMatcher->groupCount(); 836 aRet.subRegExpressions = nGroupCount + 1; 837 aRet.startOffset.realloc( aRet.subRegExpressions); 838 aRet.endOffset.realloc( aRet.subRegExpressions); 839 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! 840 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr); 841 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr); 842 for( int i = 1; i <= nGroupCount; ++i) { 843 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr); 844 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr); 845 } 846 847 return aRet; 848 } 849 850 //--------------------------------------------------------------------------- 851 852 // search for words phonetically 853 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr, 854 sal_Int32 startPos, sal_Int32 endPos ) 855 throw(RuntimeException) 856 { 857 SearchResult aRet; 858 aRet.subRegExpressions = 0; 859 860 if( !xBreak.is() ) 861 return aRet; 862 863 OUString aWTemp( searchStr ); 864 865 register sal_Int32 nStt, nEnd; 866 867 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 868 aSrchPara.Locale, 869 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 870 871 do 872 { 873 if( aWBnd.startPos >= endPos ) 874 break; 875 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; 876 nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos; 877 878 if( nStt < nEnd && 879 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 880 { 881 aRet.subRegExpressions = 1; 882 aRet.startOffset.realloc( 1 ); 883 aRet.startOffset[ 0 ] = nStt; 884 aRet.endOffset.realloc( 1 ); 885 aRet.endOffset[ 0 ] = nEnd; 886 break; 887 } 888 889 nStt = nEnd - 1; 890 aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale, 891 WordType::ANYWORD_IGNOREWHITESPACES); 892 } while( aWBnd.startPos != aWBnd.endPos || 893 (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) ); 894 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only 895 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos 896 // and nextWord() does also => don't loop forever. 897 return aRet; 898 } 899 900 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, 901 sal_Int32 startPos, sal_Int32 endPos ) 902 throw(RuntimeException) 903 { 904 SearchResult aRet; 905 aRet.subRegExpressions = 0; 906 907 if( !xBreak.is() ) 908 return aRet; 909 910 OUString aWTemp( searchStr ); 911 912 register sal_Int32 nStt, nEnd; 913 914 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 915 aSrchPara.Locale, 916 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 917 918 do 919 { 920 if( aWBnd.endPos <= endPos ) 921 break; 922 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; 923 nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos; 924 925 if( nStt < nEnd && 926 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 927 { 928 aRet.subRegExpressions = 1; 929 aRet.startOffset.realloc( 1 ); 930 aRet.startOffset[ 0 ] = nEnd; 931 aRet.endOffset.realloc( 1 ); 932 aRet.endOffset[ 0 ] = nStt; 933 break; 934 } 935 if( !nStt ) 936 break; 937 938 aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale, 939 WordType::ANYWORD_IGNOREWHITESPACES); 940 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() ); 941 return aRet; 942 } 943 944 945 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch"; 946 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n"; 947 948 static OUString getServiceName_Static() 949 { 950 return OUString::createFromAscii( cSearchName ); 951 } 952 953 static OUString getImplementationName_Static() 954 { 955 return OUString::createFromAscii( cSearchImpl ); 956 } 957 958 OUString SAL_CALL 959 TextSearch::getImplementationName() 960 throw( RuntimeException ) 961 { 962 return getImplementationName_Static(); 963 } 964 965 sal_Bool SAL_CALL 966 TextSearch::supportsService(const OUString& rServiceName) 967 throw( RuntimeException ) 968 { 969 return !rServiceName.compareToAscii( cSearchName ); 970 } 971 972 Sequence< OUString > SAL_CALL 973 TextSearch::getSupportedServiceNames(void) throw( RuntimeException ) 974 { 975 Sequence< OUString > aRet(1); 976 aRet[0] = getServiceName_Static(); 977 return aRet; 978 } 979 980 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface > 981 SAL_CALL TextSearch_CreateInstance( 982 const ::com::sun::star::uno::Reference< 983 ::com::sun::star::lang::XMultiServiceFactory >& rxMSF ) 984 { 985 return ::com::sun::star::uno::Reference< 986 ::com::sun::star::uno::XInterface >( 987 (::cppu::OWeakObject*) new TextSearch( rxMSF ) ); 988 } 989 990 extern "C" 991 { 992 993 void SAL_CALL component_getImplementationEnvironment( 994 const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ ) 995 { 996 *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME; 997 } 998 999 void* SAL_CALL component_getFactory( const sal_Char* sImplementationName, 1000 void* _pServiceManager, void* /*_pRegistryKey*/ ) 1001 { 1002 void* pRet = NULL; 1003 1004 ::com::sun::star::lang::XMultiServiceFactory* pServiceManager = 1005 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* > 1006 ( _pServiceManager ); 1007 ::com::sun::star::uno::Reference< 1008 ::com::sun::star::lang::XSingleServiceFactory > xFactory; 1009 1010 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) ) 1011 { 1012 ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1); 1013 aServiceNames[0] = getServiceName_Static(); 1014 xFactory = ::cppu::createSingleFactory( 1015 pServiceManager, getImplementationName_Static(), 1016 &TextSearch_CreateInstance, aServiceNames ); 1017 } 1018 1019 if ( xFactory.is() ) 1020 { 1021 xFactory->acquire(); 1022 pRet = xFactory.get(); 1023 } 1024 1025 return pRet; 1026 } 1027 1028 } // extern "C" 1029