1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26
27 #include "textsearch.hxx"
28 #include "levdis.hxx"
29 #include <com/sun/star/lang/Locale.hpp>
30 #include <com/sun/star/lang/XMultiServiceFactory.hpp>
31 #include <comphelper/processfactory.hxx>
32 #include <com/sun/star/i18n/UnicodeType.hpp>
33 #include <com/sun/star/util/SearchFlags.hpp>
34 #include <com/sun/star/i18n/WordType.hpp>
35 #include <com/sun/star/i18n/ScriptType.hpp>
36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
37 #include <com/sun/star/i18n/KCharacterType.hpp>
38 #include <com/sun/star/registry/XRegistryKey.hpp>
39 #include <cppuhelper/factory.hxx>
40 #include <cppuhelper/weak.hxx>
41
42 #ifdef _MSC_VER
43 // get rid of that dumb compiler warning
44 // identifier was truncated to '255' characters in the debug information
45 // for STL template usage, if .pdb files are to be created
46 #pragma warning( disable: 4786 )
47 #endif
48
49 #include <string.h>
50
51 using namespace ::com::sun::star::util;
52 using namespace ::com::sun::star::uno;
53 using namespace ::com::sun::star::lang;
54 using namespace ::com::sun::star::i18n;
55 using namespace ::rtl;
56
57 static sal_Int32 COMPLEX_TRANS_MASK_TMP =
58 TransliterationModules_ignoreBaFa_ja_JP |
59 TransliterationModules_ignoreIterationMark_ja_JP |
60 TransliterationModules_ignoreTiJi_ja_JP |
61 TransliterationModules_ignoreHyuByu_ja_JP |
62 TransliterationModules_ignoreSeZe_ja_JP |
63 TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
64 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
65 TransliterationModules_ignoreProlongedSoundMark_ja_JP;
66 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
67 static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
68 static const sal_Int32 REGEX_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE);
69 // Above 2 transliteration is simple but need to take effect in
70 // complex transliteration
71
TextSearch(const Reference<XMultiServiceFactory> & rxMSF)72 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF)
73 : xMSF( rxMSF )
74 , pJumpTable( 0 )
75 , pJumpTable2( 0 )
76 , pRegexMatcher( NULL )
77 , pWLD( 0 )
78 {
79 SearchOptions aOpt;
80 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
81 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
82 //aOpt.Locale = ???;
83 setOptions( aOpt );
84 }
85
~TextSearch()86 TextSearch::~TextSearch()
87 {
88 delete pRegexMatcher;
89 delete pWLD;
90 delete pJumpTable;
91 delete pJumpTable2;
92 }
93
setOptions(const SearchOptions & rOptions)94 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
95 {
96 aSrchPara = rOptions;
97
98 delete pRegexMatcher, pRegexMatcher = NULL;
99 delete pWLD, pWLD = 0;
100 delete pJumpTable, pJumpTable = 0;
101 delete pJumpTable2, pJumpTable2 = 0;
102
103 // Create Transliteration class
104 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
105 {
106 if( !xTranslit.is() )
107 {
108 Reference < XInterface > xI = xMSF->createInstance(
109 OUString::createFromAscii(
110 "com.sun.star.i18n.Transliteration"));
111 if ( xI.is() )
112 xI->queryInterface( ::getCppuType(
113 (const Reference< XExtendedTransliteration >*)0))
114 >>= xTranslit;
115 }
116 // Load transliteration module
117 if( xTranslit.is() )
118 xTranslit->loadModule(
119 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ),
120 aSrchPara.Locale);
121 }
122 else if( xTranslit.is() )
123 xTranslit = 0;
124
125 // Create Transliteration for 2<->1, 2<->2 transliteration
126 if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
127 {
128 if( !xTranslit2.is() )
129 {
130 Reference < XInterface > xI = xMSF->createInstance(
131 OUString::createFromAscii(
132 "com.sun.star.i18n.Transliteration"));
133 if ( xI.is() )
134 xI->queryInterface( ::getCppuType(
135 (const Reference< XExtendedTransliteration >*)0))
136 >>= xTranslit2;
137 }
138 // Load transliteration module
139 if( xTranslit2.is() )
140 xTranslit2->loadModule(
141 (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ),
142 aSrchPara.Locale);
143 }
144
145 if ( !xBreak.is() )
146 {
147 Reference < XInterface > xI = xMSF->createInstance(
148 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator"));
149 if( xI.is() )
150 xI->queryInterface( ::getCppuType(
151 (const Reference< XBreakIterator >*)0))
152 >>= xBreak;
153 }
154
155 sSrchStr = aSrchPara.searchString;
156
157 // use transliteration here
158 if ( xTranslit.is() &&
159 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
160 sSrchStr = xTranslit->transliterateString2String(
161 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
162
163 if ( xTranslit2.is() &&
164 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
165 sSrchStr2 = xTranslit2->transliterateString2String(
166 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
167
168 // When start or end of search string is a complex script type, we need to
169 // make sure the result boundary is not located in the middle of cell.
170 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
171 ScriptType::COMPLEX));
172 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
173 sSrchStr.getLength()-1) == ScriptType::COMPLEX));
174
175 switch( aSrchPara.algorithmType)
176 {
177 case SearchAlgorithms_REGEXP:
178 fnForward = &TextSearch::RESrchFrwrd;
179 fnBackward = &TextSearch::RESrchBkwrd;
180 RESrchPrepare( aSrchPara);
181 break;
182
183 case SearchAlgorithms_APPROXIMATE:
184 fnForward = &TextSearch::ApproxSrchFrwrd;
185 fnBackward = &TextSearch::ApproxSrchBkwrd;
186
187 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
188 aSrchPara.insertedChars, aSrchPara.deletedChars,
189 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
190
191 nLimit = pWLD->GetLimit();
192 break;
193
194 default:
195 fnForward = &TextSearch::NSrchFrwrd;
196 fnBackward = &TextSearch::NSrchBkwrd;
197 break;
198 }
199 }
200
FindPosInSeq_Impl(const Sequence<sal_Int32> & rOff,sal_Int32 nPos)201 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
202 {
203 sal_Int32 nRet = 0, nEnd = rOff.getLength();
204 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
205 return nRet;
206 }
207
isCellStart(const OUString & searchStr,sal_Int32 nPos)208 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
209 throw( RuntimeException )
210 {
211 sal_Int32 nDone;
212 return nPos == xBreak->previousCharacters(searchStr, nPos+1,
213 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
214 }
215
searchForward(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)216 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
217 throw( RuntimeException )
218 {
219 SearchResult sres;
220
221 OUString in_str(searchStr);
222 sal_Int32 newStartPos = startPos;
223 sal_Int32 newEndPos = endPos;
224
225 bUsePrimarySrchStr = true;
226
227 if ( xTranslit.is() )
228 {
229 // apply normal transliteration (1<->1, 1<->0)
230 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
231 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
232
233 // JP 20.6.2001: also the start and end positions must be corrected!
234 if( startPos )
235 newStartPos = FindPosInSeq_Impl( offset, startPos );
236
237 if( endPos < searchStr.getLength() )
238 newEndPos = FindPosInSeq_Impl( offset, endPos );
239 else
240 newEndPos = in_str.getLength();
241
242 sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
243
244 sal_Int32 nOffsetLength = offset.getLength();
245 sal_Int32 nStartOffset = 0;
246 for ( int k = 0; k < sres.startOffset.getLength(); k++ )
247 {
248 nStartOffset = sres.startOffset[k];
249 if ( nStartOffset )
250 {
251 if ( nStartOffset < nOffsetLength )
252 sres.startOffset[k] = offset[nStartOffset];
253 else
254 sres.startOffset[k] = offset[offset.getLength()-1] +1;
255 }
256 // JP 20.6.2001: end is ever exclusive and then don't return
257 // the position of the next character - return the
258 // next position behind the last found character!
259 // "a b c" find "b" must return 2,3 and not 2,4!!!
260 if (sres.endOffset[k])
261 sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1;
262 }
263 }
264 else
265 {
266 sres = (this->*fnForward)( in_str, startPos, endPos );
267 }
268
269 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP)
270 {
271 SearchResult sres2;
272
273 in_str = OUString(searchStr);
274 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
275
276 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
277
278 if( startPos )
279 startPos = FindPosInSeq_Impl( offset, startPos );
280
281 if( endPos < searchStr.getLength() )
282 endPos = FindPosInSeq_Impl( offset, endPos );
283 else
284 endPos = in_str.getLength();
285
286 bUsePrimarySrchStr = false;
287 sres2 = (this->*fnForward)( in_str, startPos, endPos );
288
289 for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
290 {
291 if (sres2.startOffset[k])
292 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
293 if (sres2.endOffset[k])
294 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
295 }
296
297 // pick first and long one
298 if ( sres.subRegExpressions == 0)
299 return sres2;
300 if ( sres2.subRegExpressions == 1)
301 {
302 if ( sres.startOffset[0] > sres2.startOffset[0])
303 return sres2;
304 else if ( sres.startOffset[0] == sres2.startOffset[0] &&
305 sres.endOffset[0] < sres2.endOffset[0])
306 return sres2;
307 }
308 }
309
310 return sres;
311 }
312
searchBackward(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)313 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
314 throw(RuntimeException)
315 {
316 SearchResult sres;
317
318 OUString in_str(searchStr);
319 sal_Int32 newStartPos = startPos;
320 sal_Int32 newEndPos = endPos;
321
322 bUsePrimarySrchStr = true;
323
324 if ( xTranslit.is() )
325 {
326 // apply only simple 1<->1 transliteration here
327 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
328 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
329
330 // JP 20.6.2001: also the start and end positions must be corrected!
331 if( startPos < searchStr.getLength() )
332 newStartPos = FindPosInSeq_Impl( offset, startPos );
333 else
334 newStartPos = in_str.getLength();
335
336 if( endPos )
337 newEndPos = FindPosInSeq_Impl( offset, endPos );
338
339 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
340
341 sal_Int32 nOffsetLength = offset.getLength();
342 sal_Int32 nEndOffset = 0;
343 for ( int k = 0; k < sres.startOffset.getLength(); k++ )
344 {
345 if (sres.startOffset[k])
346 sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1;
347 // JP 20.6.2001: end is ever exclusive and then don't return
348 // the position of the next character - return the
349 // next position behind the last found character!
350 // "a b c" find "b" must return 2,3 and not 2,4!!!
351 nEndOffset = sres.endOffset[k];
352 if ( nEndOffset )
353 {
354 if ( nEndOffset < nOffsetLength )
355 sres.endOffset[k] = offset[nEndOffset];
356 else
357 sres.endOffset[k] = offset[offset.getLength()-1] +1;
358 }
359 }
360 }
361 else
362 {
363 sres = (this->*fnBackward)( in_str, startPos, endPos );
364 }
365
366 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP )
367 {
368 SearchResult sres2;
369
370 in_str = OUString(searchStr);
371 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
372
373 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
374
375 if( startPos < searchStr.getLength() )
376 startPos = FindPosInSeq_Impl( offset, startPos );
377 else
378 startPos = in_str.getLength();
379
380 if( endPos )
381 endPos = FindPosInSeq_Impl( offset, endPos );
382
383 bUsePrimarySrchStr = false;
384 sres2 = (this->*fnBackward)( in_str, startPos, endPos );
385
386 for( int k = 0; k < sres2.startOffset.getLength(); k++ )
387 {
388 if (sres2.startOffset[k])
389 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
390 if (sres2.endOffset[k])
391 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
392 }
393
394 // pick last and long one
395 if ( sres.subRegExpressions == 0 )
396 return sres2;
397 if ( sres2.subRegExpressions == 1 )
398 {
399 if ( sres.startOffset[0] < sres2.startOffset[0] )
400 return sres2;
401 if ( sres.startOffset[0] == sres2.startOffset[0] &&
402 sres.endOffset[0] > sres2.endOffset[0] )
403 return sres2;
404 }
405 }
406
407 return sres;
408 }
409
410 //---------------------------------------------------------------------
411
IsDelimiter(const OUString & rStr,sal_Int32 nPos) const412 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
413 {
414 bool bRet = 1;
415 if( '\x7f' != rStr[nPos])
416 {
417 if ( !xCharClass.is() )
418 {
419 Reference < XInterface > xI = xMSF->createInstance(
420 OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification"));
421 if( xI.is() )
422 xI->queryInterface( ::getCppuType(
423 (const Reference< XCharacterClassification >*)0))
424 >>= xCharClass;
425 }
426 if ( xCharClass.is() )
427 {
428 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
429 aSrchPara.Locale );
430 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
431 KCharacterType::LETTER ) & nCType ) )
432 bRet = 0;
433 }
434 }
435 return bRet;
436 }
437
438 // --------- helper methods for Boyer-Moore like text searching ----------
439 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
440
MakeForwardTab()441 void TextSearch::MakeForwardTab()
442 {
443 // create the jumptable for the search text
444 if( pJumpTable )
445 {
446 if( bIsForwardTab )
447 return ; // the jumpTable is ok
448 delete pJumpTable;
449 }
450 bIsForwardTab = true;
451
452 sal_Int32 n, nLen = sSrchStr.getLength();
453 pJumpTable = new TextSearchJumpTable;
454
455 for( n = 0; n < nLen - 1; ++n )
456 {
457 sal_Unicode cCh = sSrchStr[n];
458 sal_Int32 nDiff = nLen - n - 1;
459 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
460
461 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
462 pJumpTable->insert( aEntry );
463 if ( !aPair.second )
464 (*(aPair.first)).second = nDiff;
465 }
466 }
467
MakeForwardTab2()468 void TextSearch::MakeForwardTab2()
469 {
470 // create the jumptable for the search text
471 if( pJumpTable2 )
472 {
473 if( bIsForwardTab )
474 return ; // the jumpTable is ok
475 delete pJumpTable2;
476 }
477 bIsForwardTab = true;
478
479 sal_Int32 n, nLen = sSrchStr2.getLength();
480 pJumpTable2 = new TextSearchJumpTable;
481
482 for( n = 0; n < nLen - 1; ++n )
483 {
484 sal_Unicode cCh = sSrchStr2[n];
485 sal_Int32 nDiff = nLen - n - 1;
486
487 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
488 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
489 pJumpTable2->insert( aEntry );
490 if ( !aPair.second )
491 (*(aPair.first)).second = nDiff;
492 }
493 }
494
MakeBackwardTab()495 void TextSearch::MakeBackwardTab()
496 {
497 // create the jumptable for the search text
498 if( pJumpTable )
499 {
500 if( !bIsForwardTab )
501 return ; // the jumpTable is ok
502 delete pJumpTable;
503 }
504 bIsForwardTab = false;
505
506 sal_Int32 n, nLen = sSrchStr.getLength();
507 pJumpTable = new TextSearchJumpTable;
508
509 for( n = nLen-1; n > 0; --n )
510 {
511 sal_Unicode cCh = sSrchStr[n];
512 TextSearchJumpTable::value_type aEntry( cCh, n );
513 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
514 pJumpTable->insert( aEntry );
515 if ( !aPair.second )
516 (*(aPair.first)).second = n;
517 }
518 }
519
MakeBackwardTab2()520 void TextSearch::MakeBackwardTab2()
521 {
522 // create the jumptable for the search text
523 if( pJumpTable2 )
524 {
525 if( !bIsForwardTab )
526 return ; // the jumpTable is ok
527 delete pJumpTable2;
528 }
529 bIsForwardTab = false;
530
531 sal_Int32 n, nLen = sSrchStr2.getLength();
532 pJumpTable2 = new TextSearchJumpTable;
533
534 for( n = nLen-1; n > 0; --n )
535 {
536 sal_Unicode cCh = sSrchStr2[n];
537 TextSearchJumpTable::value_type aEntry( cCh, n );
538 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
539 pJumpTable2->insert( aEntry );
540 if ( !aPair.second )
541 (*(aPair.first)).second = n;
542 }
543 }
544
GetDiff(const sal_Unicode cChr) const545 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
546 {
547 TextSearchJumpTable *pJump;
548 OUString sSearchKey;
549
550 if ( bUsePrimarySrchStr ) {
551 pJump = pJumpTable;
552 sSearchKey = sSrchStr;
553 } else {
554 pJump = pJumpTable2;
555 sSearchKey = sSrchStr2;
556 }
557
558 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
559 if ( iLook == pJump->end() )
560 return sSearchKey.getLength();
561 return (*iLook).second;
562 }
563
564
565 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
NSrchFrwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)566 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
567 throw(RuntimeException)
568 {
569 SearchResult aRet;
570 aRet.subRegExpressions = 0;
571
572 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
573
574 OUString aStr( searchStr );
575 sal_Int32 nSuchIdx = aStr.getLength();
576 sal_Int32 nEnde = endPos;
577 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
578 return aRet;
579
580
581 if( nEnde < sSearchKey.getLength() ) // position inside the search region ?
582 return aRet;
583
584 nEnde -= sSearchKey.getLength();
585
586 if (bUsePrimarySrchStr)
587 MakeForwardTab(); // create the jumptable
588 else
589 MakeForwardTab2();
590
591 for (sal_Int32 nCmpIdx = startPos; // start position for the search
592 nCmpIdx <= nEnde;
593 nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1]))
594 {
595 // if the match would be the completed cells, skip it.
596 if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd
597 && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) )
598 continue;
599
600 nSuchIdx = sSearchKey.getLength() - 1;
601 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx])
602 {
603 if( nSuchIdx == 0 )
604 {
605 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
606 {
607 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
608 bool bAtStart = !nCmpIdx;
609 bool bAtEnd = nFndEnd == endPos;
610 bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 );
611 bool bDelimBehind = IsDelimiter( aStr, nFndEnd );
612 // * 1 -> only one word in the paragraph
613 // * 2 -> at begin of paragraph
614 // * 3 -> at end of paragraph
615 // * 4 -> inside the paragraph
616 if( !( ( bAtStart && bAtEnd ) || // 1
617 ( bAtStart && bDelimBehind ) || // 2
618 ( bAtEnd && bDelimBefore ) || // 3
619 ( bDelimBefore && bDelimBehind ))) // 4
620 break;
621 }
622
623 aRet.subRegExpressions = 1;
624 aRet.startOffset.realloc( 1 );
625 aRet.startOffset[ 0 ] = nCmpIdx;
626 aRet.endOffset.realloc( 1 );
627 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
628
629 return aRet;
630 }
631 else
632 nSuchIdx--;
633 }
634 }
635 return aRet;
636 }
637
NSrchBkwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)638 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
639 throw(RuntimeException)
640 {
641 SearchResult aRet;
642 aRet.subRegExpressions = 0;
643
644 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
645
646 OUString aStr( searchStr );
647 sal_Int32 nSuchIdx = aStr.getLength();
648 sal_Int32 nEnde = endPos;
649 if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx)
650 return aRet;
651
652 if (bUsePrimarySrchStr)
653 MakeBackwardTab(); // create the jumptable
654 else
655 MakeBackwardTab2();
656
657 if( nEnde == nSuchIdx ) // end position for the search
658 nEnde = sSearchKey.getLength();
659 else
660 nEnde += sSearchKey.getLength();
661
662 sal_Int32 nCmpIdx = startPos; // start position for the search
663
664 while (nCmpIdx >= nEnde)
665 {
666 // if the match would be the completed cells, skip it.
667 if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx -
668 sSearchKey.getLength() )) && (!checkCTLEnd ||
669 isCellStart( aStr, nCmpIdx)))
670 {
671 nSuchIdx = 0;
672 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
673 aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
674 nSuchIdx++;
675 if( nSuchIdx >= sSearchKey.getLength() )
676 {
677 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
678 {
679 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
680 bool bAtStart = !nFndStt;
681 bool bAtEnd = nCmpIdx == startPos;
682 bool bDelimBehind = IsDelimiter( aStr, nCmpIdx );
683 bool bDelimBefore = bAtStart || // begin of paragraph
684 IsDelimiter( aStr, nFndStt-1 );
685 // * 1 -> only one word in the paragraph
686 // * 2 -> at begin of paragraph
687 // * 3 -> at end of paragraph
688 // * 4 -> inside the paragraph
689 if( ( bAtStart && bAtEnd ) || // 1
690 ( bAtStart && bDelimBehind ) || // 2
691 ( bAtEnd && bDelimBefore ) || // 3
692 ( bDelimBefore && bDelimBehind )) // 4
693 {
694 aRet.subRegExpressions = 1;
695 aRet.startOffset.realloc( 1 );
696 aRet.startOffset[ 0 ] = nCmpIdx;
697 aRet.endOffset.realloc( 1 );
698 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
699 return aRet;
700 }
701 }
702 else
703 {
704 aRet.subRegExpressions = 1;
705 aRet.startOffset.realloc( 1 );
706 aRet.startOffset[ 0 ] = nCmpIdx;
707 aRet.endOffset.realloc( 1 );
708 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
709 return aRet;
710 }
711 }
712 }
713 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
714 if( nCmpIdx < nSuchIdx )
715 return aRet;
716 nCmpIdx -= nSuchIdx;
717 }
718 return aRet;
719 }
720
RESrchPrepare(const::com::sun::star::util::SearchOptions & rOptions)721 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
722 {
723 // select the transliterated pattern string
724 const OUString& rPatternStr =
725 (rOptions.transliterateFlags & REGEX_TRANS_MASK) ? sSrchStr
726 : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
727
728 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
729 // map com::sun::star::util::SearchFlags to ICU uregex.h flags
730 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
731 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
732 // REG_NOSUB is not used anywhere => not implemented
733 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
734 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
735 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
736 // probably because the transliteration flag IGNORE_CASE handles it as well.
737 if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0
738 || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0)
739 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
740 UErrorCode nIcuErr = U_ZERO_ERROR;
741 // assumption: transliteration didn't mangle regexp control chars
742 IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
743 #ifndef DISABLE_WORDBOUND_EMULATION
744 // for conveniance specific syntax elements of the old regex engine are emulated
745 // - by replacing \< with "word-break followed by a look-ahead word-char"
746 static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
747 static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
748 static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
749 aChevronMatcherB.reset( aIcuSearchPatStr);
750 aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
751 aChevronMatcherB.reset();
752 // - by replacing \> with "look-behind word-char followed by a word-break"
753 static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
754 static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
755 static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
756 aChevronMatcherE.reset( aIcuSearchPatStr);
757 aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
758 aChevronMatcherE.reset();
759 #endif
760 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
761 if( nIcuErr)
762 { delete pRegexMatcher; pRegexMatcher = NULL;}
763 }
764
765 //---------------------------------------------------------------------------
766
RESrchFrwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)767 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
768 sal_Int32 startPos, sal_Int32 endPos )
769 throw(RuntimeException)
770 {
771 SearchResult aRet;
772 aRet.subRegExpressions = 0;
773 if( !pRegexMatcher)
774 return aRet;
775
776 if( endPos > searchStr.getLength())
777 endPos = searchStr.getLength();
778
779 // use the ICU RegexMatcher to find the matches
780 UErrorCode nIcuErr = U_ZERO_ERROR;
781 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos);
782 pRegexMatcher->reset( aSearchTargetStr);
783 // search until there is a valid match
784 for(;;)
785 {
786 if( !pRegexMatcher->find( startPos, nIcuErr))
787 return aRet;
788
789 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
790 int nStartOfs = pRegexMatcher->start( nIcuErr);
791 int nEndOfs = pRegexMatcher->end( nIcuErr);
792 if( nStartOfs < nEndOfs)
793 break;
794 // try at next position if there was a zero-length match
795 if( ++startPos >= endPos)
796 return aRet;
797 }
798
799 // extract the result of the search
800 const int nGroupCount = pRegexMatcher->groupCount();
801 aRet.subRegExpressions = nGroupCount + 1;
802 aRet.startOffset.realloc( aRet.subRegExpressions);
803 aRet.endOffset.realloc( aRet.subRegExpressions);
804 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
805 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr);
806 for( int i = 1; i <= nGroupCount; ++i) {
807 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
808 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr);
809 }
810
811 return aRet;
812 }
813
RESrchBkwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)814 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
815 sal_Int32 startPos, sal_Int32 endPos )
816 throw(RuntimeException)
817 {
818 // NOTE: for backwards search callers provide startPos/endPos inverted!
819 SearchResult aRet;
820 aRet.subRegExpressions = 0;
821 if( !pRegexMatcher)
822 return aRet;
823
824 if( startPos > searchStr.getLength())
825 startPos = searchStr.getLength();
826
827 // use the ICU RegexMatcher to find the matches
828 // TODO: use ICU's backward searching once it becomes available
829 // as its replacement using forward search is not as good as the real thing
830 UErrorCode nIcuErr = U_ZERO_ERROR;
831 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos);
832 pRegexMatcher->reset( aSearchTargetStr);
833 if( !pRegexMatcher->find( endPos, nIcuErr))
834 return aRet;
835
836 // find the last match
837 int nLastPos = 0;
838 int nFoundEnd = 0;
839 do {
840 nLastPos = pRegexMatcher->start( nIcuErr);
841 nFoundEnd = pRegexMatcher->end( nIcuErr);
842 if( nFoundEnd >= startPos)
843 break;
844 if( nFoundEnd == nLastPos)
845 ++nFoundEnd;
846 } while( pRegexMatcher->find( nFoundEnd, nIcuErr));
847
848 // find last match again to get its details
849 pRegexMatcher->find( nLastPos, nIcuErr);
850
851 // fill in the details of the last match
852 const int nGroupCount = pRegexMatcher->groupCount();
853 aRet.subRegExpressions = nGroupCount + 1;
854 aRet.startOffset.realloc( aRet.subRegExpressions);
855 aRet.endOffset.realloc( aRet.subRegExpressions);
856 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
857 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
858 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr);
859 for( int i = 1; i <= nGroupCount; ++i) {
860 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
861 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr);
862 }
863
864 return aRet;
865 }
866
867 //---------------------------------------------------------------------------
868
869 // search for words phonetically
ApproxSrchFrwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)870 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
871 sal_Int32 startPos, sal_Int32 endPos )
872 throw(RuntimeException)
873 {
874 SearchResult aRet;
875 aRet.subRegExpressions = 0;
876
877 if( !xBreak.is() )
878 return aRet;
879
880 OUString aWTemp( searchStr );
881
882 register sal_Int32 nStt, nEnd;
883
884 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
885 aSrchPara.Locale,
886 WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
887
888 do
889 {
890 if( aWBnd.startPos >= endPos )
891 break;
892 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
893 nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos;
894
895 if( nStt < nEnd &&
896 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
897 {
898 aRet.subRegExpressions = 1;
899 aRet.startOffset.realloc( 1 );
900 aRet.startOffset[ 0 ] = nStt;
901 aRet.endOffset.realloc( 1 );
902 aRet.endOffset[ 0 ] = nEnd;
903 break;
904 }
905
906 nStt = nEnd - 1;
907 aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale,
908 WordType::ANYWORD_IGNOREWHITESPACES);
909 } while( aWBnd.startPos != aWBnd.endPos ||
910 (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) );
911 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
912 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
913 // and nextWord() does also => don't loop forever.
914 return aRet;
915 }
916
ApproxSrchBkwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)917 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
918 sal_Int32 startPos, sal_Int32 endPos )
919 throw(RuntimeException)
920 {
921 SearchResult aRet;
922 aRet.subRegExpressions = 0;
923
924 if( !xBreak.is() )
925 return aRet;
926
927 OUString aWTemp( searchStr );
928
929 register sal_Int32 nStt, nEnd;
930
931 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
932 aSrchPara.Locale,
933 WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
934
935 do
936 {
937 if( aWBnd.endPos <= endPos )
938 break;
939 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
940 nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos;
941
942 if( nStt < nEnd &&
943 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
944 {
945 aRet.subRegExpressions = 1;
946 aRet.startOffset.realloc( 1 );
947 aRet.startOffset[ 0 ] = nEnd;
948 aRet.endOffset.realloc( 1 );
949 aRet.endOffset[ 0 ] = nStt;
950 break;
951 }
952 if( !nStt )
953 break;
954
955 aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale,
956 WordType::ANYWORD_IGNOREWHITESPACES);
957 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() );
958 return aRet;
959 }
960
961
962 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch";
963 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
964
getServiceName_Static()965 static OUString getServiceName_Static()
966 {
967 return OUString::createFromAscii( cSearchName );
968 }
969
getImplementationName_Static()970 static OUString getImplementationName_Static()
971 {
972 return OUString::createFromAscii( cSearchImpl );
973 }
974
975 OUString SAL_CALL
getImplementationName()976 TextSearch::getImplementationName()
977 throw( RuntimeException )
978 {
979 return getImplementationName_Static();
980 }
981
982 sal_Bool SAL_CALL
supportsService(const OUString & rServiceName)983 TextSearch::supportsService(const OUString& rServiceName)
984 throw( RuntimeException )
985 {
986 return !rServiceName.compareToAscii( cSearchName );
987 }
988
989 Sequence< OUString > SAL_CALL
getSupportedServiceNames(void)990 TextSearch::getSupportedServiceNames(void) throw( RuntimeException )
991 {
992 Sequence< OUString > aRet(1);
993 aRet[0] = getServiceName_Static();
994 return aRet;
995 }
996
997 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface >
TextSearch_CreateInstance(const::com::sun::star::uno::Reference<::com::sun::star::lang::XMultiServiceFactory> & rxMSF)998 SAL_CALL TextSearch_CreateInstance(
999 const ::com::sun::star::uno::Reference<
1000 ::com::sun::star::lang::XMultiServiceFactory >& rxMSF )
1001 {
1002 return ::com::sun::star::uno::Reference<
1003 ::com::sun::star::uno::XInterface >(
1004 (::cppu::OWeakObject*) new TextSearch( rxMSF ) );
1005 }
1006
1007 extern "C"
1008 {
1009
component_getImplementationEnvironment(const sal_Char ** ppEnvTypeName,uno_Environment **)1010 void SAL_CALL component_getImplementationEnvironment(
1011 const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ )
1012 {
1013 *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME;
1014 }
1015
component_getFactory(const sal_Char * sImplementationName,void * _pServiceManager,void *)1016 void* SAL_CALL component_getFactory( const sal_Char* sImplementationName,
1017 void* _pServiceManager, void* /*_pRegistryKey*/ )
1018 {
1019 void* pRet = NULL;
1020
1021 ::com::sun::star::lang::XMultiServiceFactory* pServiceManager =
1022 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* >
1023 ( _pServiceManager );
1024 ::com::sun::star::uno::Reference<
1025 ::com::sun::star::lang::XSingleServiceFactory > xFactory;
1026
1027 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) )
1028 {
1029 ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1);
1030 aServiceNames[0] = getServiceName_Static();
1031 xFactory = ::cppu::createSingleFactory(
1032 pServiceManager, getImplementationName_Static(),
1033 &TextSearch_CreateInstance, aServiceNames );
1034 }
1035
1036 if ( xFactory.is() )
1037 {
1038 xFactory->acquire();
1039 pRet = xFactory.get();
1040 }
1041
1042 return pRet;
1043 }
1044
1045 } // extern "C"
1046