textsearch.cxx (cdf0e10c) textsearch.cxx (cc450e3a)
1/*************************************************************************
2 *
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
6 *
7 * OpenOffice.org - a multi-platform office productivity suite
8 *

--- 16 unchanged lines hidden (view full) ---

25 *
26 ************************************************************************/
27
28// MARKER(update_precomp.py): autogen include statement, do not remove
29#include "precompiled_i18npool.hxx"
30
31#include "textsearch.hxx"
32#include "levdis.hxx"
1/*************************************************************************
2 *
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
6 *
7 * OpenOffice.org - a multi-platform office productivity suite
8 *

--- 16 unchanged lines hidden (view full) ---

25 *
26 ************************************************************************/
27
28// MARKER(update_precomp.py): autogen include statement, do not remove
29#include "precompiled_i18npool.hxx"
30
31#include "textsearch.hxx"
32#include "levdis.hxx"
33#include <regexp/reclass.hxx>
34#include <com/sun/star/lang/Locale.hpp>
35#include <com/sun/star/lang/XMultiServiceFactory.hpp>
36#include <comphelper/processfactory.hxx>
37#include <com/sun/star/i18n/UnicodeType.hpp>
38#include <com/sun/star/util/SearchFlags.hpp>
39#include <com/sun/star/i18n/WordType.hpp>
40#include <com/sun/star/i18n/ScriptType.hpp>
41#include <com/sun/star/i18n/CharacterIteratorMode.hpp>

--- 21 unchanged lines hidden (view full) ---

63 TransliterationModules_ignoreBaFa_ja_JP |
64 TransliterationModules_ignoreIterationMark_ja_JP |
65 TransliterationModules_ignoreTiJi_ja_JP |
66 TransliterationModules_ignoreHyuByu_ja_JP |
67 TransliterationModules_ignoreSeZe_ja_JP |
68 TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
69 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
70 TransliterationModules_ignoreProlongedSoundMark_ja_JP;
33#include <com/sun/star/lang/Locale.hpp>
34#include <com/sun/star/lang/XMultiServiceFactory.hpp>
35#include <comphelper/processfactory.hxx>
36#include <com/sun/star/i18n/UnicodeType.hpp>
37#include <com/sun/star/util/SearchFlags.hpp>
38#include <com/sun/star/i18n/WordType.hpp>
39#include <com/sun/star/i18n/ScriptType.hpp>
40#include <com/sun/star/i18n/CharacterIteratorMode.hpp>

--- 21 unchanged lines hidden (view full) ---

62 TransliterationModules_ignoreBaFa_ja_JP |
63 TransliterationModules_ignoreIterationMark_ja_JP |
64 TransliterationModules_ignoreTiJi_ja_JP |
65 TransliterationModules_ignoreHyuByu_ja_JP |
66 TransliterationModules_ignoreSeZe_ja_JP |
67 TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
68 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
69 TransliterationModules_ignoreProlongedSoundMark_ja_JP;
71static const sal_Int32 SIMPLE_TRANS_MASK = 0xffffffff ^ COMPLEX_TRANS_MASK_TMP;
72static const sal_Int32 COMPLEX_TRANS_MASK =
73 COMPLEX_TRANS_MASK_TMP |
74 TransliterationModules_IGNORE_KANA |
75 TransliterationModules_IGNORE_WIDTH;
70static const sal_Int32 SIMPLE_TRANS_MASK = ~(COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_WIDTH) | TransliterationModules_FULLWIDTH_HALFWIDTH;
71static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
76 // The two transliterations above are simple, but they also need to take
77 // effect in complex transliteration
78
79TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF)
80 : xMSF( rxMSF )
81 , pJumpTable( 0 )
82 , pJumpTable2( 0 )
72 // The two transliterations above are simple, but they also need to take
73 // effect in complex transliteration
74
75TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF)
76 : xMSF( rxMSF )
77 , pJumpTable( 0 )
78 , pJumpTable2( 0 )
83 , pRegExp( 0 )
79 , pRegexMatcher( NULL )
84 , pWLD( 0 )
85{
86 SearchOptions aOpt;
87 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
88 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
89 //aOpt.Locale = ???;
90 setOptions( aOpt );
91}
92
93TextSearch::~TextSearch()
94{
80 , pWLD( 0 )
81{
82 SearchOptions aOpt;
83 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
84 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
85 //aOpt.Locale = ???;
86 setOptions( aOpt );
87}
88
89TextSearch::~TextSearch()
90{
95 delete pRegExp;
91 delete pRegexMatcher;
96 delete pWLD;
97 delete pJumpTable;
98 delete pJumpTable2;
99}
100
101void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
102{
103 aSrchPara = rOptions;
104
92 delete pWLD;
93 delete pJumpTable;
94 delete pJumpTable2;
95}
96
97void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
98{
99 aSrchPara = rOptions;
100
105 delete pRegExp, pRegExp = 0;
101 delete pRegexMatcher, pRegexMatcher = NULL;
106 delete pWLD, pWLD = 0;
107 delete pJumpTable, pJumpTable = 0;
108 delete pJumpTable2, pJumpTable2 = 0;
109
110 // Create Transliteration class
111 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
112 {
113 if( !xTranslit.is() )

--- 42 unchanged lines hidden (view full) ---

156 if( xI.is() )
157 xI->queryInterface( ::getCppuType(
158 (const Reference< XBreakIterator >*)0))
159 >>= xBreak;
160 }
161
162 sSrchStr = aSrchPara.searchString;
163
102 delete pWLD, pWLD = 0;
103 delete pJumpTable, pJumpTable = 0;
104 delete pJumpTable2, pJumpTable2 = 0;
105
106 // Create Transliteration class
107 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
108 {
109 if( !xTranslit.is() )

--- 42 unchanged lines hidden (view full) ---

152 if( xI.is() )
153 xI->queryInterface( ::getCppuType(
154 (const Reference< XBreakIterator >*)0))
155 >>= xBreak;
156 }
157
158 sSrchStr = aSrchPara.searchString;
159
164 // use transliteration here, but only if not RegEx, which does it different
165 if ( aSrchPara.algorithmType != SearchAlgorithms_REGEXP && xTranslit.is() &&
160 // use transliteration here
161 if ( xTranslit.is() &&
166 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
167 sSrchStr = xTranslit->transliterateString2String(
168 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
169
162 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
163 sSrchStr = xTranslit->transliterateString2String(
164 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
165
170 if ( aSrchPara.algorithmType != SearchAlgorithms_REGEXP && xTranslit2.is() &&
166 if ( xTranslit2.is() &&
171 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
172 sSrchStr2 = xTranslit2->transliterateString2String(
173 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
174
175 // When start or end of search string is a complex script type, we need to
176 // make sure the result boundary is not located in the middle of cell.
177 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
178 ScriptType::COMPLEX));
179 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
180 sSrchStr.getLength()-1) == ScriptType::COMPLEX));
181
167 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
168 sSrchStr2 = xTranslit2->transliterateString2String(
169 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
170
171 // When start or end of search string is a complex script type, we need to
172 // make sure the result boundary is not located in the middle of cell.
173 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
174 ScriptType::COMPLEX));
175 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
176 sSrchStr.getLength()-1) == ScriptType::COMPLEX));
177
182 if ( aSrchPara.algorithmType == SearchAlgorithms_REGEXP )
178 switch( aSrchPara.algorithmType)
183 {
179 {
184 fnForward = &TextSearch::RESrchFrwrd;
185 fnBackward = &TextSearch::RESrchBkwrd;
180 case SearchAlgorithms_REGEXP:
181 fnForward = &TextSearch::RESrchFrwrd;
182 fnBackward = &TextSearch::RESrchBkwrd;
186
183
187 pRegExp = new Regexpr( aSrchPara, xTranslit );
188 }
189 else
190 {
191 if ( aSrchPara.algorithmType == SearchAlgorithms_APPROXIMATE )
192 {
184 {
185 sal_uInt32 nIcuSearchFlags = 0;
186 // map com::sun::star::util::SearchFlags to ICU uregex.h flags
187 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
188 // REG_NEWLINE is neither defined properly nor used anywhere => not implemented
189 // REG_NOSUB is not used anywhere => not implemented
190 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
191 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
192 // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
193 if( (aSrchPara.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
194 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
195 UErrorCode nIcuErr = U_ZERO_ERROR;
196 // assumption: transliteration doesn't mangle regexp control chars
197 OUString& rPatternStr = (aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr
198 : ((aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : aSrchPara.searchString);
199 const IcuUniString aIcuSearchPatStr( rPatternStr.getStr(), rPatternStr.getLength());
200 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
201 if( nIcuErr)
202 { delete pRegexMatcher; pRegexMatcher = NULL;}
203 } break;
204
205 case SearchAlgorithms_APPROXIMATE:
193 fnForward = &TextSearch::ApproxSrchFrwrd;
194 fnBackward = &TextSearch::ApproxSrchBkwrd;
195
196 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
197 aSrchPara.insertedChars, aSrchPara.deletedChars,
198 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
199
200 nLimit = pWLD->GetLimit();
206 fnForward = &TextSearch::ApproxSrchFrwrd;
207 fnBackward = &TextSearch::ApproxSrchBkwrd;
208
209 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
210 aSrchPara.insertedChars, aSrchPara.deletedChars,
211 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
212
213 nLimit = pWLD->GetLimit();
201 }
202 else
203 {
214 break;
215
216 default:
204 fnForward = &TextSearch::NSrchFrwrd;
205 fnBackward = &TextSearch::NSrchBkwrd;
217 fnForward = &TextSearch::NSrchFrwrd;
218 fnBackward = &TextSearch::NSrchBkwrd;
206 }
219 break;
207 }
208}
209
210sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
211{
212 sal_Int32 nRet = 0, nEnd = rOff.getLength();
213 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
214 return nRet;

--- 180 unchanged lines hidden (view full) ---

395 sres.endOffset[0] > sres2.endOffset[0] )
396 return sres2;
397 }
398 }
399
400 return sres;
401}
402
220 }
221}
222
223sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
224{
225 sal_Int32 nRet = 0, nEnd = rOff.getLength();
226 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
227 return nRet;

--- 180 unchanged lines hidden (view full) ---

408 sres.endOffset[0] > sres2.endOffset[0] )
409 return sres2;
410 }
411 }
412
413 return sres;
414}
415
416//---------------------------------------------------------------------
403
417
404
405//--------------- the word delimiters ---------------------------------
406
407bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
408{
409 bool bRet = 1;
410 if( '\x7f' != rStr[nPos])
411 {
412 if ( !xCharClass.is() )
413 {
414 Reference < XInterface > xI = xMSF->createInstance(

--- 10 unchanged lines hidden (view full) ---

425 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
426 KCharacterType::LETTER ) & nCType ) )
427 bRet = 0;
428 }
429 }
430 return bRet;
431}
432
418bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
419{
420 bool bRet = 1;
421 if( '\x7f' != rStr[nPos])
422 {
423 if ( !xCharClass.is() )
424 {
425 Reference < XInterface > xI = xMSF->createInstance(

--- 10 unchanged lines hidden (view full) ---

436 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
437 KCharacterType::LETTER ) & nCType ) )
438 bRet = 0;
439 }
440 }
441 return bRet;
442}
443
444// --------- helper methods for Boyer-Moore like text searching ----------
445// TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
433
446
434
435// --------- methods for the Boyer-Moore-like search ------------------
436
437
438void TextSearch::MakeForwardTab()
439{
440 // create the jumptable for the search text
441 if( pJumpTable )
442 {
443 if( bIsForwardTab )
444 return ; // the jumpTable is ok
445 delete pJumpTable;

--- 264 unchanged lines hidden (view full) ---

710 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
711 if( nCmpIdx < nSuchIdx )
712 return aRet;
713 nCmpIdx -= nSuchIdx;
714 }
715 return aRet;
716}
717
447void TextSearch::MakeForwardTab()
448{
449 // create the jumptable for the search text
450 if( pJumpTable )
451 {
452 if( bIsForwardTab )
453 return ; // the jumpTable is ok
454 delete pJumpTable;

--- 264 unchanged lines hidden (view full) ---

719 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
720 if( nCmpIdx < nSuchIdx )
721 return aRet;
722 nCmpIdx -= nSuchIdx;
723 }
724 return aRet;
725}
726
718
719
720//---------------------------------------------------------------------------
727//---------------------------------------------------------------------------
721// ------- methods for searching via regular expressions --------------
722
723SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
724 sal_Int32 startPos, sal_Int32 endPos )
725 throw(RuntimeException)
726{
728
729SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
730 sal_Int32 startPos, sal_Int32 endPos )
731 throw(RuntimeException)
732{
727 SearchResult aRet;
728 aRet.subRegExpressions = 0;
729 OUString aStr( searchStr );
733 SearchResult aRet;
734 aRet.subRegExpressions = 0;
735 if( !pRegexMatcher)
736 return aRet;
737
738 if( endPos > searchStr.getLength())
739 endPos = searchStr.getLength();
730
740
731 bool bSearchInSel = (0 != (( SearchFlags::REG_NOT_BEGINOFLINE |
732 SearchFlags::REG_NOT_ENDOFLINE ) & aSrchPara.searchFlag ));
741 // use the ICU RegexMatcher to find the matches
742 UErrorCode nIcuErr = U_ZERO_ERROR;
743 const IcuUniString aSearchTargetStr( searchStr.getStr(), endPos);
744 pRegexMatcher->reset( aSearchTargetStr);
745 if( !pRegexMatcher->find( startPos, nIcuErr))
746 return aRet;
733
747
734 pRegExp->set_line(aStr.getStr(), bSearchInSel ? endPos : aStr.getLength());
748 aRet.subRegExpressions = 1;
749 aRet.startOffset.realloc( aRet.subRegExpressions);
750 aRet.endOffset.realloc( aRet.subRegExpressions);
751 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
752 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr);
735
753
736 struct re_registers regs;
737
738 // Clear structure
739 memset((void *)&regs, 0, sizeof(struct re_registers));
740 if ( ! pRegExp->re_search(&regs, startPos) )
741 {
742 if( regs.num_of_match > 0 &&
743 (regs.start[0] != -1 && regs.end[0] != -1) )
744 {
745 aRet.startOffset.realloc(regs.num_of_match);
746 aRet.endOffset.realloc(regs.num_of_match);
747
748 sal_Int32 i = 0, j = 0;
749 while( j < regs.num_of_match )
750 {
751 if( regs.start[j] != -1 && regs.end[j] != -1 )
752 {
753 aRet.startOffset[i] = regs.start[j];
754 aRet.endOffset[i] = regs.end[j];
755 ++i;
756 }
757 ++j;
758 }
759 aRet.subRegExpressions = i;
760 }
761 if ( regs.num_regs > 0 )
762 {
763 if ( regs.start )
764 free(regs.start);
765 if ( regs.end )
766 free(regs.end);
767 }
768 }
769
770 return aRet;
754 return aRet;
771}
772
755}
756
773/*
774 * Sucht das Muster aSrchPara.sSrchStr rueckwaerts im String rStr
775 */
776SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
777 sal_Int32 startPos, sal_Int32 endPos )
778 throw(RuntimeException)
779{
757SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
758 sal_Int32 startPos, sal_Int32 endPos )
759 throw(RuntimeException)
760{
780 SearchResult aRet;
781 aRet.subRegExpressions = 0;
782 OUString aStr( searchStr );
761 // NOTE: for backwards search callers provide startPos/endPos inverted!
762 SearchResult aRet;
763 aRet.subRegExpressions = 0;
764 if( !pRegexMatcher)
765 return aRet;
766
767 if( startPos > searchStr.getLength())
768 startPos = searchStr.getLength();
783
769
784 sal_Int32 nOffset = 0;
785 sal_Int32 nStrEnde = aStr.getLength() == endPos ? 0 : endPos;
770 // use the ICU RegexMatcher to find the matches
771 // TODO: use ICU's backward searching once it becomes available
772 UErrorCode nIcuErr = U_ZERO_ERROR;
773 const IcuUniString aSearchTargetStr( searchStr.getStr(), startPos);
774 pRegexMatcher->reset( aSearchTargetStr);
775 if( !pRegexMatcher->find( endPos, nIcuErr))
776 return aRet;
786
777
787 bool bSearchInSel = (0 != (( SearchFlags::REG_NOT_BEGINOFLINE |
788 SearchFlags::REG_NOT_ENDOFLINE ) & aSrchPara.searchFlag ));
778 aRet.subRegExpressions = 1;
779 aRet.startOffset.realloc( aRet.subRegExpressions);
780 aRet.endOffset.realloc( aRet.subRegExpressions);
789
781
790 if( startPos )
791 nOffset = startPos - 1;
782 do {
783 // NOTE: backward search seems to be expected to have startOfs/endOfs inverted!
784 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
785 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr);
786 } while( pRegexMatcher->find( aRet.endOffset[0]+1, nIcuErr));
792
787
793 // search only in the subString
794 if( bSearchInSel && nStrEnde )
795 {
796 aStr = aStr.copy( nStrEnde, aStr.getLength() - nStrEnde );
797 if( nOffset > nStrEnde )
798 nOffset = nOffset - nStrEnde;
799 else
800 nOffset = 0;
801 }
802
803 // set the length to negative for reverse search
804 pRegExp->set_line( aStr.getStr(), -(aStr.getLength()) );
805 struct re_registers regs;
806
807 // Clear structure
808 memset((void *)&regs, 0, sizeof(struct re_registers));
809 if ( ! pRegExp->re_search(&regs, nOffset) )
810 {
811 if( regs.num_of_match > 0 &&
812 (regs.start[0] != -1 && regs.end[0] != -1) )
813 {
814 nOffset = bSearchInSel ? nStrEnde : 0;
815 aRet.startOffset.realloc(regs.num_of_match);
816 aRet.endOffset.realloc(regs.num_of_match);
817
818 sal_Int32 i = 0, j = 0;
819 while( j < regs.num_of_match )
820 {
821 if( regs.start[j] != -1 && regs.end[j] != -1 )
822 {
823 aRet.startOffset[i] = regs.end[j] + nOffset;
824 aRet.endOffset[i] = regs.start[j] + nOffset;
825 ++i;
826 }
827 ++j;
828 }
829 aRet.subRegExpressions = i;
830 }
831 if ( regs.num_regs > 0 )
832 {
833 if ( regs.start )
834 free(regs.start);
835 if ( regs.end )
836 free(regs.end);
837 }
838 }
839
840 return aRet;
788 return aRet;
841}
842
789}
790
843// phonetic search for words
791//---------------------------------------------------------------------------
792
793// search for words phonetically
844SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
845 sal_Int32 startPos, sal_Int32 endPos )
846 throw(RuntimeException)
847{
848 SearchResult aRet;
849 aRet.subRegExpressions = 0;
850
851 if( !xBreak.is() )

--- 168 unchanged lines hidden ---
794SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
795 sal_Int32 startPos, sal_Int32 endPos )
796 throw(RuntimeException)
797{
798 SearchResult aRet;
799 aRet.subRegExpressions = 0;
800
801 if( !xBreak.is() )

--- 168 unchanged lines hidden ---