1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26 
27 #include <rtl/ustrbuf.hxx>
28 #include <i18nutil/casefolding.hxx>
29 #include <i18nutil/unicode.hxx>
30 
31 #include <comphelper/processfactory.hxx>
32 #include <osl/diagnose.h>
33 
34 #include <string.h>
35 
36 #include "characterclassificationImpl.hxx"
37 #include "breakiteratorImpl.hxx"
38 
39 #define TRANSLITERATION_ALL
40 #include "transliteration_body.hxx"
41 
42 using namespace ::com::sun::star::uno;
43 using namespace ::com::sun::star::lang;
44 using namespace ::rtl;
45 
46 #define A2OU(x) OUString::createFromAscii(x)
47 
48 namespace com { namespace sun { namespace star { namespace i18n {
49 
50 
Transliteration_body()51 Transliteration_body::Transliteration_body()
52 {
53 	nMappingType = 0;
54 	transliterationName = "Transliteration_body";
55 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body";
56 }
57 
getType()58 sal_Int16 SAL_CALL Transliteration_body::getType() throw(RuntimeException)
59 {
60 	return TransliterationType::ONE_TO_ONE;
61 }
62 
equals(const OUString &,sal_Int32,sal_Int32,sal_Int32 &,const OUString &,sal_Int32,sal_Int32,sal_Int32 &)63 sal_Bool SAL_CALL Transliteration_body::equals(
64 	const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/,
65 	const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/)
66 	throw(RuntimeException)
67 {
68 	throw RuntimeException();
69 }
70 
71 Sequence< OUString > SAL_CALL
transliterateRange(const OUString & str1,const OUString & str2)72 Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 )
73 	throw( RuntimeException)
74 {
75 	Sequence< OUString > ostr(2);
76 	ostr[0] = str1;
77 	ostr[1] = str2;
78 	return ostr;
79 }
80 
81 
lcl_getMappingTypeForToggleCase(sal_uInt8 nMappingType,sal_Unicode cChar)82 static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar )
83 {
84     sal_uInt8 nRes = nMappingType;
85 
86     // take care of TOGGLE_CASE transliteration:
87     // nMappingType should not be a combination of flags, thuse we decide now
88     // which one to use.
89     if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
90     {
91         const sal_Int16 nType = unicode::getUnicodeType( cChar );
92         if (nType & 0x02 /* lower case*/)
93             nRes = MappingTypeLowerToUpper;
94         else
95         {
96             // should also work properly for non-upper characters like white spacs, numbers, ...
97             nRes = MappingTypeUpperToLower;
98         }
99     }
100 
101     return nRes;
102 }
103 
104 
105 OUString SAL_CALL
transliterate(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,Sequence<sal_Int32> & offset)106 Transliteration_body::transliterate(
107     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
108 	Sequence< sal_Int32 >& offset)
109     throw(RuntimeException)
110 {
111 #if 0
112 /* Performance optimization:
113  * The two realloc() consume 48% (32% grow, 16% shrink) runtime of this method!
114  * getValue() needs about 15%, so there is equal balance if we trade the second
115  * (shrinking) realloc() for a getValue(). But if the caller initializes the
116  * sequence to nCount elements there isn't any change in size necessary in most
117  * cases (one-to-one mapping) and we gain 33%.
118  *
119  * Of that constellation the getValue() method takes 20% upon each call, so 40%
120  * for both. By remembering the first calls' results we could gain some extra
121  * percentage again, but unfortunately getValue() may return a reference to a
122  * static buffer, so we can't store the pointer directly but would have to
123  * copy-construct an array, which doesn't give us any advantage.
124  *
125  * Much more is accomplished by working directly on the sequence buffer
126  * returned by getArray() instead of using operator[] for each and every
127  * access.
128  *
129  * And while we're at it: now that we know the size in advance we don't need to
130  * copy the buffer anymore, just create the real string buffer and let the
131  * return value take ownership.
132  *
133  * All together these changes result in the new implementation needing only 62%
134  * of the time of the old implementation (in other words: that one was 1.61
135  * times slower ...)
136  */
137 
138     // Allocate the max possible buffer. Try to use stack instead of heap which
139     // would have to be reallocated most times anyway.
140     const sal_Int32 nLocalBuf = 512 * NMAPPINGMAX;
141     sal_Unicode aLocalBuf[nLocalBuf], *out = aLocalBuf, *aHeapBuf = NULL;
142 
143     const sal_Unicode *in = inStr.getStr() + startPos;
144 
145     if (nCount > 512)
146         out = aHeapBuf =  (sal_Unicode*) malloc((nCount * NMAPPINGMAX) * sizeof(sal_Unicode));
147 
148         if (useOffset)
149             offset.realloc(nCount * NMAPPINGMAX);
150 	sal_Int32 j = 0;
151 	for (sal_Int32 i = 0; i < nCount; i++) {
152 	    Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType);
153 	    for (sal_Int32 k = 0; k < map.nmap; k++) {
154                 if (useOffset)
155                     offset[j] = i + startPos;
156 		out[j++] = map.map[k];
157 	    }
158 	}
159         if (useOffset)
160             offset.realloc(j);
161 
162 	OUString r(out, j);
163 
164 	if (aHeapBuf)
165 	    free(aHeapBuf);
166 
167 	return r;
168 #else
169     const sal_Unicode *in = inStr.getStr() + startPos;
170 
171     // Two different blocks to eliminate the if(useOffset) condition inside the
172     // inner k loop. Yes, on massive use even such small things do count.
173     if ( useOffset )
174     {
175         sal_Int32 nOffCount = 0, i;
176         for (i = 0; i < nCount; i++)
177         {
178             // take care of TOGGLE_CASE transliteration:
179             sal_uInt8 nTmpMappingType = nMappingType;
180             if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
181                 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
182 
183             const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
184             nOffCount += map.nmap;
185         }
186         rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount );  // our x_rtl_ustring.h
187         sal_Unicode* out = pStr->buffer;
188 
189         if ( nOffCount != offset.getLength() )
190             offset.realloc( nOffCount );
191 
192         sal_Int32 j = 0;
193         sal_Int32 * pArr = offset.getArray();
194         for (i = 0; i < nCount; i++)
195         {
196             // take care of TOGGLE_CASE transliteration:
197             sal_uInt8 nTmpMappingType = nMappingType;
198             if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
199                 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
200 
201             const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
202             for (sal_Int32 k = 0; k < map.nmap; k++)
203             {
204                 pArr[j] = i + startPos;
205                 out[j++] = map.map[k];
206             }
207         }
208         out[j] = 0;
209 
210         return OUString( pStr, SAL_NO_ACQUIRE ); // take over ownership of <pStr>
211     }
212     else
213     {
214         // In the simple case of no offset sequence used we can eliminate the
215         // first getValue() loop. We could also assume that most calls result
216         // in identical string lengths, thus using a preallocated
217         // OUStringBuffer could be an easy way to assemble the return string
218         // without too much hassle. However, for single characters the
219         // OUStringBuffer::append() method is quite expensive compared to a
220         // simple array operation, so it pays here to copy the final result
221         // instead.
222 
223         // Allocate the max possible buffer. Try to use stack instead of heap,
224         // which would have to be reallocated most times anyways.
225         const sal_Int32 nLocalBuf = 2048;
226         sal_Unicode aLocalBuf[ nLocalBuf * NMAPPINGMAX ], *out = aLocalBuf, *pHeapBuf = NULL;
227         if ( nCount > nLocalBuf )
228             out = pHeapBuf = new sal_Unicode[ nCount * NMAPPINGMAX ];
229 
230         sal_Int32 j = 0;
231         for ( sal_Int32 i = 0; i < nCount; i++)
232         {
233             // take care of TOGGLE_CASE transliteration:
234             sal_uInt8 nTmpMappingType = nMappingType;
235             if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
236                 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
237 
238             const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
239             for (sal_Int32 k = 0; k < map.nmap; k++)
240             {
241                 out[j++] = map.map[k];
242             }
243         }
244 
245         OUString aRet( out, j );
246         if ( pHeapBuf )
247             delete [] pHeapBuf;
248         return aRet;
249     }
250 #endif
251 }
252 
253 OUString SAL_CALL
transliterateChar2String(sal_Unicode inChar)254 Transliteration_body::transliterateChar2String( sal_Unicode inChar ) throw(RuntimeException)
255 {
256         const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType);
257         rtl_uString* pStr = x_rtl_uString_new_WithLength( map.nmap );  // our x_rtl_ustring.h
258         sal_Unicode* out = pStr->buffer;
259         sal_Int32 i;
260 
261         for (i = 0; i < map.nmap; i++)
262             out[i] = map.map[i];
263         out[i] = 0;
264 
265         return OUString( pStr, SAL_NO_ACQUIRE ); // take over ownership of <pStr>
266 }
267 
268 sal_Unicode SAL_CALL
transliterateChar2Char(sal_Unicode inChar)269 Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) throw(MultipleCharsOutputException, RuntimeException)
270 {
271         const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType);
272         if (map.nmap > 1)
273             throw MultipleCharsOutputException();
274         return map.map[0];
275 }
276 
277 OUString SAL_CALL
folding(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,Sequence<sal_Int32> & offset)278 Transliteration_body::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
279 	Sequence< sal_Int32 >& offset) throw(RuntimeException)
280 {
281 	return this->transliterate(inStr, startPos, nCount, offset);
282 }
283 
Transliteration_casemapping()284 Transliteration_casemapping::Transliteration_casemapping()
285 {
286 	nMappingType = 0;
287 	transliterationName = "casemapping(generic)";
288 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping";
289 }
290 
291 void SAL_CALL
setMappingType(const sal_uInt8 rMappingType,const Locale & rLocale)292 Transliteration_casemapping::setMappingType( const sal_uInt8 rMappingType, const Locale& rLocale )
293 {
294 	nMappingType = rMappingType;
295 	aLocale = rLocale;
296 }
297 
Transliteration_u2l()298 Transliteration_u2l::Transliteration_u2l()
299 {
300 	nMappingType = MappingTypeUpperToLower;
301 	transliterationName = "upper_to_lower(generic)";
302 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l";
303 }
304 
Transliteration_l2u()305 Transliteration_l2u::Transliteration_l2u()
306 {
307 	nMappingType = MappingTypeLowerToUpper;
308 	transliterationName = "lower_to_upper(generic)";
309 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u";
310 }
311 
Transliteration_togglecase()312 Transliteration_togglecase::Transliteration_togglecase()
313 {
314     // usually nMappingType must NOT be a combiantion of different flages here,
315     // but we take care of that problem in Transliteration_body::transliterate above
316     // before that value is used. There we will decide which of both is to be used on
317     // a per character basis.
318     nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower;
319     transliterationName = "toggle(generic)";
320     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase";
321 }
322 
Transliteration_titlecase()323 Transliteration_titlecase::Transliteration_titlecase()
324 {
325     nMappingType = MappingTypeToTitle;
326     transliterationName = "title(generic)";
327     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase";
328 }
329 
#if 0
// Disabled: manual ligature resolution. Kept for reference only; nothing in
// this file calls it while it is compiled out.

/** One ligature code point together with its decomposition as UTF-8 text. */
struct LigatureData
{
    sal_uInt32          cChar;      // ligature code point, 0 terminates the table
    const sal_Char *    pUtf8Text;  // decomposed text, UTF-8 encoded (points to a literal)
};

// available Unicode ligatures:
// http://www.unicode.org/charts
// http://www.unicode.org/charts/PDF/UFB00.pdf
static const LigatureData aLigatures[] =
{
    { 0x0FB00,     "ff" },
    { 0x0FB01,     "fi" },
    { 0x0FB02,     "fl" },
    { 0x0FB03,     "ffi" },
    { 0x0FB04,     "ffl" },
    { 0x0FB05,     "ft" },
    { 0x0FB06,     "st" },

    { 0x0FB13,     "\xD5\xB4\xD5\xB6" },     // Armenian small men now
    { 0x0FB14,     "\xD5\xB4\xD5\xA5" },     // Armenian small men ech
    { 0x0FB15,     "\xD5\xB4\xD5\xAB" },     // Armenian small men ini
    { 0x0FB16,     "\xD5\xBE\xD5\xB6" },     // Armenian small vew now
    { 0x0FB17,     "\xD5\xB4\xD5\xAD" },     // Armenian small men xeh
    { 0x00000,     "" }                      // end-of-table marker
};

/** True if cChar lies in one of the two ligature ranges covered by aLigatures. */
static inline bool lcl_IsLigature( sal_uInt32 cChar )
{
    return (0x0FB00 <= cChar && cChar <= 0x0FB06) || (0x0FB13 <= cChar && cChar <= 0x0FB17);
}

/** Return the decomposition of a ligature, or the character itself as a
 *  one-code-point string if it is not a ligature. A code point inside the
 *  ligature ranges but missing from the table yields an empty string.
 */
static rtl::OUString lcl_ResolveLigature( sal_uInt32 cChar )
{
    rtl::OUString aRes;
    if (lcl_IsLigature( cChar ))
    {
        // linear scan up to the zero terminator; stop at the first match
        for (const LigatureData *pData = aLigatures; pData->cChar != 0; ++pData)
        {
            if (pData->cChar == cChar)
            {
                aRes = rtl::OUString( pData->pUtf8Text, strlen( pData->pUtf8Text ), RTL_TEXTENCODING_UTF8 );
                break;
            }
        }
    }
    else
        aRes = rtl::OUString( &cChar, 1 );
    return aRes;
}
#endif // if 0
384 
transliterate_titlecase_Impl(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,const Locale & rLocale,Sequence<sal_Int32> & offset)385 static rtl::OUString transliterate_titlecase_Impl(
386     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
387     const Locale &rLocale,
388 	Sequence< sal_Int32 >& offset )
389     throw(RuntimeException)
390 {
391     const OUString aText( inStr.copy( startPos, nCount ) );
392 
393     OUString aRes;
394     if (aText.getLength() > 0)
395     {
396         Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
397         CharacterClassificationImpl aCharClassImpl( xMSF );
398 
399         // because aCharClassImpl.toTitle does not handle ligatures or � but will raise
400         // an exception we need to handle the first chara manually...
401 
402         // we don't want to change surrogates by accident, thuse we use proper code point iteration
403         sal_Int32 nPos = 0;
404         sal_uInt32 cFirstChar = aText.iterateCodePoints( &nPos );
405         OUString aResolvedLigature( &cFirstChar, 1 ); //lcl_ResolveLigature( cFirstChar ) );
406         // toUpper can be used to properly resolve ligatures and characters like �
407         aResolvedLigature = aCharClassImpl.toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
408         // since toTitle will leave all-uppercase text unchanged we first need to
409         // use toLower to bring possible 2nd and following charas in lowercase
410         aResolvedLigature = aCharClassImpl.toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
411         sal_Int32 nResolvedLen = aResolvedLigature.getLength();
412 
413         // now we can properly use toTitle to get the expected result for the resolved string.
414         // The rest of the text should just become lowercase.
415         aRes = aCharClassImpl.toTitle( aResolvedLigature, 0, nResolvedLen, rLocale );
416         aRes += aCharClassImpl.toLower( aText, 1, aText.getLength() - 1, rLocale );
417         offset.realloc( aRes.getLength() );
418 
419         sal_Int32 *pOffset = offset.getArray();
420         sal_Int32 nLen = offset.getLength();
421         for (sal_Int32 i = 0; i < nLen; ++i)
422         {
423             sal_Int32 nIdx = 0;
424             if (i >= nResolvedLen)
425                 nIdx = i - nResolvedLen + 1;
426             pOffset[i] = nIdx;
427         }
428     }
429 #if OSL_DEBUG_LEVEL > 1
430     const sal_Int32 *pCOffset = offset.getConstArray();
431     (void) pCOffset;
432 #endif
433 
434     return aRes;
435 }
436 
437 
438 // this function expects to be called on a word-by-word basis,
439 // namely that startPos points to the first char of the word
transliterate(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,Sequence<sal_Int32> & offset)440 rtl::OUString SAL_CALL Transliteration_titlecase::transliterate(
441     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
442 	Sequence< sal_Int32 >& offset )
443     throw(RuntimeException)
444 {
445     return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
446 }
447 
448 
Transliteration_sentencecase()449 Transliteration_sentencecase::Transliteration_sentencecase()
450 {
451     nMappingType = MappingTypeToTitle;  // though only to be applied to the first word...
452     transliterationName = "sentence(generic)";
453     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase";
454 }
455 
456 
457 // this function expects to be called on a sentence-by-sentence basis,
458 // namely that startPos points to the first word (NOT first char!) in the sentence
transliterate(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,Sequence<sal_Int32> & offset)459 rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate(
460     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
461     Sequence< sal_Int32 >& offset )
462     throw(RuntimeException)
463 {
464     return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
465 }
466 
467 
468 } } } }
469 
470