1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_i18npool.hxx"
30 
31 #include <rtl/ustrbuf.hxx>
32 #include <i18nutil/casefolding.hxx>
33 #include <i18nutil/unicode.hxx>
34 
35 #include <comphelper/processfactory.hxx>
36 #include <osl/diagnose.h>
37 
38 #include <string.h>
39 
40 #include "characterclassificationImpl.hxx"
41 #include "breakiteratorImpl.hxx"
42 
43 #define TRANSLITERATION_ALL
44 #include "transliteration_body.hxx"
45 
46 using namespace ::com::sun::star::uno;
47 using namespace ::com::sun::star::lang;
48 using namespace ::rtl;
49 
50 #define A2OU(x) OUString::createFromAscii(x)
51 
52 namespace com { namespace sun { namespace star { namespace i18n {
53 
54 
55 Transliteration_body::Transliteration_body()
56 {
57 	nMappingType = 0;
58 	transliterationName = "Transliteration_body";
59 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body";
60 }
61 
62 sal_Int16 SAL_CALL Transliteration_body::getType() throw(RuntimeException)
63 {
64 	return TransliterationType::ONE_TO_ONE;
65 }
66 
67 sal_Bool SAL_CALL Transliteration_body::equals(
68 	const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/,
69 	const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/)
70 	throw(RuntimeException)
71 {
72 	throw RuntimeException();
73 }
74 
75 Sequence< OUString > SAL_CALL
76 Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 )
77 	throw( RuntimeException)
78 {
79 	Sequence< OUString > ostr(2);
80 	ostr[0] = str1;
81 	ostr[1] = str2;
82 	return ostr;
83 }
84 
85 
86 static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar )
87 {
88     sal_uInt8 nRes = nMappingType;
89 
90     // take care of TOGGLE_CASE transliteration:
91     // nMappingType should not be a combination of flags, thuse we decide now
92     // which one to use.
93     if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
94     {
95         const sal_Int16 nType = unicode::getUnicodeType( cChar );
96         if (nType & 0x02 /* lower case*/)
97             nRes = MappingTypeLowerToUpper;
98         else
99         {
100             // should also work properly for non-upper characters like white spacs, numbers, ...
101             nRes = MappingTypeUpperToLower;
102         }
103     }
104 
105     return nRes;
106 }
107 
108 
109 OUString SAL_CALL
110 Transliteration_body::transliterate(
111     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
112 	Sequence< sal_Int32 >& offset)
113     throw(RuntimeException)
114 {
115 #if 0
116 /* Performance optimization:
117  * The two realloc() consume 48% (32% grow, 16% shrink) runtime of this method!
118  * getValue() needs about 15%, so there is equal balance if we trade the second
119  * (shrinking) realloc() for a getValue(). But if the caller initializes the
120  * sequence to nCount elements there isn't any change in size necessary in most
121  * cases (one-to-one mapping) and we gain 33%.
122  *
123  * Of that constellation the getValue() method takes 20% upon each call, so 40%
124  * for both. By remembering the first calls' results we could gain some extra
125  * percentage again, but unfortunately getValue() may return a reference to a
126  * static buffer, so we can't store the pointer directly but would have to
127  * copy-construct an array, which doesn't give us any advantage.
128  *
129  * Much more is accomplished by working directly on the sequence buffer
130  * returned by getArray() instead of using operator[] for each and every
131  * access.
132  *
133  * And while we're at it: now that we know the size in advance we don't need to
134  * copy the buffer anymore, just create the real string buffer and let the
135  * return value take ownership.
136  *
137  * All together these changes result in the new implementation needing only 62%
138  * of the time of the old implementation (in other words: that one was 1.61
139  * times slower ...)
140  */
141 
142     // Allocate the max possible buffer. Try to use stack instead of heap which
143     // would have to be reallocated most times anyway.
144     const sal_Int32 nLocalBuf = 512 * NMAPPINGMAX;
145     sal_Unicode aLocalBuf[nLocalBuf], *out = aLocalBuf, *aHeapBuf = NULL;
146 
147     const sal_Unicode *in = inStr.getStr() + startPos;
148 
149     if (nCount > 512)
150         out = aHeapBuf =  (sal_Unicode*) malloc((nCount * NMAPPINGMAX) * sizeof(sal_Unicode));
151 
152         if (useOffset)
153             offset.realloc(nCount * NMAPPINGMAX);
154 	sal_Int32 j = 0;
155 	for (sal_Int32 i = 0; i < nCount; i++) {
156 	    Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType);
157 	    for (sal_Int32 k = 0; k < map.nmap; k++) {
158                 if (useOffset)
159                     offset[j] = i + startPos;
160 		out[j++] = map.map[k];
161 	    }
162 	}
163         if (useOffset)
164             offset.realloc(j);
165 
166 	OUString r(out, j);
167 
168 	if (aHeapBuf)
169 	    free(aHeapBuf);
170 
171 	return r;
172 #else
173     const sal_Unicode *in = inStr.getStr() + startPos;
174 
175     // Two different blocks to eliminate the if(useOffset) condition inside the
176     // inner k loop. Yes, on massive use even such small things do count.
177     if ( useOffset )
178     {
179         sal_Int32 nOffCount = 0, i;
180         for (i = 0; i < nCount; i++)
181         {
182             // take care of TOGGLE_CASE transliteration:
183             sal_uInt8 nTmpMappingType = nMappingType;
184             if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
185                 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
186 
187             const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
188             nOffCount += map.nmap;
189         }
190         rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount, 1 );  // our x_rtl_ustring.h
191         sal_Unicode* out = pStr->buffer;
192 
193         if ( nOffCount != offset.getLength() )
194             offset.realloc( nOffCount );
195 
196         sal_Int32 j = 0;
197         sal_Int32 * pArr = offset.getArray();
198         for (i = 0; i < nCount; i++)
199         {
200             // take care of TOGGLE_CASE transliteration:
201             sal_uInt8 nTmpMappingType = nMappingType;
202             if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
203                 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
204 
205             const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
206             for (sal_Int32 k = 0; k < map.nmap; k++)
207             {
208                 pArr[j] = i + startPos;
209                 out[j++] = map.map[k];
210             }
211         }
212         out[j] = 0;
213 
214         return OUString( pStr, SAL_NO_ACQUIRE );
215     }
216     else
217     {
218         // In the simple case of no offset sequence used we can eliminate the
219         // first getValue() loop. We could also assume that most calls result
220         // in identical string lengths, thus using a preallocated
221         // OUStringBuffer could be an easy way to assemble the return string
222         // without too much hassle. However, for single characters the
223         // OUStringBuffer::append() method is quite expensive compared to a
224         // simple array operation, so it pays here to copy the final result
225         // instead.
226 
227         // Allocate the max possible buffer. Try to use stack instead of heap,
228         // which would have to be reallocated most times anyways.
229         const sal_Int32 nLocalBuf = 2048;
230         sal_Unicode aLocalBuf[ nLocalBuf * NMAPPINGMAX ], *out = aLocalBuf, *pHeapBuf = NULL;
231         if ( nCount > nLocalBuf )
232             out = pHeapBuf = new sal_Unicode[ nCount * NMAPPINGMAX ];
233 
234         sal_Int32 j = 0;
235         for ( sal_Int32 i = 0; i < nCount; i++)
236         {
237             // take care of TOGGLE_CASE transliteration:
238             sal_uInt8 nTmpMappingType = nMappingType;
239             if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
240                 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
241 
242             const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
243             for (sal_Int32 k = 0; k < map.nmap; k++)
244             {
245                 out[j++] = map.map[k];
246             }
247         }
248 
249         OUString aRet( out, j );
250         if ( pHeapBuf )
251             delete [] pHeapBuf;
252         return aRet;
253     }
254 #endif
255 }
256 
257 OUString SAL_CALL
258 Transliteration_body::transliterateChar2String( sal_Unicode inChar ) throw(RuntimeException)
259 {
260         const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType);
261         rtl_uString* pStr = x_rtl_uString_new_WithLength( map.nmap, 1 );  // our x_rtl_ustring.h
262         sal_Unicode* out = pStr->buffer;
263         sal_Int32 i;
264 
265         for (i = 0; i < map.nmap; i++)
266             out[i] = map.map[i];
267         out[i] = 0;
268 
269         return OUString( pStr, SAL_NO_ACQUIRE );
270 }
271 
272 sal_Unicode SAL_CALL
273 Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) throw(MultipleCharsOutputException, RuntimeException)
274 {
275         const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType);
276         if (map.nmap > 1)
277             throw MultipleCharsOutputException();
278         return map.map[0];
279 }
280 
281 OUString SAL_CALL
282 Transliteration_body::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
283 	Sequence< sal_Int32 >& offset) throw(RuntimeException)
284 {
285 	return this->transliterate(inStr, startPos, nCount, offset);
286 }
287 
288 Transliteration_casemapping::Transliteration_casemapping()
289 {
290 	nMappingType = 0;
291 	transliterationName = "casemapping(generic)";
292 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping";
293 }
294 
295 void SAL_CALL
296 Transliteration_casemapping::setMappingType( const sal_uInt8 rMappingType, const Locale& rLocale )
297 {
298 	nMappingType = rMappingType;
299 	aLocale = rLocale;
300 }
301 
302 Transliteration_u2l::Transliteration_u2l()
303 {
304 	nMappingType = MappingTypeUpperToLower;
305 	transliterationName = "upper_to_lower(generic)";
306 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l";
307 }
308 
309 Transliteration_l2u::Transliteration_l2u()
310 {
311 	nMappingType = MappingTypeLowerToUpper;
312 	transliterationName = "lower_to_upper(generic)";
313 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u";
314 }
315 
316 Transliteration_togglecase::Transliteration_togglecase()
317 {
318     // usually nMappingType must NOT be a combiantion of different flages here,
319     // but we take care of that problem in Transliteration_body::transliterate above
320     // before that value is used. There we will decide which of both is to be used on
321     // a per character basis.
322     nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower;
323     transliterationName = "toggle(generic)";
324     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase";
325 }
326 
327 Transliteration_titlecase::Transliteration_titlecase()
328 {
329     nMappingType = MappingTypeToTitle;
330     transliterationName = "title(generic)";
331     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase";
332 }
333 
#if 0
// Disabled code, not compiled: a manual table for resolving Unicode
// ligatures into their component letters. transliterate_titlecase_Impl
// below still has a commented-out call to lcl_ResolveLigature.

/** One ligature code point together with its expansion as UTF-8 text. */
struct LigatureData
{
    sal_uInt32          cChar;
    const sal_Char *    pUtf8Text;  // points to a string literal, hence const
};

// available Unicode ligatures:
// http://www.unicode.org/charts
// http://www.unicode.org/charts/PDF/UFB00.pdf
// The table is terminated by an entry whose cChar is 0.
static const LigatureData aLigatures[] =
{
    { 0x0FB00,     "ff" },
    { 0x0FB01,     "fi" },
    { 0x0FB02,     "fl" },
    { 0x0FB03,     "ffi" },
    { 0x0FB04,     "ffl" },
    { 0x0FB05,     "ft" },
    { 0x0FB06,     "st" },

    { 0x0FB13,     "\xD5\xB4\xD5\xB6" },     // Armenian small men now
    { 0x0FB14,     "\xD5\xB4\xD5\xA5" },     // Armenian small men ech
    { 0x0FB15,     "\xD5\xB4\xD5\xAB" },     // Armenian small men ini
    { 0x0FB16,     "\xD5\xBE\xD5\xB6" },     // Armenian small vew now
    { 0x0FB17,     "\xD5\xB4\xD5\xAD" },     // Armenian small men xeh
    { 0x00000,     "" }
};

/** @return true if cChar lies in one of the two ligature ranges covered
            by aLigatures (0xFB00-0xFB06, 0xFB13-0xFB17). */
static inline bool lcl_IsLigature( sal_uInt32 cChar )
{
    return (0x0FB00 <= cChar && cChar <= 0x0FB06) || (0x0FB13 <= cChar && cChar <= 0x0FB17);
}

/** @return for a ligature its expansion from aLigatures (an empty string
            if the table has no entry for it); for any other code point a
            string consisting of just that code point. */
static rtl::OUString lcl_ResolveLigature( sal_uInt32 cChar )
{
    if (!lcl_IsLigature( cChar ))
        return rtl::OUString( &cChar, 1 );

    for (const LigatureData *pData = aLigatures; pData->cChar != 0; ++pData)
    {
        if (pData->cChar == cChar)
            return rtl::OUString( pData->pUtf8Text, strlen( pData->pUtf8Text ), RTL_TEXTENCODING_UTF8 );
    }
    return rtl::OUString();
}
#endif // if 0
388 
389 static rtl::OUString transliterate_titlecase_Impl(
390     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
391     const Locale &rLocale,
392 	Sequence< sal_Int32 >& offset )
393     throw(RuntimeException)
394 {
395     const OUString aText( inStr.copy( startPos, nCount ) );
396 
397     OUString aRes;
398     if (aText.getLength() > 0)
399     {
400         Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
401         CharacterClassificationImpl aCharClassImpl( xMSF );
402 
403         // because aCharClassImpl.toTitle does not handle ligatures or � but will raise
404         // an exception we need to handle the first chara manually...
405 
406         // we don't want to change surrogates by accident, thuse we use proper code point iteration
407         sal_Int32 nPos = 0;
408         sal_uInt32 cFirstChar = aText.iterateCodePoints( &nPos );
409         OUString aResolvedLigature( &cFirstChar, 1 ); //lcl_ResolveLigature( cFirstChar ) );
410         // toUpper can be used to properly resolve ligatures and characters like �
411         aResolvedLigature = aCharClassImpl.toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
412         // since toTitle will leave all-uppercase text unchanged we first need to
413         // use toLower to bring possible 2nd and following charas in lowercase
414         aResolvedLigature = aCharClassImpl.toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
415         sal_Int32 nResolvedLen = aResolvedLigature.getLength();
416 
417         // now we can properly use toTitle to get the expected result for the resolved string.
418         // The rest of the text should just become lowercase.
419         aRes = aCharClassImpl.toTitle( aResolvedLigature, 0, nResolvedLen, rLocale );
420         aRes += aCharClassImpl.toLower( aText, 1, aText.getLength() - 1, rLocale );
421         offset.realloc( aRes.getLength() );
422 
423         sal_Int32 *pOffset = offset.getArray();
424         sal_Int32 nLen = offset.getLength();
425         for (sal_Int32 i = 0; i < nLen; ++i)
426         {
427             sal_Int32 nIdx = 0;
428             if (i >= nResolvedLen)
429                 nIdx = i - nResolvedLen + 1;
430             pOffset[i] = nIdx;
431         }
432     }
433 #if OSL_DEBUG_LEVEL > 1
434     const sal_Int32 *pCOffset = offset.getConstArray();
435     (void) pCOffset;
436 #endif
437 
438     return aRes;
439 }
440 
441 
442 // this function expects to be called on a word-by-word basis,
443 // namely that startPos points to the first char of the word
444 rtl::OUString SAL_CALL Transliteration_titlecase::transliterate(
445     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
446 	Sequence< sal_Int32 >& offset )
447     throw(RuntimeException)
448 {
449     return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
450 }
451 
452 
453 Transliteration_sentencecase::Transliteration_sentencecase()
454 {
455     nMappingType = MappingTypeToTitle;  // though only to be applied to the first word...
456     transliterationName = "sentence(generic)";
457     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase";
458 }
459 
460 
461 // this function expects to be called on a sentence-by-sentence basis,
462 // namely that startPos points to the first word (NOT first char!) in the sentence
463 rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate(
464     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
465     Sequence< sal_Int32 >& offset )
466     throw(RuntimeException)
467 {
468     return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
469 }
470 
471 
472 } } } }
473 
474