1449ab281SAndrew Rist /**************************************************************
2cdf0e10cSrcweir  *
3449ab281SAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4449ab281SAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5449ab281SAndrew Rist  * distributed with this work for additional information
6449ab281SAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7449ab281SAndrew Rist  * to you under the Apache License, Version 2.0 (the
8449ab281SAndrew Rist  * "License"); you may not use this file except in compliance
9449ab281SAndrew Rist  * with the License.  You may obtain a copy of the License at
10449ab281SAndrew Rist  *
11449ab281SAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12449ab281SAndrew Rist  *
13449ab281SAndrew Rist  * Unless required by applicable law or agreed to in writing,
14449ab281SAndrew Rist  * software distributed under the License is distributed on an
15449ab281SAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16449ab281SAndrew Rist  * KIND, either express or implied.  See the License for the
17449ab281SAndrew Rist  * specific language governing permissions and limitations
18449ab281SAndrew Rist  * under the License.
19449ab281SAndrew Rist  *
20449ab281SAndrew Rist  *************************************************************/
21449ab281SAndrew Rist 
22449ab281SAndrew Rist 
23cdf0e10cSrcweir 
24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
25cdf0e10cSrcweir #include "precompiled_i18npool.hxx"
26cdf0e10cSrcweir 
27cdf0e10cSrcweir #include <rtl/ustrbuf.hxx>
28cdf0e10cSrcweir #include <i18nutil/casefolding.hxx>
29cdf0e10cSrcweir #include <i18nutil/unicode.hxx>
30cdf0e10cSrcweir 
31cdf0e10cSrcweir #include <comphelper/processfactory.hxx>
32cdf0e10cSrcweir #include <osl/diagnose.h>
33cdf0e10cSrcweir 
34cdf0e10cSrcweir #include <string.h>
35cdf0e10cSrcweir 
36cdf0e10cSrcweir #include "characterclassificationImpl.hxx"
37cdf0e10cSrcweir #include "breakiteratorImpl.hxx"
38cdf0e10cSrcweir 
39cdf0e10cSrcweir #define TRANSLITERATION_ALL
40cdf0e10cSrcweir #include "transliteration_body.hxx"
41cdf0e10cSrcweir 
42cdf0e10cSrcweir using namespace ::com::sun::star::uno;
43cdf0e10cSrcweir using namespace ::com::sun::star::lang;
44cdf0e10cSrcweir using namespace ::rtl;
45cdf0e10cSrcweir 
46cdf0e10cSrcweir #define A2OU(x) OUString::createFromAscii(x)
47cdf0e10cSrcweir 
48cdf0e10cSrcweir namespace com { namespace sun { namespace star { namespace i18n {
49cdf0e10cSrcweir 
50cdf0e10cSrcweir 
Transliteration_body()51cdf0e10cSrcweir Transliteration_body::Transliteration_body()
52cdf0e10cSrcweir {
53cdf0e10cSrcweir 	nMappingType = 0;
54cdf0e10cSrcweir 	transliterationName = "Transliteration_body";
55cdf0e10cSrcweir 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body";
56cdf0e10cSrcweir }
57cdf0e10cSrcweir 
getType()58cdf0e10cSrcweir sal_Int16 SAL_CALL Transliteration_body::getType() throw(RuntimeException)
59cdf0e10cSrcweir {
60cdf0e10cSrcweir 	return TransliterationType::ONE_TO_ONE;
61cdf0e10cSrcweir }
62cdf0e10cSrcweir 
equals(const OUString &,sal_Int32,sal_Int32,sal_Int32 &,const OUString &,sal_Int32,sal_Int32,sal_Int32 &)63cdf0e10cSrcweir sal_Bool SAL_CALL Transliteration_body::equals(
64cdf0e10cSrcweir 	const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/,
65cdf0e10cSrcweir 	const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/)
66cdf0e10cSrcweir 	throw(RuntimeException)
67cdf0e10cSrcweir {
68cdf0e10cSrcweir 	throw RuntimeException();
69cdf0e10cSrcweir }
70cdf0e10cSrcweir 
71cdf0e10cSrcweir Sequence< OUString > SAL_CALL
transliterateRange(const OUString & str1,const OUString & str2)72cdf0e10cSrcweir Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 )
73cdf0e10cSrcweir 	throw( RuntimeException)
74cdf0e10cSrcweir {
75cdf0e10cSrcweir 	Sequence< OUString > ostr(2);
76cdf0e10cSrcweir 	ostr[0] = str1;
77cdf0e10cSrcweir 	ostr[1] = str2;
78cdf0e10cSrcweir 	return ostr;
79cdf0e10cSrcweir }
80cdf0e10cSrcweir 
81cdf0e10cSrcweir 
lcl_getMappingTypeForToggleCase(sal_uInt8 nMappingType,sal_Unicode cChar)82cdf0e10cSrcweir static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar )
83cdf0e10cSrcweir {
84cdf0e10cSrcweir     sal_uInt8 nRes = nMappingType;
85cdf0e10cSrcweir 
86cdf0e10cSrcweir     // take care of TOGGLE_CASE transliteration:
87cdf0e10cSrcweir     // nMappingType should not be a combination of flags, thuse we decide now
88cdf0e10cSrcweir     // which one to use.
89cdf0e10cSrcweir     if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
90cdf0e10cSrcweir     {
91cdf0e10cSrcweir         const sal_Int16 nType = unicode::getUnicodeType( cChar );
92cdf0e10cSrcweir         if (nType & 0x02 /* lower case*/)
93cdf0e10cSrcweir             nRes = MappingTypeLowerToUpper;
94cdf0e10cSrcweir         else
95cdf0e10cSrcweir         {
96cdf0e10cSrcweir             // should also work properly for non-upper characters like white spacs, numbers, ...
97cdf0e10cSrcweir             nRes = MappingTypeUpperToLower;
98cdf0e10cSrcweir         }
99cdf0e10cSrcweir     }
100cdf0e10cSrcweir 
101cdf0e10cSrcweir     return nRes;
102cdf0e10cSrcweir }
103cdf0e10cSrcweir 
104cdf0e10cSrcweir 
105cdf0e10cSrcweir OUString SAL_CALL
transliterate(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,Sequence<sal_Int32> & offset)106cdf0e10cSrcweir Transliteration_body::transliterate(
107cdf0e10cSrcweir     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
108cdf0e10cSrcweir 	Sequence< sal_Int32 >& offset)
109cdf0e10cSrcweir     throw(RuntimeException)
110cdf0e10cSrcweir {
111cdf0e10cSrcweir #if 0
112cdf0e10cSrcweir /* Performance optimization:
113cdf0e10cSrcweir  * The two realloc() consume 48% (32% grow, 16% shrink) runtime of this method!
114cdf0e10cSrcweir  * getValue() needs about 15%, so there is equal balance if we trade the second
115cdf0e10cSrcweir  * (shrinking) realloc() for a getValue(). But if the caller initializes the
116cdf0e10cSrcweir  * sequence to nCount elements there isn't any change in size necessary in most
117cdf0e10cSrcweir  * cases (one-to-one mapping) and we gain 33%.
118cdf0e10cSrcweir  *
119cdf0e10cSrcweir  * Of that constellation the getValue() method takes 20% upon each call, so 40%
120cdf0e10cSrcweir  * for both. By remembering the first calls' results we could gain some extra
121cdf0e10cSrcweir  * percentage again, but unfortunately getValue() may return a reference to a
122cdf0e10cSrcweir  * static buffer, so we can't store the pointer directly but would have to
123cdf0e10cSrcweir  * copy-construct an array, which doesn't give us any advantage.
124cdf0e10cSrcweir  *
125cdf0e10cSrcweir  * Much more is accomplished by working directly on the sequence buffer
126cdf0e10cSrcweir  * returned by getArray() instead of using operator[] for each and every
127cdf0e10cSrcweir  * access.
128cdf0e10cSrcweir  *
129cdf0e10cSrcweir  * And while we're at it: now that we know the size in advance we don't need to
130cdf0e10cSrcweir  * copy the buffer anymore, just create the real string buffer and let the
131cdf0e10cSrcweir  * return value take ownership.
132cdf0e10cSrcweir  *
133cdf0e10cSrcweir  * All together these changes result in the new implementation needing only 62%
134cdf0e10cSrcweir  * of the time of the old implementation (in other words: that one was 1.61
135cdf0e10cSrcweir  * times slower ...)
136cdf0e10cSrcweir  */
137cdf0e10cSrcweir 
138cdf0e10cSrcweir     // Allocate the max possible buffer. Try to use stack instead of heap which
139cdf0e10cSrcweir     // would have to be reallocated most times anyway.
140cdf0e10cSrcweir     const sal_Int32 nLocalBuf = 512 * NMAPPINGMAX;
141cdf0e10cSrcweir     sal_Unicode aLocalBuf[nLocalBuf], *out = aLocalBuf, *aHeapBuf = NULL;
142cdf0e10cSrcweir 
143cdf0e10cSrcweir     const sal_Unicode *in = inStr.getStr() + startPos;
144cdf0e10cSrcweir 
145cdf0e10cSrcweir     if (nCount > 512)
146cdf0e10cSrcweir         out = aHeapBuf =  (sal_Unicode*) malloc((nCount * NMAPPINGMAX) * sizeof(sal_Unicode));
147cdf0e10cSrcweir 
148cdf0e10cSrcweir         if (useOffset)
149cdf0e10cSrcweir             offset.realloc(nCount * NMAPPINGMAX);
150cdf0e10cSrcweir 	sal_Int32 j = 0;
151cdf0e10cSrcweir 	for (sal_Int32 i = 0; i < nCount; i++) {
152cdf0e10cSrcweir 	    Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType);
153cdf0e10cSrcweir 	    for (sal_Int32 k = 0; k < map.nmap; k++) {
154cdf0e10cSrcweir                 if (useOffset)
155cdf0e10cSrcweir                     offset[j] = i + startPos;
156cdf0e10cSrcweir 		out[j++] = map.map[k];
157cdf0e10cSrcweir 	    }
158cdf0e10cSrcweir 	}
159cdf0e10cSrcweir         if (useOffset)
160cdf0e10cSrcweir             offset.realloc(j);
161cdf0e10cSrcweir 
162cdf0e10cSrcweir 	OUString r(out, j);
163cdf0e10cSrcweir 
164cdf0e10cSrcweir 	if (aHeapBuf)
165cdf0e10cSrcweir 	    free(aHeapBuf);
166cdf0e10cSrcweir 
167cdf0e10cSrcweir 	return r;
168cdf0e10cSrcweir #else
169cdf0e10cSrcweir     const sal_Unicode *in = inStr.getStr() + startPos;
170cdf0e10cSrcweir 
171cdf0e10cSrcweir     // Two different blocks to eliminate the if(useOffset) condition inside the
172cdf0e10cSrcweir     // inner k loop. Yes, on massive use even such small things do count.
173cdf0e10cSrcweir     if ( useOffset )
174cdf0e10cSrcweir     {
175cdf0e10cSrcweir         sal_Int32 nOffCount = 0, i;
176cdf0e10cSrcweir         for (i = 0; i < nCount; i++)
177cdf0e10cSrcweir         {
178cdf0e10cSrcweir             // take care of TOGGLE_CASE transliteration:
179cdf0e10cSrcweir             sal_uInt8 nTmpMappingType = nMappingType;
180cdf0e10cSrcweir             if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
181cdf0e10cSrcweir                 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
182cdf0e10cSrcweir 
183cdf0e10cSrcweir             const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
184cdf0e10cSrcweir             nOffCount += map.nmap;
185cdf0e10cSrcweir         }
186*4674bdb9SOliver-Rainer Wittmann         rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount );  // our x_rtl_ustring.h
187cdf0e10cSrcweir         sal_Unicode* out = pStr->buffer;
188cdf0e10cSrcweir 
189cdf0e10cSrcweir         if ( nOffCount != offset.getLength() )
190cdf0e10cSrcweir             offset.realloc( nOffCount );
191cdf0e10cSrcweir 
192cdf0e10cSrcweir         sal_Int32 j = 0;
193cdf0e10cSrcweir         sal_Int32 * pArr = offset.getArray();
194cdf0e10cSrcweir         for (i = 0; i < nCount; i++)
195cdf0e10cSrcweir         {
196cdf0e10cSrcweir             // take care of TOGGLE_CASE transliteration:
197cdf0e10cSrcweir             sal_uInt8 nTmpMappingType = nMappingType;
198cdf0e10cSrcweir             if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
199cdf0e10cSrcweir                 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
200cdf0e10cSrcweir 
201cdf0e10cSrcweir             const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
202cdf0e10cSrcweir             for (sal_Int32 k = 0; k < map.nmap; k++)
203cdf0e10cSrcweir             {
204cdf0e10cSrcweir                 pArr[j] = i + startPos;
205cdf0e10cSrcweir                 out[j++] = map.map[k];
206cdf0e10cSrcweir             }
207cdf0e10cSrcweir         }
208cdf0e10cSrcweir         out[j] = 0;
209cdf0e10cSrcweir 
210*4674bdb9SOliver-Rainer Wittmann         return OUString( pStr, SAL_NO_ACQUIRE ); // take over ownership of <pStr>
211cdf0e10cSrcweir     }
212cdf0e10cSrcweir     else
213cdf0e10cSrcweir     {
214cdf0e10cSrcweir         // In the simple case of no offset sequence used we can eliminate the
215cdf0e10cSrcweir         // first getValue() loop. We could also assume that most calls result
216cdf0e10cSrcweir         // in identical string lengths, thus using a preallocated
217cdf0e10cSrcweir         // OUStringBuffer could be an easy way to assemble the return string
218cdf0e10cSrcweir         // without too much hassle. However, for single characters the
219cdf0e10cSrcweir         // OUStringBuffer::append() method is quite expensive compared to a
220cdf0e10cSrcweir         // simple array operation, so it pays here to copy the final result
221cdf0e10cSrcweir         // instead.
222cdf0e10cSrcweir 
223cdf0e10cSrcweir         // Allocate the max possible buffer. Try to use stack instead of heap,
224cdf0e10cSrcweir         // which would have to be reallocated most times anyways.
225cdf0e10cSrcweir         const sal_Int32 nLocalBuf = 2048;
226cdf0e10cSrcweir         sal_Unicode aLocalBuf[ nLocalBuf * NMAPPINGMAX ], *out = aLocalBuf, *pHeapBuf = NULL;
227cdf0e10cSrcweir         if ( nCount > nLocalBuf )
228cdf0e10cSrcweir             out = pHeapBuf = new sal_Unicode[ nCount * NMAPPINGMAX ];
229cdf0e10cSrcweir 
230cdf0e10cSrcweir         sal_Int32 j = 0;
231cdf0e10cSrcweir         for ( sal_Int32 i = 0; i < nCount; i++)
232cdf0e10cSrcweir         {
233cdf0e10cSrcweir             // take care of TOGGLE_CASE transliteration:
234cdf0e10cSrcweir             sal_uInt8 nTmpMappingType = nMappingType;
235cdf0e10cSrcweir             if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
236cdf0e10cSrcweir                 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
237cdf0e10cSrcweir 
238cdf0e10cSrcweir             const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
239cdf0e10cSrcweir             for (sal_Int32 k = 0; k < map.nmap; k++)
240cdf0e10cSrcweir             {
241cdf0e10cSrcweir                 out[j++] = map.map[k];
242cdf0e10cSrcweir             }
243cdf0e10cSrcweir         }
244cdf0e10cSrcweir 
245cdf0e10cSrcweir         OUString aRet( out, j );
246cdf0e10cSrcweir         if ( pHeapBuf )
247cdf0e10cSrcweir             delete [] pHeapBuf;
248cdf0e10cSrcweir         return aRet;
249cdf0e10cSrcweir     }
250cdf0e10cSrcweir #endif
251cdf0e10cSrcweir }
252cdf0e10cSrcweir 
253cdf0e10cSrcweir OUString SAL_CALL
transliterateChar2String(sal_Unicode inChar)254cdf0e10cSrcweir Transliteration_body::transliterateChar2String( sal_Unicode inChar ) throw(RuntimeException)
255cdf0e10cSrcweir {
256cdf0e10cSrcweir         const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType);
257*4674bdb9SOliver-Rainer Wittmann         rtl_uString* pStr = x_rtl_uString_new_WithLength( map.nmap );  // our x_rtl_ustring.h
258cdf0e10cSrcweir         sal_Unicode* out = pStr->buffer;
259cdf0e10cSrcweir         sal_Int32 i;
260cdf0e10cSrcweir 
261cdf0e10cSrcweir         for (i = 0; i < map.nmap; i++)
262cdf0e10cSrcweir             out[i] = map.map[i];
263cdf0e10cSrcweir         out[i] = 0;
264cdf0e10cSrcweir 
265*4674bdb9SOliver-Rainer Wittmann         return OUString( pStr, SAL_NO_ACQUIRE ); // take over ownership of <pStr>
266cdf0e10cSrcweir }
267cdf0e10cSrcweir 
268cdf0e10cSrcweir sal_Unicode SAL_CALL
transliterateChar2Char(sal_Unicode inChar)269cdf0e10cSrcweir Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) throw(MultipleCharsOutputException, RuntimeException)
270cdf0e10cSrcweir {
271cdf0e10cSrcweir         const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType);
272cdf0e10cSrcweir         if (map.nmap > 1)
273cdf0e10cSrcweir             throw MultipleCharsOutputException();
274cdf0e10cSrcweir         return map.map[0];
275cdf0e10cSrcweir }
276cdf0e10cSrcweir 
277cdf0e10cSrcweir OUString SAL_CALL
folding(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,Sequence<sal_Int32> & offset)278cdf0e10cSrcweir Transliteration_body::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
279cdf0e10cSrcweir 	Sequence< sal_Int32 >& offset) throw(RuntimeException)
280cdf0e10cSrcweir {
281cdf0e10cSrcweir 	return this->transliterate(inStr, startPos, nCount, offset);
282cdf0e10cSrcweir }
283cdf0e10cSrcweir 
Transliteration_casemapping()284cdf0e10cSrcweir Transliteration_casemapping::Transliteration_casemapping()
285cdf0e10cSrcweir {
286cdf0e10cSrcweir 	nMappingType = 0;
287cdf0e10cSrcweir 	transliterationName = "casemapping(generic)";
288cdf0e10cSrcweir 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping";
289cdf0e10cSrcweir }
290cdf0e10cSrcweir 
291cdf0e10cSrcweir void SAL_CALL
setMappingType(const sal_uInt8 rMappingType,const Locale & rLocale)292cdf0e10cSrcweir Transliteration_casemapping::setMappingType( const sal_uInt8 rMappingType, const Locale& rLocale )
293cdf0e10cSrcweir {
294cdf0e10cSrcweir 	nMappingType = rMappingType;
295cdf0e10cSrcweir 	aLocale = rLocale;
296cdf0e10cSrcweir }
297cdf0e10cSrcweir 
Transliteration_u2l()298cdf0e10cSrcweir Transliteration_u2l::Transliteration_u2l()
299cdf0e10cSrcweir {
300cdf0e10cSrcweir 	nMappingType = MappingTypeUpperToLower;
301cdf0e10cSrcweir 	transliterationName = "upper_to_lower(generic)";
302cdf0e10cSrcweir 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l";
303cdf0e10cSrcweir }
304cdf0e10cSrcweir 
Transliteration_l2u()305cdf0e10cSrcweir Transliteration_l2u::Transliteration_l2u()
306cdf0e10cSrcweir {
307cdf0e10cSrcweir 	nMappingType = MappingTypeLowerToUpper;
308cdf0e10cSrcweir 	transliterationName = "lower_to_upper(generic)";
309cdf0e10cSrcweir 	implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u";
310cdf0e10cSrcweir }
311cdf0e10cSrcweir 
Transliteration_togglecase()312cdf0e10cSrcweir Transliteration_togglecase::Transliteration_togglecase()
313cdf0e10cSrcweir {
314cdf0e10cSrcweir     // usually nMappingType must NOT be a combiantion of different flages here,
315cdf0e10cSrcweir     // but we take care of that problem in Transliteration_body::transliterate above
316cdf0e10cSrcweir     // before that value is used. There we will decide which of both is to be used on
317cdf0e10cSrcweir     // a per character basis.
318cdf0e10cSrcweir     nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower;
319cdf0e10cSrcweir     transliterationName = "toggle(generic)";
320cdf0e10cSrcweir     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase";
321cdf0e10cSrcweir }
322cdf0e10cSrcweir 
Transliteration_titlecase()323cdf0e10cSrcweir Transliteration_titlecase::Transliteration_titlecase()
324cdf0e10cSrcweir {
325cdf0e10cSrcweir     nMappingType = MappingTypeToTitle;
326cdf0e10cSrcweir     transliterationName = "title(generic)";
327cdf0e10cSrcweir     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase";
328cdf0e10cSrcweir }
329cdf0e10cSrcweir 
#if 0
// Disabled: manual ligature resolution. Kept for reference only; the title
// case implementation below resolves the first character without it.

// One ligature code point together with the text it stands for.
struct LigatureData
{
    sal_uInt32        cChar;        // ligature code point, 0 terminates the table
    const sal_Char *  pUtf8Text;    // replacement text, UTF-8 encoded
};

// available Unicode ligatures:
// http://www.unicode.org/charts
// http://www.unicode.org/charts/PDF/UFB00.pdf
static const LigatureData aLigatures[] =
{
    { 0x0FB00,     "ff" },
    { 0x0FB01,     "fi" },
    { 0x0FB02,     "fl" },
    { 0x0FB03,     "ffi" },
    { 0x0FB04,     "ffl" },
    { 0x0FB05,     "ft" },
    { 0x0FB06,     "st" },

    { 0x0FB13,     "\xD5\xB4\xD5\xB6" },     // Armenian small men now
    { 0x0FB14,     "\xD5\xB4\xD5\xA5" },     // Armenian small men ech
    { 0x0FB15,     "\xD5\xB4\xD5\xAB" },     // Armenian small men ini
    { 0x0FB16,     "\xD5\xBE\xD5\xB6" },     // Armenian small vew now
    { 0x0FB17,     "\xD5\xB4\xD5\xAD" },     // Armenian small men xeh
    { 0x00000,     "" }
};

// True if cChar lies in one of the two ligature ranges covered by aLigatures.
static inline bool lcl_IsLigature( sal_uInt32 cChar )
{
    return (0x0FB00 <= cChar && cChar <= 0x0FB06) || (0x0FB13 <= cChar && cChar <= 0x0FB17);
}

// Returns the replacement text for a ligature, or the character itself as a
// one-code-point string if it is not a ligature. A code point inside the
// ligature ranges that has no table entry yields an empty string.
static rtl::OUString lcl_ResolveLigature( sal_uInt32 cChar )
{
    if (!lcl_IsLigature( cChar ))
        return rtl::OUString( &cChar, 1 );

    for (const LigatureData *pData = aLigatures; pData->cChar != 0; ++pData)
    {
        if (pData->cChar == cChar)
            return rtl::OUString( pData->pUtf8Text, strlen( pData->pUtf8Text ), RTL_TEXTENCODING_UTF8 );
    }
    return rtl::OUString();
}
#endif // if 0
384cdf0e10cSrcweir 
transliterate_titlecase_Impl(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,const Locale & rLocale,Sequence<sal_Int32> & offset)385cdf0e10cSrcweir static rtl::OUString transliterate_titlecase_Impl(
386cdf0e10cSrcweir     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
387cdf0e10cSrcweir     const Locale &rLocale,
388cdf0e10cSrcweir 	Sequence< sal_Int32 >& offset )
389cdf0e10cSrcweir     throw(RuntimeException)
390cdf0e10cSrcweir {
391cdf0e10cSrcweir     const OUString aText( inStr.copy( startPos, nCount ) );
392cdf0e10cSrcweir 
393cdf0e10cSrcweir     OUString aRes;
394cdf0e10cSrcweir     if (aText.getLength() > 0)
395cdf0e10cSrcweir     {
396cdf0e10cSrcweir         Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
397cdf0e10cSrcweir         CharacterClassificationImpl aCharClassImpl( xMSF );
398cdf0e10cSrcweir 
399cdf0e10cSrcweir         // because aCharClassImpl.toTitle does not handle ligatures or � but will raise
400cdf0e10cSrcweir         // an exception we need to handle the first chara manually...
401cdf0e10cSrcweir 
402cdf0e10cSrcweir         // we don't want to change surrogates by accident, thuse we use proper code point iteration
403cdf0e10cSrcweir         sal_Int32 nPos = 0;
404cdf0e10cSrcweir         sal_uInt32 cFirstChar = aText.iterateCodePoints( &nPos );
405cdf0e10cSrcweir         OUString aResolvedLigature( &cFirstChar, 1 ); //lcl_ResolveLigature( cFirstChar ) );
406cdf0e10cSrcweir         // toUpper can be used to properly resolve ligatures and characters like �
407cdf0e10cSrcweir         aResolvedLigature = aCharClassImpl.toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
408cdf0e10cSrcweir         // since toTitle will leave all-uppercase text unchanged we first need to
409cdf0e10cSrcweir         // use toLower to bring possible 2nd and following charas in lowercase
410cdf0e10cSrcweir         aResolvedLigature = aCharClassImpl.toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
411cdf0e10cSrcweir         sal_Int32 nResolvedLen = aResolvedLigature.getLength();
412cdf0e10cSrcweir 
413cdf0e10cSrcweir         // now we can properly use toTitle to get the expected result for the resolved string.
414cdf0e10cSrcweir         // The rest of the text should just become lowercase.
415cdf0e10cSrcweir         aRes = aCharClassImpl.toTitle( aResolvedLigature, 0, nResolvedLen, rLocale );
416cdf0e10cSrcweir         aRes += aCharClassImpl.toLower( aText, 1, aText.getLength() - 1, rLocale );
417cdf0e10cSrcweir         offset.realloc( aRes.getLength() );
418cdf0e10cSrcweir 
419cdf0e10cSrcweir         sal_Int32 *pOffset = offset.getArray();
420cdf0e10cSrcweir         sal_Int32 nLen = offset.getLength();
421cdf0e10cSrcweir         for (sal_Int32 i = 0; i < nLen; ++i)
422cdf0e10cSrcweir         {
423cdf0e10cSrcweir             sal_Int32 nIdx = 0;
424cdf0e10cSrcweir             if (i >= nResolvedLen)
425cdf0e10cSrcweir                 nIdx = i - nResolvedLen + 1;
426cdf0e10cSrcweir             pOffset[i] = nIdx;
427cdf0e10cSrcweir         }
428cdf0e10cSrcweir     }
429cdf0e10cSrcweir #if OSL_DEBUG_LEVEL > 1
430cdf0e10cSrcweir     const sal_Int32 *pCOffset = offset.getConstArray();
431cdf0e10cSrcweir     (void) pCOffset;
432cdf0e10cSrcweir #endif
433cdf0e10cSrcweir 
434cdf0e10cSrcweir     return aRes;
435cdf0e10cSrcweir }
436cdf0e10cSrcweir 
437cdf0e10cSrcweir 
438cdf0e10cSrcweir // this function expects to be called on a word-by-word basis,
439cdf0e10cSrcweir // namely that startPos points to the first char of the word
transliterate(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,Sequence<sal_Int32> & offset)440cdf0e10cSrcweir rtl::OUString SAL_CALL Transliteration_titlecase::transliterate(
441cdf0e10cSrcweir     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
442cdf0e10cSrcweir 	Sequence< sal_Int32 >& offset )
443cdf0e10cSrcweir     throw(RuntimeException)
444cdf0e10cSrcweir {
445cdf0e10cSrcweir     return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
446cdf0e10cSrcweir }
447cdf0e10cSrcweir 
448cdf0e10cSrcweir 
Transliteration_sentencecase()449cdf0e10cSrcweir Transliteration_sentencecase::Transliteration_sentencecase()
450cdf0e10cSrcweir {
451cdf0e10cSrcweir     nMappingType = MappingTypeToTitle;  // though only to be applied to the first word...
452cdf0e10cSrcweir     transliterationName = "sentence(generic)";
453cdf0e10cSrcweir     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase";
454cdf0e10cSrcweir }
455cdf0e10cSrcweir 
456cdf0e10cSrcweir 
457cdf0e10cSrcweir // this function expects to be called on a sentence-by-sentence basis,
458cdf0e10cSrcweir // namely that startPos points to the first word (NOT first char!) in the sentence
transliterate(const OUString & inStr,sal_Int32 startPos,sal_Int32 nCount,Sequence<sal_Int32> & offset)459cdf0e10cSrcweir rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate(
460cdf0e10cSrcweir     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
461cdf0e10cSrcweir     Sequence< sal_Int32 >& offset )
462cdf0e10cSrcweir     throw(RuntimeException)
463cdf0e10cSrcweir {
464cdf0e10cSrcweir     return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
465cdf0e10cSrcweir }
466cdf0e10cSrcweir 
467cdf0e10cSrcweir 
468cdf0e10cSrcweir } } } }
469cdf0e10cSrcweir 
470