1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26 
27 // xdictionary.cpp: implementation of the xdictionary class.
28 //
29 //////////////////////////////////////////////////////////////////////
30 
31 
32 #include <rtl/ustrbuf.hxx>
33 
34 #include <com/sun/star/i18n/WordType.hpp>
35 #include <xdictionary.hxx>
36 #include <unicode/uchar.h>
37 #include <string.h>
38 #include <breakiteratorImpl.hxx>
39 
40 //////////////////////////////////////////////////////////////////////
41 // Construction/Destruction
42 //////////////////////////////////////////////////////////////////////
43 
44 using namespace rtl;
45 
46 namespace com { namespace sun { namespace star { namespace i18n {
47 
thisModule()48 extern "C" { static void SAL_CALL thisModule() {} }
49 
xdictionary(const sal_Char * lang)50 xdictionary::xdictionary(const sal_Char *lang) :
51     existMark( NULL ),
52     index1( NULL ),
53     index2( NULL ),
54     lenArray( NULL ),
55     dataArea( NULL ),
56     hModule( NULL ),
57     boundary(),
58     japaneseWordBreak( sal_False )
59 #if USE_CELL_BOUNDARY_CODE
60     // For CTL breakiterator, where the word boundary should not be inside cell.
61     ,
62     useCellBoundary( sal_False ),
63     cellBoundary( NULL )
64 #endif
65 {
66 	index1 = 0;
67 #ifdef SAL_DLLPREFIX
68     OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
69     aBuf.appendAscii( SAL_DLLPREFIX );
70 #else
71     OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
72 #endif
73     aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
74         hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
75         if( hModule ) {
76             sal_IntPtr (*func)();
77             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
78             existMark = (sal_uInt8*) (*func)();
79             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
80             index1 = (sal_Int16*) (*func)();
81             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
82             index2 = (sal_Int32*) (*func)();
83             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
84             lenArray = (sal_Int32*) (*func)();
85             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
86             dataArea = (sal_Unicode*) (*func)();
87         }
88         else
89 		{
90             existMark = NULL;
91 			index1 = NULL;
92 			index2 = NULL;
93 			lenArray = NULL;
94 			dataArea = NULL;
95 		}
96 
97 		for (sal_Int32 i = 0; i < CACHE_MAX; i++)
98             cache[i].size = 0;
99 
100 #if USE_CELL_BOUNDARY_CODE
101         useCellBoundary = sal_False;
102         cellBoundary = NULL;
103 #endif
104         japaneseWordBreak = sal_False;
105 }
106 
~xdictionary()107 xdictionary::~xdictionary() {
108         osl_unloadModule(hModule);
109         for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
110             if (cache[i].size > 0) {
111                 delete cache[i].contents;
112                 delete cache[i].wordboundary;
113             }
114         }
115 }
116 
setJapaneseWordBreak()117 void xdictionary::setJapaneseWordBreak()
118 {
119         japaneseWordBreak = sal_True;
120 }
121 
exists(const sal_uInt32 c)122 sal_Bool xdictionary::exists(const sal_uInt32 c) {
123         // 0x1FFF is the hardcoded limit in gendict for existMarks
124         sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
125         if (!exist && japaneseWordBreak)
126             return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
127         else
128             return exist;
129 }
130 
getLongestMatch(const sal_Unicode * str,sal_Int32 sLen)131 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
132 
133 		if ( !index1 ) return 0;
134 
135         sal_Int16 idx = index1[str[0] >> 8];
136 
137         if (idx == 0xFF) return 0;
138 
139         idx = (idx<<8) | (str[0]&0xff);
140 
141         sal_uInt32 begin = index2[idx], end = index2[idx+1];
142 
143         if (begin == 0) return 0;
144 
145         str++; sLen--; // first character is not stored in the dictionary
146         for (sal_uInt32 i = end; i > begin; i--) {
147             sal_Int32 len = lenArray[i] - lenArray[i - 1];
148             if (sLen >= len) {
149                 const sal_Unicode *dstr = dataArea + lenArray[i-1];
150                 sal_Int32 pos = 0;
151 
152                 while (pos < len && dstr[pos] == str[pos]) { pos++; }
153 
154                 if (pos == len)
155                     return len + 1;
156             }
157         }
158         return 0;
159 }
160 
161 
162 /*
163  * c-tor
164  */
165 
WordBreakCache()166 WordBreakCache::WordBreakCache() :
167     length( 0 ),
168     contents( NULL ),
169     wordboundary( NULL ),
170     size( 0 )
171 {
172 }
173 
174 /*
175  * Compare two unicode string,
176  */
177 
equals(const sal_Unicode * str,Boundary & boundary)178 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
179         // Different length, different string.
180         if (length != boundary.endPos - boundary.startPos) return sal_False;
181 
182         for (sal_Int32 i = 0; i < length; i++)
183             if (contents[i] != str[i + boundary.startPos]) return sal_False;
184 
185         return sal_True;
186 }
187 
188 
189 /*
190  * Retrieve the segment containing the character at pos.
191  * @param pos : Position of the given character.
192  * @return true if CJK.
193  */
seekSegment(const rtl::OUString & rText,sal_Int32 pos,Boundary & segBoundary)194 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
195 	Boundary& segBoundary)
196 {
197     sal_Int32 indexUtf16;
198     segBoundary.endPos = segBoundary.startPos = pos;
199 
200     indexUtf16 = pos;
201     while (indexUtf16 > 0)
202     {
203         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
204         if (u_isWhitespace(ch) || exists(ch))
205             segBoundary.startPos = indexUtf16;
206         else
207             break;
208     }
209 
210     indexUtf16 = pos;
211     while (indexUtf16 < rText.getLength())
212     {
213         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
214         if (u_isWhitespace(ch) || exists(ch))
215             segBoundary.endPos = indexUtf16;
216         else
217             break;
218     }
219 
220     indexUtf16 = segBoundary.startPos;
221     rText.iterateCodePoints(&indexUtf16, 1);
222     return segBoundary.endPos > indexUtf16;
223 }
224 
225 #define KANJA       1
226 #define KATAKANA    2
227 #define HIRAKANA    3
228 
JapaneseCharType(sal_Unicode c)229 static sal_Int16 JapaneseCharType(sal_Unicode c)
230 {
231     if (0x3041 <= c && c <= 0x309e)
232         return HIRAKANA;
233     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
234         return KATAKANA;
235     return KANJA;
236 }
237 
getCache(const sal_Unicode * text,Boundary & wordBoundary)238 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
239 {
240 
241         WordBreakCache& aCache = cache[text[0] & 0x1f];
242 
243         if (aCache.size != 0 && aCache.equals(text, wordBoundary))
244             return aCache;
245 
246         sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
247 
248         if (aCache.size == 0 || len > aCache.size) {
249             if (aCache.size != 0) {
250                 delete aCache.contents;
251                 delete aCache.wordboundary;
252                 aCache.size = len;
253             }
254             else
255                 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
256             aCache.contents = new sal_Unicode[aCache.size + 1];
257             aCache.wordboundary = new sal_Int32[aCache.size + 2];
258         }
259         aCache.length  = len;
260         memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
261         *(aCache.contents + len) = 0x0000;
262         // reset the wordboundary in cache
263         memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
264 
265         sal_Int32 i = 0;        // loop variable
266         while (aCache.wordboundary[i] < aCache.length) {
267             len = 0;
268             // look the continuous white space as one word and cashe it
269             while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
270                 len ++;
271 
272             if (len == 0) {
273                 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
274                 sal_Int32 slen = aCache.length - aCache.wordboundary[i];
275                 sal_Int16 type = 0, count = 0;
276                 for (;len == 0 && slen > 0; str++, slen--) {
277                     len = getLongestMatch(str, slen);
278                     if (len == 0) {
279                         if (!japaneseWordBreak) {
280                             len = 1;
281                         } else {
282                             if (count == 0)
283                                 type = JapaneseCharType(*str);
284                             else if (type != JapaneseCharType(*str))
285                                 break;
286                             count++;
287                         }
288                     }
289                 }
290                 if (count) {
291                     aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
292                     i++;
293 
294 #if USE_CELL_BOUNDARY_CODE
295                     if (useCellBoundary) {
296                         sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
297                         if (cBoundary > 0)
298                             aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
299                     }
300 #endif
301                 }
302             }
303 
304             if (len) {
305                 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
306                 i++;
307 
308 #if USE_CELL_BOUNDARY_CODE
309                 if (useCellBoundary) {
310                     sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
311                     if (cBoundary > 0)
312                         aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
313                 }
314 #endif
315             }
316         }
317         aCache.wordboundary[i + 1] = aCache.length + 1;
318 
319         return aCache;
320 }
321 
previousWord(const OUString & rText,sal_Int32 anyPos,sal_Int16 wordType)322 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
323 {
324         // looking for the first non-whitespace character from anyPos
325         sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
326 
327         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
328 
329         return getWordBoundary(rText, anyPos, wordType, true);
330 }
331 
nextWord(const OUString & rText,sal_Int32 anyPos,sal_Int16 wordType)332 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
333 {
334         boundary = getWordBoundary(rText, anyPos, wordType, true);
335         anyPos = boundary.endPos;
336         if (anyPos < rText.getLength()) {
337             // looknig for the first non-whitespace character from anyPos
338             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
339             while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
340             rText.iterateCodePoints(&anyPos, -1);
341         }
342 
343         return getWordBoundary(rText, anyPos, wordType, true);
344 }
345 
getWordBoundary(const OUString & rText,sal_Int32 anyPos,sal_Int16 wordType,sal_Bool bDirection)346 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
347 {
348         const sal_Unicode *text=rText.getStr();
349         sal_Int32 len=rText.getLength();
350         if (anyPos >= len || anyPos < 0) {
351             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
352         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
353             WordBreakCache& aCache = getCache(text, boundary);
354             sal_Int32 i = 0;
355 
356             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
357 
358             sal_Int32 startPos = aCache.wordboundary[i - 1];
359             // if bDirection is false
360             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
361             {
362                 sal_Int32 indexUtf16 = anyPos-1;
363                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
364                 if (u_isWhitespace(ch))
365                     i--;
366             }
367             boundary.endPos = boundary.startPos;
368             rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
369             rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
370         } else {
371             boundary.startPos = anyPos;
372             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
373             boundary.endPos = anyPos < len ? anyPos : len;
374         }
375         if (wordType == WordType::WORD_COUNT) {
376             // skip punctuation for word count.
377             while (boundary.endPos < len)
378             {
379                 sal_Int32 indexUtf16 = boundary.endPos;
380                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
381                     boundary.endPos = indexUtf16;
382                 else
383                     break;
384             }
385         }
386 
387         return boundary;
388 }
389 
390 #if USE_CELL_BOUNDARY_CODE
setCellBoundary(sal_Int32 * cellArray)391 void xdictionary::setCellBoundary(sal_Int32* cellArray)
392 {
393         useCellBoundary = sal_True;
394         cellBoundary = cellArray;
395 }
396 #endif
397 
398 } } } }
399