1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26
27 // xdictionary.cpp: implementation of the xdictionary class.
28 //
29 //////////////////////////////////////////////////////////////////////
30
31
32 #include <rtl/ustrbuf.hxx>
33
34 #include <com/sun/star/i18n/WordType.hpp>
35 #include <xdictionary.hxx>
36 #include <unicode/uchar.h>
37 #include <string.h>
38 #include <breakiteratorImpl.hxx>
39
40 //////////////////////////////////////////////////////////////////////
41 // Construction/Destruction
42 //////////////////////////////////////////////////////////////////////
43
44 using namespace rtl;
45
46 namespace com { namespace sun { namespace star { namespace i18n {
47
// Intentionally empty function. Its address is handed to
// osl_loadModuleRelative() in the xdictionary constructor as the reference
// point for locating the dictionary library (presumably the library path is
// resolved relative to the module that contains this function).
extern "C" { static void SAL_CALL thisModule() {} }
49
xdictionary(const sal_Char * lang)50 xdictionary::xdictionary(const sal_Char *lang) :
51 existMark( NULL ),
52 index1( NULL ),
53 index2( NULL ),
54 lenArray( NULL ),
55 dataArea( NULL ),
56 hModule( NULL ),
57 boundary(),
58 japaneseWordBreak( sal_False )
59 #if USE_CELL_BOUNDARY_CODE
60 // For CTL breakiterator, where the word boundary should not be inside cell.
61 ,
62 useCellBoundary( sal_False ),
63 cellBoundary( NULL )
64 #endif
65 {
66 index1 = 0;
67 #ifdef SAL_DLLPREFIX
68 OUStringBuffer aBuf( strlen(lang) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh)
69 aBuf.appendAscii( SAL_DLLPREFIX );
70 #else
71 OUStringBuffer aBuf( strlen(lang) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh)
72 #endif
73 aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
74 hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
75 if( hModule ) {
76 sal_IntPtr (*func)();
77 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
78 existMark = (sal_uInt8*) (*func)();
79 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
80 index1 = (sal_Int16*) (*func)();
81 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
82 index2 = (sal_Int32*) (*func)();
83 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
84 lenArray = (sal_Int32*) (*func)();
85 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
86 dataArea = (sal_Unicode*) (*func)();
87 }
88 else
89 {
90 existMark = NULL;
91 index1 = NULL;
92 index2 = NULL;
93 lenArray = NULL;
94 dataArea = NULL;
95 }
96
97 for (sal_Int32 i = 0; i < CACHE_MAX; i++)
98 cache[i].size = 0;
99
100 #if USE_CELL_BOUNDARY_CODE
101 useCellBoundary = sal_False;
102 cellBoundary = NULL;
103 #endif
104 japaneseWordBreak = sal_False;
105 }
106
~xdictionary()107 xdictionary::~xdictionary() {
108 osl_unloadModule(hModule);
109 for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
110 if (cache[i].size > 0) {
111 delete cache[i].contents;
112 delete cache[i].wordboundary;
113 }
114 }
115 }
116
// Switches this dictionary into Japanese word-break mode. The flag is read by
// exists() (characters of ASIAN script class then count as known) and by
// getCache() (characters without a dictionary match are grouped by
// JapaneseCharType()). There is no way to switch it off again in this file.
void xdictionary::setJapaneseWordBreak()
{
    japaneseWordBreak = sal_True;
}
121
exists(const sal_uInt32 c)122 sal_Bool xdictionary::exists(const sal_uInt32 c) {
123 // 0x1FFF is the hardcoded limit in gendict for existMarks
124 sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
125 if (!exist && japaneseWordBreak)
126 return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
127 else
128 return exist;
129 }
130
getLongestMatch(const sal_Unicode * str,sal_Int32 sLen)131 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
132
133 if ( !index1 ) return 0;
134
135 sal_Int16 idx = index1[str[0] >> 8];
136
137 if (idx == 0xFF) return 0;
138
139 idx = (idx<<8) | (str[0]&0xff);
140
141 sal_uInt32 begin = index2[idx], end = index2[idx+1];
142
143 if (begin == 0) return 0;
144
145 str++; sLen--; // first character is not stored in the dictionary
146 for (sal_uInt32 i = end; i > begin; i--) {
147 sal_Int32 len = lenArray[i] - lenArray[i - 1];
148 if (sLen >= len) {
149 const sal_Unicode *dstr = dataArea + lenArray[i-1];
150 sal_Int32 pos = 0;
151
152 while (pos < len && dstr[pos] == str[pos]) { pos++; }
153
154 if (pos == len)
155 return len + 1;
156 }
157 }
158 return 0;
159 }
160
161
/*
 * Constructs an empty cache slot.
 *
 * As used by xdictionary::getCache(): length is the number of code units of
 * the cached segment, contents holds a copy of that segment, wordboundary
 * holds the word break offsets found for it, and size is the capacity the two
 * buffers were allocated for (0 means nothing has been allocated yet).
 */

WordBreakCache::WordBreakCache() :
    length( 0 ),
    contents( NULL ),
    wordboundary( NULL ),
    size( 0 )
{
}
173
174 /*
175 * Compare two unicode string,
176 */
177
equals(const sal_Unicode * str,Boundary & boundary)178 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
179 // Different length, different string.
180 if (length != boundary.endPos - boundary.startPos) return sal_False;
181
182 for (sal_Int32 i = 0; i < length; i++)
183 if (contents[i] != str[i + boundary.startPos]) return sal_False;
184
185 return sal_True;
186 }
187
188
189 /*
190 * Retrieve the segment containing the character at pos.
191 * @param pos : Position of the given character.
192 * @return true if CJK.
193 */
seekSegment(const rtl::OUString & rText,sal_Int32 pos,Boundary & segBoundary)194 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
195 Boundary& segBoundary)
196 {
197 sal_Int32 indexUtf16;
198 segBoundary.endPos = segBoundary.startPos = pos;
199
200 indexUtf16 = pos;
201 while (indexUtf16 > 0)
202 {
203 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
204 if (u_isWhitespace(ch) || exists(ch))
205 segBoundary.startPos = indexUtf16;
206 else
207 break;
208 }
209
210 indexUtf16 = pos;
211 while (indexUtf16 < rText.getLength())
212 {
213 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
214 if (u_isWhitespace(ch) || exists(ch))
215 segBoundary.endPos = indexUtf16;
216 else
217 break;
218 }
219
220 indexUtf16 = segBoundary.startPos;
221 rText.iterateCodePoints(&indexUtf16, 1);
222 return segBoundary.endPos > indexUtf16;
223 }
224
225 #define KANJA 1
226 #define KATAKANA 2
227 #define HIRAKANA 3
228
JapaneseCharType(sal_Unicode c)229 static sal_Int16 JapaneseCharType(sal_Unicode c)
230 {
231 if (0x3041 <= c && c <= 0x309e)
232 return HIRAKANA;
233 if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
234 return KATAKANA;
235 return KANJA;
236 }
237
getCache(const sal_Unicode * text,Boundary & wordBoundary)238 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
239 {
240
241 WordBreakCache& aCache = cache[text[0] & 0x1f];
242
243 if (aCache.size != 0 && aCache.equals(text, wordBoundary))
244 return aCache;
245
246 sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
247
248 if (aCache.size == 0 || len > aCache.size) {
249 if (aCache.size != 0) {
250 delete aCache.contents;
251 delete aCache.wordboundary;
252 aCache.size = len;
253 }
254 else
255 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
256 aCache.contents = new sal_Unicode[aCache.size + 1];
257 aCache.wordboundary = new sal_Int32[aCache.size + 2];
258 }
259 aCache.length = len;
260 memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
261 *(aCache.contents + len) = 0x0000;
262 // reset the wordboundary in cache
263 memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
264
265 sal_Int32 i = 0; // loop variable
266 while (aCache.wordboundary[i] < aCache.length) {
267 len = 0;
268 // look the continuous white space as one word and cashe it
269 while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
270 len ++;
271
272 if (len == 0) {
273 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
274 sal_Int32 slen = aCache.length - aCache.wordboundary[i];
275 sal_Int16 type = 0, count = 0;
276 for (;len == 0 && slen > 0; str++, slen--) {
277 len = getLongestMatch(str, slen);
278 if (len == 0) {
279 if (!japaneseWordBreak) {
280 len = 1;
281 } else {
282 if (count == 0)
283 type = JapaneseCharType(*str);
284 else if (type != JapaneseCharType(*str))
285 break;
286 count++;
287 }
288 }
289 }
290 if (count) {
291 aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
292 i++;
293
294 #if USE_CELL_BOUNDARY_CODE
295 if (useCellBoundary) {
296 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
297 if (cBoundary > 0)
298 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
299 }
300 #endif
301 }
302 }
303
304 if (len) {
305 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
306 i++;
307
308 #if USE_CELL_BOUNDARY_CODE
309 if (useCellBoundary) {
310 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
311 if (cBoundary > 0)
312 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
313 }
314 #endif
315 }
316 }
317 aCache.wordboundary[i + 1] = aCache.length + 1;
318
319 return aCache;
320 }
321
previousWord(const OUString & rText,sal_Int32 anyPos,sal_Int16 wordType)322 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
323 {
324 // looking for the first non-whitespace character from anyPos
325 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
326
327 while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
328
329 return getWordBoundary(rText, anyPos, wordType, true);
330 }
331
nextWord(const OUString & rText,sal_Int32 anyPos,sal_Int16 wordType)332 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
333 {
334 boundary = getWordBoundary(rText, anyPos, wordType, true);
335 anyPos = boundary.endPos;
336 if (anyPos < rText.getLength()) {
337 // looknig for the first non-whitespace character from anyPos
338 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
339 while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
340 rText.iterateCodePoints(&anyPos, -1);
341 }
342
343 return getWordBoundary(rText, anyPos, wordType, true);
344 }
345
getWordBoundary(const OUString & rText,sal_Int32 anyPos,sal_Int16 wordType,sal_Bool bDirection)346 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
347 {
348 const sal_Unicode *text=rText.getStr();
349 sal_Int32 len=rText.getLength();
350 if (anyPos >= len || anyPos < 0) {
351 boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
352 } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
353 WordBreakCache& aCache = getCache(text, boundary);
354 sal_Int32 i = 0;
355
356 while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
357
358 sal_Int32 startPos = aCache.wordboundary[i - 1];
359 // if bDirection is false
360 if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
361 {
362 sal_Int32 indexUtf16 = anyPos-1;
363 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
364 if (u_isWhitespace(ch))
365 i--;
366 }
367 boundary.endPos = boundary.startPos;
368 rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
369 rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
370 } else {
371 boundary.startPos = anyPos;
372 if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
373 boundary.endPos = anyPos < len ? anyPos : len;
374 }
375 if (wordType == WordType::WORD_COUNT) {
376 // skip punctuation for word count.
377 while (boundary.endPos < len)
378 {
379 sal_Int32 indexUtf16 = boundary.endPos;
380 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
381 boundary.endPos = indexUtf16;
382 else
383 break;
384 }
385 }
386
387 return boundary;
388 }
389
390 #if USE_CELL_BOUNDARY_CODE
// Enables cell-boundary correction and stores the array to use for it.
// getCache() indexes cellArray by absolute text position and, for positive
// entries, moves a word boundary to the stored value. The pointer is kept as
// is, not copied.
// NOTE(review): cellArray presumably has to cover the whole text and outlive
// this object's use of it - confirm against the caller.
void xdictionary::setCellBoundary(sal_Int32* cellArray)
{
    useCellBoundary = sal_True;
    cellBoundary = cellArray;
}
396 #endif
397
398 } } } }
399