1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 #include <com/sun/star/i18n/UnicodeType.hpp>
25 #include <com/sun/star/i18n/KCharacterType.hpp>
26 #include <i18nutil/unicode.hxx>
27 #include "unicode_data.h"
28
29 using namespace ::com::sun::star::i18n;
30
31 static ScriptTypeList defaultTypeList[] = {
32 { UnicodeScript_kBasicLatin,
33 UnicodeScript_kBasicLatin,
34 UnicodeScript_kBasicLatin }, // 0,
35 { UnicodeScript_kLatin1Supplement,
36 UnicodeScript_kLatin1Supplement,
37 UnicodeScript_kLatin1Supplement },// 1,
38 { UnicodeScript_kLatinExtendedA,
39 UnicodeScript_kLatinExtendedA,
40 UnicodeScript_kLatinExtendedA }, // 2,
41 { UnicodeScript_kLatinExtendedB,
42 UnicodeScript_kLatinExtendedB,
43 UnicodeScript_kLatinExtendedB }, // 3,
44 { UnicodeScript_kIPAExtension,
45 UnicodeScript_kIPAExtension,
46 UnicodeScript_kIPAExtension }, // 4,
47 { UnicodeScript_kSpacingModifier,
48 UnicodeScript_kSpacingModifier,
49 UnicodeScript_kSpacingModifier }, // 5,
50 { UnicodeScript_kCombiningDiacritical,
51 UnicodeScript_kCombiningDiacritical,
52 UnicodeScript_kCombiningDiacritical }, // 6,
53 { UnicodeScript_kGreek,
54 UnicodeScript_kGreek,
55 UnicodeScript_kGreek }, // 7,
56 { UnicodeScript_kCyrillic,
57 UnicodeScript_kCyrillic,
58 UnicodeScript_kCyrillic }, // 8,
59 { UnicodeScript_kArmenian,
60 UnicodeScript_kArmenian,
61 UnicodeScript_kArmenian }, // 9,
62 { UnicodeScript_kHebrew,
63 UnicodeScript_kHebrew,
64 UnicodeScript_kHebrew }, // 10,
65 { UnicodeScript_kArabic,
66 UnicodeScript_kArabic,
67 UnicodeScript_kArabic }, // 11,
68 { UnicodeScript_kSyriac,
69 UnicodeScript_kSyriac,
70 UnicodeScript_kSyriac }, // 12,
71 { UnicodeScript_kThaana,
72 UnicodeScript_kThaana,
73 UnicodeScript_kThaana }, // 13,
74 { UnicodeScript_kDevanagari,
75 UnicodeScript_kDevanagari,
76 UnicodeScript_kDevanagari }, // 14,
77 { UnicodeScript_kBengali,
78 UnicodeScript_kBengali,
79 UnicodeScript_kBengali }, // 15,
80 { UnicodeScript_kGurmukhi,
81 UnicodeScript_kGurmukhi,
82 UnicodeScript_kGurmukhi }, // 16,
83 { UnicodeScript_kGujarati,
84 UnicodeScript_kGujarati,
85 UnicodeScript_kGujarati }, // 17,
86 { UnicodeScript_kOriya,
87 UnicodeScript_kOriya,
88 UnicodeScript_kOriya }, // 18,
89 { UnicodeScript_kTamil,
90 UnicodeScript_kTamil,
91 UnicodeScript_kTamil }, // 19,
92 { UnicodeScript_kTelugu,
93 UnicodeScript_kTelugu,
94 UnicodeScript_kTelugu }, // 20,
95 { UnicodeScript_kKannada,
96 UnicodeScript_kKannada,
97 UnicodeScript_kKannada }, // 21,
98 { UnicodeScript_kMalayalam,
99 UnicodeScript_kMalayalam,
100 UnicodeScript_kMalayalam }, // 22,
101 { UnicodeScript_kSinhala,
102 UnicodeScript_kSinhala,
103 UnicodeScript_kSinhala }, // 23,
104 { UnicodeScript_kThai,
105 UnicodeScript_kThai,
106 UnicodeScript_kThai }, // 24,
107 { UnicodeScript_kLao,
108 UnicodeScript_kLao,
109 UnicodeScript_kLao }, // 25,
110 { UnicodeScript_kTibetan,
111 UnicodeScript_kTibetan,
112 UnicodeScript_kTibetan }, // 26,
113 { UnicodeScript_kMyanmar,
114 UnicodeScript_kMyanmar,
115 UnicodeScript_kMyanmar }, // 27,
116 { UnicodeScript_kGeorgian,
117 UnicodeScript_kGeorgian,
118 UnicodeScript_kGeorgian }, // 28,
119 { UnicodeScript_kHangulJamo,
120 UnicodeScript_kHangulJamo,
121 UnicodeScript_kHangulJamo }, // 29,
122 { UnicodeScript_kEthiopic,
123 UnicodeScript_kEthiopic,
124 UnicodeScript_kEthiopic }, // 30,
125 { UnicodeScript_kCherokee,
126 UnicodeScript_kCherokee,
127 UnicodeScript_kCherokee }, // 31,
128 { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
129 UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
130 UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
131 { UnicodeScript_kOgham,
132 UnicodeScript_kOgham,
133 UnicodeScript_kOgham }, // 33,
134 { UnicodeScript_kRunic,
135 UnicodeScript_kRunic,
136 UnicodeScript_kRunic }, // 34,
137 { UnicodeScript_kKhmer,
138 UnicodeScript_kKhmer,
139 UnicodeScript_kKhmer }, // 35,
140 { UnicodeScript_kMongolian,
141 UnicodeScript_kMongolian,
142 UnicodeScript_kMongolian }, // 36,
143 { UnicodeScript_kLatinExtendedAdditional,
144 UnicodeScript_kLatinExtendedAdditional,
145 UnicodeScript_kLatinExtendedAdditional }, // 37,
146 { UnicodeScript_kGreekExtended,
147 UnicodeScript_kGreekExtended,
148 UnicodeScript_kGreekExtended }, // 38,
149 { UnicodeScript_kGeneralPunctuation,
150 UnicodeScript_kGeneralPunctuation,
151 UnicodeScript_kGeneralPunctuation }, // 39,
152 { UnicodeScript_kSuperSubScript,
153 UnicodeScript_kSuperSubScript,
154 UnicodeScript_kSuperSubScript }, // 40,
155 { UnicodeScript_kCurrencySymbolScript,
156 UnicodeScript_kCurrencySymbolScript,
157 UnicodeScript_kCurrencySymbolScript }, // 41,
158 { UnicodeScript_kSymbolCombiningMark,
159 UnicodeScript_kSymbolCombiningMark,
160 UnicodeScript_kSymbolCombiningMark }, // 42,
161 { UnicodeScript_kLetterlikeSymbol,
162 UnicodeScript_kLetterlikeSymbol,
163 UnicodeScript_kLetterlikeSymbol }, // 43,
164 { UnicodeScript_kNumberForm,
165 UnicodeScript_kNumberForm,
166 UnicodeScript_kNumberForm }, // 44,
167 { UnicodeScript_kArrow,
168 UnicodeScript_kArrow,
169 UnicodeScript_kArrow }, // 45,
170 { UnicodeScript_kMathOperator,
171 UnicodeScript_kMathOperator,
172 UnicodeScript_kMathOperator }, // 46,
173 { UnicodeScript_kMiscTechnical,
174 UnicodeScript_kMiscTechnical,
175 UnicodeScript_kMiscTechnical }, // 47,
176 { UnicodeScript_kControlPicture,
177 UnicodeScript_kControlPicture,
178 UnicodeScript_kControlPicture }, // 48,
179 { UnicodeScript_kOpticalCharacter,
180 UnicodeScript_kOpticalCharacter,
181 UnicodeScript_kOpticalCharacter }, // 49,
182 { UnicodeScript_kEnclosedAlphanumeric,
183 UnicodeScript_kEnclosedAlphanumeric,
184 UnicodeScript_kEnclosedAlphanumeric }, // 50,
185 { UnicodeScript_kBoxDrawing,
186 UnicodeScript_kBoxDrawing,
187 UnicodeScript_kBoxDrawing }, // 51,
188 { UnicodeScript_kBlockElement,
189 UnicodeScript_kBlockElement,
190 UnicodeScript_kBlockElement }, // 52,
191 { UnicodeScript_kGeometricShape,
192 UnicodeScript_kGeometricShape,
193 UnicodeScript_kGeometricShape }, // 53,
194 { UnicodeScript_kMiscSymbol,
195 UnicodeScript_kMiscSymbol,
196 UnicodeScript_kMiscSymbol }, // 54,
197 { UnicodeScript_kDingbat,
198 UnicodeScript_kDingbat,
199 UnicodeScript_kDingbat }, // 55,
200 { UnicodeScript_kBraillePatterns,
201 UnicodeScript_kBraillePatterns,
202 UnicodeScript_kBraillePatterns }, // 56,
203 { UnicodeScript_kCJKRadicalsSupplement,
204 UnicodeScript_kCJKRadicalsSupplement,
205 UnicodeScript_kCJKRadicalsSupplement }, // 57,
206 { UnicodeScript_kKangxiRadicals,
207 UnicodeScript_kKangxiRadicals,
208 UnicodeScript_kKangxiRadicals }, // 58,
209 { UnicodeScript_kIdeographicDescriptionCharacters,
210 UnicodeScript_kIdeographicDescriptionCharacters,
211 UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
212 { UnicodeScript_kCJKSymbolPunctuation,
213 UnicodeScript_kCJKSymbolPunctuation,
214 UnicodeScript_kCJKSymbolPunctuation }, // 60,
215 { UnicodeScript_kHiragana,
216 UnicodeScript_kHiragana,
217 UnicodeScript_kHiragana }, // 61,
218 { UnicodeScript_kKatakana,
219 UnicodeScript_kKatakana,
220 UnicodeScript_kKatakana }, // 62,
221 { UnicodeScript_kBopomofo,
222 UnicodeScript_kBopomofo,
223 UnicodeScript_kBopomofo }, // 63,
224 { UnicodeScript_kHangulCompatibilityJamo,
225 UnicodeScript_kHangulCompatibilityJamo,
226 UnicodeScript_kHangulCompatibilityJamo }, // 64,
227 { UnicodeScript_kKanbun,
228 UnicodeScript_kKanbun,
229 UnicodeScript_kKanbun }, // 65,
230 { UnicodeScript_kBopomofoExtended,
231 UnicodeScript_kBopomofoExtended,
232 UnicodeScript_kBopomofoExtended }, // 66,
233 { UnicodeScript_kEnclosedCJKLetterMonth,
234 UnicodeScript_kEnclosedCJKLetterMonth,
235 UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
236 { UnicodeScript_kCJKCompatibility,
237 UnicodeScript_kCJKCompatibility,
238 UnicodeScript_kCJKCompatibility }, // 68,
239 { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
240 UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
241 UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
242 { UnicodeScript_kCJKUnifiedIdeograph,
243 UnicodeScript_kCJKUnifiedIdeograph,
244 UnicodeScript_kCJKUnifiedIdeograph }, // 70,
245 { UnicodeScript_kYiSyllables,
246 UnicodeScript_kYiSyllables,
247 UnicodeScript_kYiSyllables }, // 71,
248 { UnicodeScript_kYiRadicals,
249 UnicodeScript_kYiRadicals,
250 UnicodeScript_kYiRadicals }, // 72,
251 { UnicodeScript_kHangulSyllable,
252 UnicodeScript_kHangulSyllable,
253 UnicodeScript_kHangulSyllable }, // 73,
254 { UnicodeScript_kHighSurrogate,
255 UnicodeScript_kHighSurrogate,
256 UnicodeScript_kHighSurrogate }, // 74,
257 { UnicodeScript_kHighPrivateUseSurrogate,
258 UnicodeScript_kHighPrivateUseSurrogate,
259 UnicodeScript_kHighPrivateUseSurrogate }, // 75,
260 { UnicodeScript_kLowSurrogate,
261 UnicodeScript_kLowSurrogate,
262 UnicodeScript_kLowSurrogate }, // 76,
263 { UnicodeScript_kPrivateUse,
264 UnicodeScript_kPrivateUse,
265 UnicodeScript_kPrivateUse }, // 77,
266 { UnicodeScript_kCJKCompatibilityIdeograph,
267 UnicodeScript_kCJKCompatibilityIdeograph,
268 UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
269 { UnicodeScript_kAlphabeticPresentation,
270 UnicodeScript_kAlphabeticPresentation,
271 UnicodeScript_kAlphabeticPresentation }, // 79,
272 { UnicodeScript_kArabicPresentationA,
273 UnicodeScript_kArabicPresentationA,
274 UnicodeScript_kArabicPresentationA }, // 80,
275 { UnicodeScript_kCombiningHalfMark,
276 UnicodeScript_kCombiningHalfMark,
277 UnicodeScript_kCombiningHalfMark }, // 81,
278 { UnicodeScript_kCJKCompatibilityForm,
279 UnicodeScript_kCJKCompatibilityForm,
280 UnicodeScript_kCJKCompatibilityForm }, // 82,
281 { UnicodeScript_kSmallFormVariant,
282 UnicodeScript_kSmallFormVariant,
283 UnicodeScript_kSmallFormVariant }, // 83,
284 { UnicodeScript_kArabicPresentationB,
285 UnicodeScript_kArabicPresentationB,
286 UnicodeScript_kArabicPresentationB }, // 84,
287 { UnicodeScript_kNoScript,
288 UnicodeScript_kNoScript,
289 UnicodeScript_kNoScript }, // 85,
290 { UnicodeScript_kHalfwidthFullwidthForm,
291 UnicodeScript_kHalfwidthFullwidthForm,
292 UnicodeScript_kHalfwidthFullwidthForm }, // 86,
293 { UnicodeScript_kScriptCount,
294 UnicodeScript_kScriptCount,
295 UnicodeScript_kNoScript } // 87,
296 };
297
298 sal_Int16 SAL_CALL
getUnicodeScriptType(const sal_Unicode ch,ScriptTypeList * typeList,sal_Int16 unknownType)299 unicode::getUnicodeScriptType( const sal_Unicode ch, ScriptTypeList* typeList, sal_Int16 unknownType ) {
300
301 if (!typeList) {
302 typeList = defaultTypeList;
303 unknownType = UnicodeScript_kNoScript;
304 }
305
306 sal_Int16 i = 0, type = typeList[0].to;
307 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
308 type = typeList[++i].to;
309 }
310
311 return (type < UnicodeScript_kScriptCount &&
312 ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
313 typeList[i].value : unknownType;
314 }
315
316 sal_Bool SAL_CALL
isUnicodeScriptType(const sal_Unicode ch,sal_Int16 type)317 unicode::isUnicodeScriptType( const sal_Unicode ch, sal_Int16 type) {
318 return ch >= UnicodeScriptType[type][UnicodeScriptTypeFrom] &&
319 ch <= UnicodeScriptType[type][UnicodeScriptTypeTo];
320 }
321
322 sal_Unicode SAL_CALL
getUnicodeScriptStart(UnicodeScript type)323 unicode::getUnicodeScriptStart( UnicodeScript type) {
324 return UnicodeScriptType[type][UnicodeScriptTypeFrom];
325 }
326
327 sal_Unicode SAL_CALL
getUnicodeScriptEnd(UnicodeScript type)328 unicode::getUnicodeScriptEnd( UnicodeScript type) {
329 return UnicodeScriptType[type][UnicodeScriptTypeTo];
330 }
331
332 sal_Int16 SAL_CALL
getUnicodeType(const sal_Unicode ch)333 unicode::getUnicodeType( const sal_Unicode ch ) {
334 static sal_Unicode c = 0x00;
335 static sal_Int16 r = 0x00;
336
337 if (ch == c) return r;
338 else c = ch;
339
340 sal_Int16 address = UnicodeTypeIndex[ch >> 8];
341 return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
342 UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
343 }
344
345 sal_uInt8 SAL_CALL
getUnicodeDirection(const sal_Unicode ch)346 unicode::getUnicodeDirection( const sal_Unicode ch ) {
347 static sal_Unicode c = 0x00;
348 static sal_uInt8 r = 0x00;
349
350 if (ch == c) return r;
351 else c = ch;
352
353 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
354 return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
355 UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
356
357 }
358
// Single-bit mask for bit position `name`.  The argument is parenthesized so
// that an expression such as bit(a|b) shifts by the whole expression, and the
// literal is unsigned so that bit(0x1f) does not shift into a sign bit.
#define bit(name)   (1u << (name))

// Bit sets over UnicodeType values, meant to be tested against
// bit(getUnicodeType(ch)).  Every multi-term mask is wrapped in parentheses so
// it can be combined with &, ~ or comparisons without precedence surprises.

#define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)

#define DIGITMASK   (bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\
                    bit(UnicodeType::LETTER_NUMBER)|\
                    bit(UnicodeType::OTHER_NUMBER))

#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
                    bit(UnicodeType::MODIFIER_LETTER)|\
                    bit(UnicodeType::OTHER_LETTER))

#define BASEMASK    (DIGITMASK|ALPHAMASK|\
                    bit(UnicodeType::NON_SPACING_MARK)|\
                    bit(UnicodeType::ENCLOSING_MARK)|\
                    bit(UnicodeType::COMBINING_SPACING_MARK))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
                    bit(UnicodeType::LINE_SEPARATOR)|\
                    bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define PUNCTUATIONMASK (bit(UnicodeType::DASH_PUNCTUATION)|\
                    bit(UnicodeType::INITIAL_PUNCTUATION)|\
                    bit(UnicodeType::FINAL_PUNCTUATION)|\
                    bit(UnicodeType::CONNECTOR_PUNCTUATION)|\
                    bit(UnicodeType::OTHER_PUNCTUATION))

#define SYMBOLMASK  (bit(UnicodeType::MATH_SYMBOL)|\
                    bit(UnicodeType::CURRENCY_SYMBOL)|\
                    bit(UnicodeType::MODIFIER_SYMBOL)|\
                    bit(UnicodeType::OTHER_SYMBOL))

#define PRINTMASK   (BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK)

// LINE_SEPARATOR and PARAGRAPH_SEPARATOR are members of both SPACEMASK and
// CONTROLMASK.
#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
                    bit(UnicodeType::FORMAT)|\
                    bit(UnicodeType::LINE_SEPARATOR)|\
                    bit(UnicodeType::PARAGRAPH_SEPARATOR))
401
402 #define IsType(func, mask) \
403 sal_Bool SAL_CALL func( const sal_Unicode ch) {\
404 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
405 }
406
IsType(unicode::isUpper,UPPERMASK)407 IsType(unicode::isUpper, UPPERMASK)
408 IsType(unicode::isLower, LOWERMASK)
409 IsType(unicode::isTitle, DIGITMASK)
410 IsType(unicode::isControl, CONTROLMASK)
411 IsType(unicode::isPrint, PRINTMASK)
412 IsType(unicode::isAlpha, ALPHAMASK)
413 IsType(unicode::isDigit, DIGITMASK)
414 IsType(unicode::isAlphaDigit, ALPHAMASK|DIGITMASK)
415 IsType(unicode::isSpace, SPACEMASK)
416 IsType(unicode::isBase, BASEMASK)
417 IsType(unicode::isPunctuation, PUNCTUATIONMASK)
418
419 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
420 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
421
422 sal_Bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
423 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
424 }
425
getCharType(const sal_Unicode ch)426 sal_Int32 SAL_CALL unicode::getCharType( const sal_Unicode ch )
427 {
428 using namespace ::com::sun::star::i18n::KCharacterType;
429
430 switch ( getUnicodeType( ch ) ) {
431 // Upper
432 case UnicodeType::UPPERCASE_LETTER :
433 return UPPER|LETTER|PRINTABLE|BASE_FORM;
434
435 // Lower
436 case UnicodeType::LOWERCASE_LETTER :
437 return LOWER|LETTER|PRINTABLE|BASE_FORM;
438
439 // Title
440 case UnicodeType::TITLECASE_LETTER :
441 return TITLE_CASE|LETTER|PRINTABLE|BASE_FORM;
442
443 // Letter
444 case UnicodeType::MODIFIER_LETTER :
445 case UnicodeType::OTHER_LETTER :
446 return LETTER|PRINTABLE|BASE_FORM;
447
448 // Digit
449 case UnicodeType::DECIMAL_DIGIT_NUMBER:
450 case UnicodeType::LETTER_NUMBER:
451 case UnicodeType::OTHER_NUMBER:
452 return DIGIT|PRINTABLE|BASE_FORM;
453
454 // Base
455 case UnicodeType::NON_SPACING_MARK:
456 case UnicodeType::ENCLOSING_MARK:
457 case UnicodeType::COMBINING_SPACING_MARK:
458 return BASE_FORM|PRINTABLE;
459
460 // Print
461 case UnicodeType::SPACE_SEPARATOR:
462
463 case UnicodeType::DASH_PUNCTUATION:
464 case UnicodeType::INITIAL_PUNCTUATION:
465 case UnicodeType::FINAL_PUNCTUATION:
466 case UnicodeType::CONNECTOR_PUNCTUATION:
467 case UnicodeType::OTHER_PUNCTUATION:
468
469 case UnicodeType::MATH_SYMBOL:
470 case UnicodeType::CURRENCY_SYMBOL:
471 case UnicodeType::MODIFIER_SYMBOL:
472 case UnicodeType::OTHER_SYMBOL:
473 return PRINTABLE;
474
475 // Control
476 case UnicodeType::CONTROL:
477 case UnicodeType::FORMAT:
478 return CONTROL;
479
480 case UnicodeType::LINE_SEPARATOR:
481 case UnicodeType::PARAGRAPH_SEPARATOR:
482 return CONTROL|PRINTABLE;
483
484 // for all others
485 default:
486 return 0;
487 }
488 }
489
490
491