xref: /aoo4110/main/sal/textenc/unichars.c (revision b1cdbd2c)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include "unichars.h"
25 #include "osl/diagnose.h"
26 #include "sal/types.h"
27 
int ImplIsNoncharacter(sal_uInt32 nUtf32)
{
    /* Noncharacter code points, as of Unicode 3.1.1.  Returns 1 for a
       noncharacter and 0 otherwise.
     */

    /* Anything beyond the last code point 0x10FFFF. */
    if (nUtf32 > 0x10FFFF)
    {
        return 1;
    }

    /* Values whose low 16 bits are 0xFFFE or 0xFFFF. */
    if ((nUtf32 & 0xFFFF) >= 0xFFFE)
    {
        return 1;
    }

    /* The contiguous block 0xFDD0..0xFDEF. */
    if (nUtf32 >= 0xFDD0 && nUtf32 <= 0xFDEF)
    {
        return 1;
    }

    return 0;
}
35 
int ImplIsControlOrFormat(sal_uInt32 nUtf32)
{
    /* All code points of <http://www.unicode.org/Public/UNIDATA/
       UnicodeData.txt>, Version 3.1.1, that have a General Category of Cc
       (Other, Control) or Cf (Other, Format), stored as inclusive
       [nFirst, nLast] ranges.  Adjacent code points are merged into a single
       range.  The leading range 0x0000..0x001F is tested separately below.
     */
    static const struct
    {
        sal_uInt32 nFirst;
        sal_uInt32 nLast;
    } aRanges[] = {
        { 0x007F, 0x009F },
        { 0x070F, 0x070F },   /* SYRIAC ABBREVIATION MARK */
        { 0x180B, 0x180E },   /* MONGOLIAN FREE VARIATION SELECTOR ONE ..
                                 MONGOLIAN VOWEL SEPARATOR */
        { 0x200C, 0x200F },   /* ZERO WIDTH NON-JOINER .. RIGHT-TO-LEFT MARK */
        { 0x202A, 0x202E },   /* LEFT-TO-RIGHT EMBEDDING ..
                                 RIGHT-TO-LEFT OVERRIDE */
        { 0x206A, 0x206F },   /* INHIBIT SYMMETRIC SWAPPING ..
                                 NOMINAL DIGIT SHAPES */
        { 0xFEFF, 0xFEFF },   /* ZERO WIDTH NO-BREAK SPACE */
        { 0xFFF9, 0xFFFB },   /* INTERLINEAR ANNOTATION ANCHOR ..
                                 INTERLINEAR ANNOTATION TERMINATOR */
        { 0x1D173, 0x1D17A }, /* MUSICAL SYMBOL BEGIN BEAM ..
                                 MUSICAL SYMBOL END PHRASE */
        { 0xE0001, 0xE0001 }, /* LANGUAGE TAG */
        { 0xE0020, 0xE007F }
    };
    const unsigned int nCount
        = (unsigned int) (sizeof aRanges / sizeof aRanges[0]);
    unsigned int nIndex;

    /* Checked outside the table so that no "unsigned >= 0" comparison is
       needed for the lower bound.
     */
    if (nUtf32 <= 0x001F)
    {
        return 1;
    }

    for (nIndex = 0; nIndex < nCount; ++nIndex)
    {
        if (nUtf32 >= aRanges[nIndex].nFirst
            && nUtf32 <= aRanges[nIndex].nLast)
        {
            return 1;
        }
    }

    return 0;
}
79 
int ImplIsHighSurrogate(sal_uInt32 nUtf32)
{
    /* High-surrogates, as of Unicode 3.1.1, are the 0x400 code points
       0xD800..0xDBFF.  These are exactly the values that share the bits
       above the low ten with 0xD800, so comparing the shifted values is
       equivalent to the two-sided range test.
     */
    return (nUtf32 >> 10) == (0xD800 >> 10);
}
85 
int ImplIsLowSurrogate(sal_uInt32 nUtf32)
{
    /* Low-surrogates, as of Unicode 3.1.1, are the 0x400 code points
       0xDC00..0xDFFF.  These are exactly the values that share the bits
       above the low ten with 0xDC00, so comparing the shifted values is
       equivalent to the two-sided range test.
     */
    return (nUtf32 >> 10) == (0xDC00 >> 10);
}
91 
int ImplIsPrivateUse(sal_uInt32 nUtf32)
{
    /* All code points of <http://www.unicode.org/Public/UNIDATA/
       UnicodeData.txt>, Version 3.1.1, that have a General Category of Co
       (Other, Private Use).  Returns 1 for such a code point, 0 otherwise.
     */

    /* First range: 0xE000..0xF8FF. */
    if (nUtf32 >= 0xE000 && nUtf32 <= 0xF8FF)
    {
        return 1;
    }

    /* Second range: 0xF0000..0xFFFFD. */
    if (nUtf32 >= 0xF0000 && nUtf32 <= 0xFFFFD)
    {
        return 1;
    }

    /* Third range: 0x100000..0x10FFFD. */
    if (nUtf32 >= 0x100000 && nUtf32 <= 0x10FFFD)
    {
        return 1;
    }

    return 0;
}
102 
int ImplIsZeroWidth(sal_uInt32 nUtf32)
{
    /* All code points of <http://www.unicode.org/Public/UNIDATA/
       UnicodeData.txt>, Version 3.1.1, that have "ZERO WIDTH" in their
       Character name.  Returns 1 for such a code point, 0 otherwise.
     */
    switch (nUtf32)
    {
    case 0x200B: /* ZERO WIDTH SPACE */
    case 0x200C: /* ZERO WIDTH NON-JOINER */
    case 0x200D: /* ZERO WIDTH JOINER */
    case 0xFEFF: /* ZERO WIDTH NO-BREAK SPACE */
        return 1;

    default:
        return 0;
    }
}
114 
sal_uInt32 ImplGetHighSurrogate(sal_uInt32 nUtf32)
{
    /* Returns the high-surrogate of the surrogate pair that encodes nUtf32.

       nUtf32 must lie in 0x10000..0x10FFFF.  The upper bound is part of the
       precondition: for a larger value, (nUtf32 - 0x10000) >> 10 no longer
       fits into ten bits, so the result can fall outside 0xD800..0xDBFF.
       The precondition is only checked through OSL_ENSURE; the computation
       itself is not altered for out-of-range input.
     */
    OSL_ENSURE(nUtf32 >= 0x10000 && nUtf32 <= 0x10FFFF,
               "specification violation");
    return ((nUtf32 - 0x10000) >> 10) | 0xD800;
}
120 
sal_uInt32 ImplGetLowSurrogate(sal_uInt32 nUtf32)
{
    /* Returns the low-surrogate of the surrogate pair that encodes nUtf32:
       the low ten bits of (nUtf32 - 0x10000), combined with 0xDC00.

       nUtf32 must lie in 0x10000..0x10FFFF.  A value above 0x10FFFF has no
       surrogate pair encoding, so it is rejected by the precondition check
       as well.  The precondition is only checked through OSL_ENSURE; the
       computation itself is not altered for out-of-range input.
     */
    OSL_ENSURE(nUtf32 >= 0x10000 && nUtf32 <= 0x10FFFF,
               "specification violation");
    return ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00;
}
126 
sal_uInt32 ImplCombineSurrogates(sal_uInt32 nHigh, sal_uInt32 nLow)
{
    /* Combines a high-surrogate and a low-surrogate into the code point
       they encode.  The low ten bits of nHigh become the upper ten bits of
       the offset, the low ten bits of nLow become the lower ten bits, and
       0x10000 is added to the result.
     */
    sal_uInt32 nOffset;

    OSL_ENSURE(ImplIsHighSurrogate(nHigh) && ImplIsLowSurrogate(nLow),
               "specification violation");

    nOffset = (nHigh & 0x3FF) << 10;
    nOffset |= nLow & 0x3FF;
    return nOffset + 0x10000;
}
133