xref: /aoo4110/main/sal/textenc/unichars.c (revision b1cdbd2c)
/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/
21*b1cdbd2cSJim Jagielski 
22*b1cdbd2cSJim Jagielski 
23*b1cdbd2cSJim Jagielski 
24*b1cdbd2cSJim Jagielski #include "unichars.h"
25*b1cdbd2cSJim Jagielski #include "osl/diagnose.h"
26*b1cdbd2cSJim Jagielski #include "sal/types.h"
27*b1cdbd2cSJim Jagielski 
/* Tells whether nUtf32 is a noncharacter, as of Unicode 3.1.1.

   Returns 1 for values in 0xFDD0..0xFDEF, for values whose low 16 bits are
   0xFFFE or 0xFFFF, and for anything greater than 0x10FFFF; 0 otherwise.
 */
int ImplIsNoncharacter(sal_uInt32 nUtf32)
{
    /* Beyond the last code point. */
    if (nUtf32 > 0x10FFFF)
    {
        return 1;
    }

    /* The contiguous noncharacter block. */
    if (nUtf32 >= 0xFDD0 && nUtf32 <= 0xFDEF)
    {
        return 1;
    }

    /* The last two values of every 0x10000-sized plane. */
    return (nUtf32 & 0xFFFF) >= 0xFFFE;
}
35*b1cdbd2cSJim Jagielski 
/* Tells whether nUtf32 has a General Category of Cc (Other, Control) or
   Cf (Other, Format) according to
   <http://www.unicode.org/Public/UNIDATA/UnicodeData.txt>, Version 3.1.1.

   Returns 1 if nUtf32 lies in one of the inclusive ranges listed below,
   0 otherwise.
 */
int ImplIsControlOrFormat(sal_uInt32 nUtf32)
{
    /* Inclusive [first, last] ranges; runs of adjacent code points are
       merged into a single entry. */
    static const struct
    {
        sal_uInt32 nFirst;
        sal_uInt32 nLast;
    } aRanges[] = {
        { 0x0000, 0x001F },   /* C0 controls */
        { 0x007F, 0x009F },   /* DELETE and C1 controls */
        { 0x070F, 0x070F },   /* SYRIAC ABBREVIATION MARK */
        { 0x180B, 0x180E },   /* MONGOLIAN FREE VARIATION SELECTOR ONE
                                 .. MONGOLIAN VOWEL SEPARATOR */
        { 0x200C, 0x200F },   /* ZERO WIDTH NON-JOINER
                                 .. RIGHT-TO-LEFT MARK */
        { 0x202A, 0x202E },   /* LEFT-TO-RIGHT EMBEDDING
                                 .. RIGHT-TO-LEFT OVERRIDE */
        { 0x206A, 0x206F },   /* INHIBIT SYMMETRIC SWAPPING
                                 .. NOMINAL DIGIT SHAPES */
        { 0xFEFF, 0xFEFF },   /* ZERO WIDTH NO-BREAK SPACE */
        { 0xFFF9, 0xFFFB },   /* INTERLINEAR ANNOTATION ANCHOR
                                 .. INTERLINEAR ANNOTATION TERMINATOR */
        { 0x1D173, 0x1D17A }, /* MUSICAL SYMBOL BEGIN BEAM
                                 .. MUSICAL SYMBOL END PHRASE */
        { 0xE0001, 0xE0001 }, /* LANGUAGE TAG */
        { 0xE0020, 0xE007F }  /* tag characters */
    };
    unsigned int i;

    for (i = 0; i < sizeof aRanges / sizeof aRanges[0]; ++i)
    {
        if (nUtf32 >= aRanges[i].nFirst && nUtf32 <= aRanges[i].nLast)
        {
            return 1;
        }
    }
    return 0;
}
79*b1cdbd2cSJim Jagielski 
/* Tells whether nUtf32 is a high-surrogate (0xD800..0xDBFF), as of
   Unicode 3.1.1.  Returns 1 if so, 0 otherwise.
 */
int ImplIsHighSurrogate(sal_uInt32 nUtf32)
{
    /* 0xD800..0xDBFF is exactly one 1024-aligned block, so comparing the
       block index (value >> 10) is equivalent to the range test. */
    return (nUtf32 >> 10) == (0xD800 >> 10);
}
85*b1cdbd2cSJim Jagielski 
/* Tells whether nUtf32 is a low-surrogate (0xDC00..0xDFFF), as of
   Unicode 3.1.1.  Returns 1 if so, 0 otherwise.
 */
int ImplIsLowSurrogate(sal_uInt32 nUtf32)
{
    /* 0xDC00..0xDFFF is exactly one 1024-aligned block, so comparing the
       block index (value >> 10) is equivalent to the range test. */
    return (nUtf32 >> 10) == (0xDC00 >> 10);
}
91*b1cdbd2cSJim Jagielski 
/* Tells whether nUtf32 has a General Category of Co (Other, Private Use)
   according to <http://www.unicode.org/Public/UNIDATA/UnicodeData.txt>,
   Version 3.1.1.  Returns 1 if so, 0 otherwise.
 */
int ImplIsPrivateUse(sal_uInt32 nUtf32)
{
    /* Private use range within 0x0000..0xFFFF. */
    if (nUtf32 >= 0xE000 && nUtf32 <= 0xF8FF)
    {
        return 1;
    }

    /* 0xF0000..0xFFFFD; the two values following it are excluded. */
    if (nUtf32 >= 0xF0000 && nUtf32 <= 0xFFFFD)
    {
        return 1;
    }

    /* 0x100000..0x10FFFD; the two values following it are excluded. */
    return nUtf32 >= 0x100000 && nUtf32 <= 0x10FFFD;
}
102*b1cdbd2cSJim Jagielski 
/* Tells whether nUtf32 is one of the code points of
   <http://www.unicode.org/Public/UNIDATA/UnicodeData.txt>, Version 3.1.1,
   that have "ZERO WIDTH" in their character name.  Returns 1 if so,
   0 otherwise.
 */
int ImplIsZeroWidth(sal_uInt32 nUtf32)
{
    switch (nUtf32)
    {
    case 0x200B: /* ZERO WIDTH SPACE */
    case 0x200C: /* ZERO WIDTH NON-JOINER */
    case 0x200D: /* ZERO WIDTH JOINER */
    case 0xFEFF: /* ZERO WIDTH NO-BREAK SPACE */
        return 1;
    default:
        return 0;
    }
}
114*b1cdbd2cSJim Jagielski 
/* Computes the high-surrogate of the UTF-16 surrogate pair for nUtf32.

   nUtf32 must lie in the range 0x10000..0x10FFFF.  Both bounds are checked
   through OSL_ENSURE: for a larger value the expression below would yield a
   result outside 0xD800..0xDBFF (e.g. 0x110000 gives 0xDC00), so the
   violation is reported instead of going unnoticed.

   Returns the top ten bits of (nUtf32 - 0x10000), combined with 0xD800.
 */
sal_uInt32 ImplGetHighSurrogate(sal_uInt32 nUtf32)
{
    OSL_ENSURE(nUtf32 >= 0x10000 && nUtf32 <= 0x10FFFF,
               "specification violation");
    return ((nUtf32 - 0x10000) >> 10) | 0xD800;
}
120*b1cdbd2cSJim Jagielski 
/* Computes the low-surrogate of the UTF-16 surrogate pair for nUtf32.

   nUtf32 must lie in the range 0x10000..0x10FFFF.  Both bounds are checked
   through OSL_ENSURE: because of the 0x3FF mask the result always lies in
   0xDC00..0xDFFF, so a value above 0x10FFFF would otherwise produce a
   plausible-looking result that belongs to a different code point.

   Returns the bottom ten bits of (nUtf32 - 0x10000), combined with 0xDC00.
 */
sal_uInt32 ImplGetLowSurrogate(sal_uInt32 nUtf32)
{
    OSL_ENSURE(nUtf32 >= 0x10000 && nUtf32 <= 0x10FFFF,
               "specification violation");
    return ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00;
}
126*b1cdbd2cSJim Jagielski 
/* Combines a surrogate pair into a single UTF-32 value.

   nHigh is expected to be a high-surrogate and nLow a low-surrogate; this
   is checked through OSL_ENSURE.

   Returns the value whose upper ten bits (above the 0x10000 offset) come
   from nHigh and whose lower ten bits come from nLow.
 */
sal_uInt32 ImplCombineSurrogates(sal_uInt32 nHigh, sal_uInt32 nLow)
{
    /* Each surrogate contributes its low ten payload bits. */
    sal_uInt32 nUpperBits = (nHigh & 0x3FF) << 10;
    sal_uInt32 nLowerBits = nLow & 0x3FF;

    OSL_ENSURE(ImplIsHighSurrogate(nHigh) && ImplIsLowSurrogate(nLow),
               "specification violation");
    return (nUpperBits | nLowerBits) + 0x10000;
}
133