1cdf0e10cSrcweir#!/usr/bin/perl
2*7e90fac2SAndrew Rist#**************************************************************
3*7e90fac2SAndrew Rist#
4*7e90fac2SAndrew Rist#  Licensed to the Apache Software Foundation (ASF) under one
5*7e90fac2SAndrew Rist#  or more contributor license agreements.  See the NOTICE file
6*7e90fac2SAndrew Rist#  distributed with this work for additional information
7*7e90fac2SAndrew Rist#  regarding copyright ownership.  The ASF licenses this file
8*7e90fac2SAndrew Rist#  to you under the Apache License, Version 2.0 (the
9*7e90fac2SAndrew Rist#  "License"); you may not use this file except in compliance
10*7e90fac2SAndrew Rist#  with the License.  You may obtain a copy of the License at
11*7e90fac2SAndrew Rist#
12*7e90fac2SAndrew Rist#    http://www.apache.org/licenses/LICENSE-2.0
13*7e90fac2SAndrew Rist#
14*7e90fac2SAndrew Rist#  Unless required by applicable law or agreed to in writing,
15*7e90fac2SAndrew Rist#  software distributed under the License is distributed on an
16*7e90fac2SAndrew Rist#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17*7e90fac2SAndrew Rist#  KIND, either express or implied.  See the License for the
18*7e90fac2SAndrew Rist#  specific language governing permissions and limitations
19*7e90fac2SAndrew Rist#  under the License.
20*7e90fac2SAndrew Rist#
21*7e90fac2SAndrew Rist#**************************************************************
22*7e90fac2SAndrew Rist
23*7e90fac2SAndrew Rist
24cdf0e10cSrcweir
# The following file must be available in a ./input subdir:

# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
#  'modified version="3" date="2001-02-21"'
30cdf0e10cSrcweir
# Name of the encoding.  Its lower-cased form names both the generated .tab
# file and this script's own .pl file, and the mixed-case form is spliced
# into the identifiers of the generated C tables.
$id = 'Gb180302000';
32cdf0e10cSrcweir
# Render a code point for diagnostics: "U+" followed by the value in
# upper-case hexadecimal, zero-padded to at least four digits.
sub printUtf32
{
    my ($code_point) = @_;
    return sprintf("U+%04X", $code_point);
}
38cdf0e10cSrcweir
# Render a GB 18030 byte sequence for diagnostics as upper-case hexadecimal,
# two digits per byte.  The form is picked from the arguments that are
# defined: a defined third argument selects the four-byte form, otherwise a
# defined second argument selects the two-byte form, otherwise one byte.
sub printGb
{
    my ($b1, $b2, $b3, $b4) = @_;

    return sprintf("%02X%02X%02X%02X", $b1, $b2, $b3, $b4) if defined($b3);
    return sprintf("%02X%02X", $b1, $b2) if defined($b2);
    return sprintf("%02X", $b1);
}
54cdf0e10cSrcweir
# Totals accumulated while the input is parsed; all start at zero.
#   $gb_map_2_count   entries stored in @gb_map_2
#   $gb_map_4_count   four-byte codes that are listed individually
#   $gb_map_4_ranges  four-byte codes that are covered by kept ranges
#   $gb_map_4_max     greatest linear four-byte code seen so far
#   $uni_map_count    code points stored in @uni_map
#   $range_count      ranges recorded in the @range_* arrays
($gb_map_2_count,
 $gb_map_4_count,
 $gb_map_4_ranges,
 $gb_map_4_max,
 $uni_map_count,
 $range_count) = (0) x 6;
62cdf0e10cSrcweir
# Convert a four-byte GB 18030 code to its linear index, with 81 30 81 30
# as index zero.  The place values 12600, 1260, 10 and 1 correspond to 126
# possible values for the first and third byte (0x81..0xFE) and 10 for the
# second and fourth byte (0x30..0x39).
#
# Dies if the first or third byte lies outside 0x81..0xFE, which would
# otherwise produce a negative or colliding index.  The second and fourth
# bytes are not checked here because the calling patterns only let
# 0x30..0x39 through.
sub gb4ToLinear
{
    my ($b1, $b2, $b3, $b4) = @_;

    ($b1 >= 0x81 && $b1 <= 0xFE && $b3 >= 0x81 && $b3 <= 0xFE)
        or die "Bad byte sequence " . printGb($b1, $b2, $b3, $b4);

    return ($b1 - 0x81) * 12600
               + ($b2 - 0x30) * 1260
               + ($b3 - 0x81) * 10
               + ($b4 - 0x30);
}

# Read input/gb-18030-2000.xml line by line and fill in
#   @gb_map_2  linear two-byte code  -> code point
#   @gb_map_4  linear four-byte code -> code point (individual mappings only)
#   @uni_map   code point -> GB bytes packed big-endian into one integer
#   @range_uni_first, @range_uni_last, @range_linear_first,
#   @range_linear_last  one entry per <range> element that is kept
# together with the counters that describe them.  Any <a> or <range> element
# that does not have exactly the expected shape aborts the run.
if (1)
{
    $filename = "gb-18030-2000.xml";
    my $path = "input/" . $filename;
    open(IN, "< " . $path) or die "Cannot read " . $path . ": " . $!;
    while (<IN>)
    {
        # Accept input saved with CRLF line ends; the patterns below anchor
        # directly on the closing "/>".
        s/\r$//;

        if (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
        {
            # Single byte 0x00..0x7F: must map to the identical code point;
            # nothing is stored for it.
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            ($utf32 == $gb1)
                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
        {
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            my $gb2 = hex($3);

            # The pattern is slightly too wide: reject lead bytes 0x80 and
            # 0xFF and trail bytes 0x7F and 0xFF, which have no slot in the
            # 190-per-lead-byte layout used below.
            ($gb1 >= 0x81 && $gb1 <= 0xFE && $gb2 != 0x7F && $gb2 != 0xFF)
                or die "Bad byte sequence " . printGb($gb1, $gb2);

            # Trail bytes 0x40..0x7E take slots 0..62 and 0x80..0xFE take
            # slots 63..189.
            my $gb_code = ($gb1 - 0x81) * 190
                              + ($gb2 <= 0x7E
                                     ? $gb2 - 0x40 : $gb2 - 0x80 + 63);
            !defined($gb_map_2[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 8 | $gb2;
            ++$uni_map_count;
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
        {
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            my $gb2 = hex($3);
            my $gb3 = hex($4);
            my $gb4 = hex($5);
            my $gb_code = gb4ToLinear($gb1, $gb2, $gb3, $gb4);
            !defined($gb_map_4[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
            ++$uni_map_count;
        }
        elsif (/<a /)
        {
            # An <a> element that matched none of the shapes above.
            die "Bad format";
        }
        elsif (/^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
        {
            my $utf32_first = hex($1);
            my $utf32_last = hex($2);
            my $linear_first
                = gb4ToLinear(hex($3), hex($4), hex($5), hex($6));
            my $linear_last
                = gb4ToLinear(hex($7), hex($8), hex($9), hex($10));

            # Both sides of a range must be non-empty and equally long.
            ($utf32_first <= $utf32_last
             && $utf32_last - $utf32_first == $linear_last - $linear_first)
                or die "Bad range";

            # The one range running from linear 189000 (90 30 81 30) to
            # 1237575 (E3 32 9A 35) is left out of the tables.
            # NOTE(review): presumably this is the range for code points
            # above U+FFFF, which the generated tables do not cover --
            # confirm against the input file.
            if ($linear_first != 189000 || $linear_last != 1237575)
            {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is widened to U+DFFF on the
                # Unicode side only, so that the later walk over all code
                # points steps across U+D800..U+DFFF, which have no mapping.
                $range_uni_last[$range_count]
                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last
                    if ($linear_last > $gb_map_4_max);
            }
        }
        elsif (/<range /)
        {
            # A <range> element that did not have the expected shape.
            die "Bad format";
        }
    }
    close IN;
}
161cdf0e10cSrcweir
# Report the totals gathered from the input, then make sure they are
# consistent before any table is generated.
printf "gb_map_2_count = %s, gb_map_4_count = %s, gb_map_4_ranges = %s,"
           . " gb_map_4_max = %s, uni_map_count = %s\n",
       $gb_map_2_count,
       $gb_map_4_count,
       $gb_map_4_ranges,
       $gb_map_4_max,
       $uni_map_count;

# The two-byte table must hold exactly 23940 entries.
die "Bad gb_map_2_count != 23940" unless $gb_map_2_count == 23940;

# Individually mapped codes and range-covered codes together must fill the
# linear four-byte space from 0 up to the greatest code seen, without gaps.
die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges"
    unless $gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1;

# Every code point from U+0080 to U+FFFF, not counting U+D800..U+DFFF, must
# be either mapped individually or covered by a range.
die "Bad uni_map_count"
    unless $uni_map_count + $gb_map_4_ranges
               == 0x10000 - (0xE000 - 0xD800) - 0x80;
172cdf0e10cSrcweir
# Append the individually mapped four-byte codes to @gb_map_2, in linear
# order, skipping over each range in one step.  $gb_nonrangedataindex[0] is
# the @gb_map_2 position where the four-byte data starts, and
# $gb_nonrangedataindex[$k] for $k > 0 is the position of the first entry
# appended after range $k - 1.
#
# Every linear code that has no individual mapping must be the first code of
# the next recorded range, with ranges taken in recorded order; anything
# else is rejected as bad input.
$range_index = 0;
$gb_nonrangedataindex[$range_index] = $gb_map_2_count;
# $gb_map_4_max is the greatest linear code itself, so the bound is
# inclusive; otherwise a final individually mapped code would be dropped.
for ($gb_code = 0; $gb_code <= $gb_map_4_max; ++$gb_code)
{
    if (defined($gb_map_4[$gb_code]))
    {
        $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
    }
    else
    {
        # Do not read past the recorded ranges: an undefined entry would
        # otherwise be compared as if it were zero.
        ($range_index < $range_count
         && $gb_code == $range_linear_first[$range_index])
            or die "Bad input";
        # Jump to the last code of the range; the loop increment then moves
        # on to the first code after it.
        $gb_code = $range_linear_last[$range_index];
        ++$range_index;
        $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
    }
}
($range_index == $range_count) or die "Bad input";
190cdf0e10cSrcweir
# Create the output file <lower-cased id>.tab in the current directory.
$filename = lc($id) . ".tab";
open(OUT, "> " . $filename)
    or die "Cannot write " . $filename . ": " . $!;

# Copy the leading comment block of this script's own source file,
# <lower-cased id>.pl in the current directory, to the output as a C
# comment:
#   "#!..." lines are skipped;
#   the first "#*..." line is written as "/*...";
#   each later "#*..." line is written with a leading blank and with its
#     last character replaced by "/";
#   any other "#" line is written as " *" plus its text, with one blank
#     after the "#" dropped if present.
# Copying stops at the first line that does not start with "#".
{
    $filename = lc($id) . ".pl";
    open(IN, "< " . $filename)
        or die "Cannot read " . $filename . ": " . $!;
    my $first = 1;
    while (<IN>)
    {
        if (/^\#!.*$/)
        {
            # Interpreter line: nothing to copy.
        }
        elsif (/^\#(\*.*)$/)
        {
            if ($first == 1)
            {
                print OUT "/", $1, "\n";
                $first = 0;
            }
            else
            {
                print OUT " ", substr($1, 0, length($1) - 1), "/\n";
            }
        }
        elsif (/^\# (.*)$/)
        {
            print OUT " *", $1, "\n";
        }
        elsif (/^\#(.*)$/)
        {
            print OUT " *", $1, "\n";
        }
        else
        {
            last;
        }
    }
    # The rest of the script is not needed; release the handle.
    close IN;
}
230cdf0e10cSrcweir
# Write the guarded #include lines that follow the header comment in the
# generated file, preceded and followed by an empty line.
my @include_lines = (
    "\n",
    "#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H\n",
    "#include \"convertgb18030.h\"\n",
    "#endif\n",
    "\n",
    "#ifndef _SAL_TYPES_H_\n",
    "#include \"sal/types.h\"\n",
    "#endif\n",
    "\n",
);
print OUT @include_lines;
240cdf0e10cSrcweir
# Write all of @gb_map_2 (the two-byte table followed by the appended
# four-byte data) as a C array of sal_Unicode, eight values per line.
print OUT "static sal_Unicode const aImpl" . $id . "ToUnicodeData[] = {\n  ";
my $last_slot = $gb_map_2_count - 1;
foreach my $slot (0 .. $last_slot)
{
    print OUT sprintf("0x%04X,", $gb_map_2[$slot]);
    # Break after every eighth value, except after the very last one.
    print OUT "\n  " if $slot % 8 == 7 && $slot != $last_slot;
}
print OUT "\n};\n\n";
251cdf0e10cSrcweir
# Write one C initializer per recorded range: the position in the data
# table noted for that range, the first linear four-byte code, one past the
# last linear code, and the first code point.  A final entry starting with
# -1 closes the array.
print OUT "static ImplGb180302000ToUnicodeRange const\n    aImpl"
              . $id
              . "ToUnicodeRanges[] = {\n";
foreach my $entry (0 .. $range_count - 1)
{
    my $data_index = $gb_nonrangedataindex[$entry];
    my $linear_begin = $range_linear_first[$entry];
    my $linear_end = $range_linear_last[$entry] + 1;
    my $unicode_begin = $range_uni_first[$entry];
    print OUT sprintf("  { %d, %d, %d, 0x%04X },\n",
                      $data_index,
                      $linear_begin,
                      $linear_end,
                      $unicode_begin);
}
print OUT "  { -1, 0, 0, 0 }\n};\n\n";
264cdf0e10cSrcweir
# Write the packed GB byte values of all individually mapped code points
# from U+0080 to U+FFFF as a C array of sal_uInt32, six values per line.
# A code point without an individual mapping must be the first code point
# of the next recorded range; the walk then continues after that range.
# $uni_nonrangedataindex[0] is 0, and $uni_nonrangedataindex[$k] for $k > 0
# is the number of values written before range $k - 1 was stepped over.
print OUT "static sal_uInt32 const aImplUnicodeTo" . $id . "Data[] = {\n  ";
$index = 0;
$range_index = 0;
$uni_nonrangedataindex[$range_index] = $index;
$utf32 = 0x80;
while ($utf32 <= 0xFFFF)
{
    my $bytes = $uni_map[$utf32];
    if (defined($bytes))
    {
        # Start a new line before the 7th, 13th, ... value.
        print OUT "\n  " if $index > 0 && $index % 6 == 0;
        # Two-byte values are padded with blanks to the width of the
        # four-byte ones.
        my $format = $bytes <= 0xFFFF ? "    0x%04X," : "0x%08X,";
        printf OUT $format, $bytes;
        ++$index;
    }
    else
    {
        die "Bad input" unless $utf32 == $range_uni_first[$range_index];
        $utf32 = $range_uni_last[$range_index];
        ++$range_index;
        $uni_nonrangedataindex[$range_index] = $index;
    }
    ++$utf32;
}
die "Bad input" unless $range_index == $range_count;
print OUT "\n};\n\n";
291cdf0e10cSrcweir
# Write one C initializer per recorded range: the position in the
# Unicode-ordered data table noted for that range, the first and last code
# point of the range, and its first linear four-byte code.  Unlike the
# ToUnicodeRanges array, this one gets no closing sentinel entry.
print OUT "static ImplUnicodeToGb180302000Range const\n    aImplUnicodeTo",
          $id,
          "Ranges[] = {\n";
for ($range_index = 0; $range_index < $range_count; ++$range_index)
{
    printf OUT "  { %d, 0x%04X, 0x%04X, %d },\n",
               $uni_nonrangedataindex[$range_index],
               $range_uni_first[$range_index],
               $range_uni_last[$range_index],
               $range_linear_first[$range_index];
}
print OUT "};\n";

# Buffered output is flushed here, so a failed write (for example a full
# disk) may only show up as a failed close; do not let it pass silently.
close(OUT) or die "Cannot write " . lc($id) . ".tab: " . $!;
306