#!/usr/bin/perl
#**************************************************************
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#**************************************************************

# NOTE: the comment block above is copied into the generated .tab file
# (see the "Copy the leading comment block" step below, which reads
# lc($id) . ".pl" -- presumably this very script -- up to the first line
# that does not start with '#').  Keep the blank line above this note, and
# do not add comments above it, or they will end up in the generated file.

# The following files must be available in a ./input subdir:

# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
# "modified version="3" date="2001-02-21""

# Purpose: read input/gb-18030-2000.xml, validate its <a .../> and
# <range .../> entries, and write <lc($id)>.tab containing four C tables:
#   aImpl<id>ToUnicodeData    - two-byte table followed by the individually
#                               listed four-byte mappings
#   aImpl<id>ToUnicodeRanges  - the algorithmic four-byte ranges
#   aImplUnicodeTo<id>Data    - byte sequences for individually listed
#                               code points, in code point order
#   aImplUnicodeTo<id>Ranges  - the same ranges, keyed by code point

use strict;
use warnings;

my $id = "Gb180302000";

# Format a code point as "U+XXXX" for diagnostics.
sub printUtf32
{
    my $utf32 = $_[0];
    return sprintf("U+%04X", $utf32);
}

# Format a one-, two- or four-byte sequence as hex digits for diagnostics.
# The length is inferred from which arguments are defined.
sub printGb
{
    if (defined($_[2]))
    {
        return sprintf("%02X%02X%02X%02X", $_[0], $_[1], $_[2], $_[3]);
    }
    elsif (defined($_[1]))
    {
        return sprintf("%02X%02X", $_[0], $_[1]);
    }
    else
    {
        return sprintf("%02X", $_[0]);
    }
}

# Map a four-byte sequence to its linear index: bytes one and three are
# counted from 0x81, bytes two and four from 0x30.
sub linearGb4
{
    my ($b1, $b2, $b3, $b4) = @_;
    return ($b1 - 0x81) * 12600
        + ($b2 - 0x30) * 1260
        + ($b3 - 0x81) * 10
        + ($b4 - 0x30);
}

# Map a two-byte sequence to its linear index: 190 trail positions per lead
# byte, with trail bytes 0x40..0x7E first and 0x80 and above following.
sub linearGb2
{
    my ($b1, $b2) = @_;
    return ($b1 - 0x81) * 190
        + ($b2 <= 0x7E ? $b2 - 0x40 : $b2 - 0x80 + 63);
}

my @gb_map_2;               # linear two-byte index  -> code point
my @gb_map_4;               # linear four-byte index -> code point
my @uni_map;                # code point -> packed byte sequence
my @range_uni_first;
my @range_uni_last;
my @range_linear_first;
my @range_linear_last;

my $gb_map_2_count = 0;
my $gb_map_4_count = 0;
my $gb_map_4_ranges = 0;    # total number of codes covered by ranges
my $gb_map_4_max = 0;       # highest linear four-byte index seen
my $uni_map_count = 0;

my $range_count = 0;

# ---- Parse the mapping file -------------------------------------------
{
    my $filename = "gb-18030-2000.xml";
    open my $in, '<', "input/" . $filename
        or die "Cannot read " . $filename . ": $!";
    while (my $line = <$in>)
    {
        if ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
        {
            # Single byte: must map to the identical code point.
            my ($utf32, $gb1) = map { hex } ($1, $2);
            ($utf32 == $gb1)
                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        elsif ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
        {
            my ($utf32, $gb1, $gb2) = map { hex } ($1, $2, $3);
            my $gb_code = linearGb2($gb1, $gb2);
            !defined($gb_map_2[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = ($gb1 << 8) | $gb2;
            ++$uni_map_count;
        }
        elsif ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
        {
            my ($utf32, $gb1, $gb2, $gb3, $gb4)
                = map { hex } ($1, $2, $3, $4, $5);
            my $gb_code = linearGb4($gb1, $gb2, $gb3, $gb4);
            !defined($gb_map_4[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32]
                = ($gb1 << 24) | ($gb2 << 16) | ($gb3 << 8) | $gb4;
            ++$uni_map_count;
        }
        elsif ($line =~ /<a /)
        {
            die "Bad format";
        }
        elsif ($line =~ /^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
        {
            my ($utf32_first, $utf32_last, @bytes)
                = map { hex } ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10);
            my $linear_first = linearGb4(@bytes[0 .. 3]);
            my $linear_last = linearGb4(@bytes[4 .. 7]);
            ($utf32_last - $utf32_first == $linear_last - $linear_first)
                or die "Bad range";
            # The range with linear span 189000..1237575 (bytes
            # 90 30 81 30 .. E3 32 9A 35) is deliberately not recorded;
            # presumably it is the supplementary-plane range handled
            # elsewhere -- TODO confirm against the C converter.
            if ($linear_first != 189000 || $linear_last != 1237575)
            {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is extended through U+DFFF so
                # that the walk over U+0080..U+FFFF below steps over
                # U+D800..U+DFFF, which have no mapping.
                $range_uni_last[$range_count]
                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last
                    if ($linear_last > $gb_map_4_max);
            }
        }
        elsif ($line =~ /<range /)
        {
            die "Bad format";
        }
    }
    close $in;
}

# ---- Consistency checks -----------------------------------------------
print "gb_map_2_count = ", $gb_map_2_count,
      ", gb_map_4_count = ", $gb_map_4_count,
      ", gb_map_4_ranges = ", $gb_map_4_ranges,
      ", gb_map_4_max = ", $gb_map_4_max,
      ", uni_map_count = ", $uni_map_count, "\n";
($gb_map_2_count == 23940) or die "Bad gb_map_2_count != 23940";
($gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1)
    or die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges";
($uni_map_count + $gb_map_4_ranges == 0x10000 - (0xE000 - 0xD800) - 0x80)
    or die "Bad uni_map_count";

# ---- Append the individually listed four-byte mappings to @gb_map_2 ----
# $gb_nonrangedataindex[$i] records where in @gb_map_2 the data following
# the first $i ranges begins.
my @gb_nonrangedataindex;
my $range_index = 0;
$gb_nonrangedataindex[$range_index] = $gb_map_2_count;
# $gb_map_4_max is itself a valid index (see the check above), so the bound
# is inclusive; with '<' a final singly-mapped code would be dropped.
for (my $gb_code = 0; $gb_code <= $gb_map_4_max; ++$gb_code)
{
    if (defined($gb_map_4[$gb_code]))
    {
        $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
    }
    else
    {
        # Every gap must be exactly the start of the next recorded range.
        ($range_index < $range_count
            && $gb_code == $range_linear_first[$range_index])
            or die "Bad input";
        $gb_code = $range_linear_last[$range_index];
        ++$range_index;
        $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
    }
}
($range_index == $range_count) or die "Bad input";

# ---- Write the output file --------------------------------------------
my $tab_name = lc($id) . ".tab";
open my $out, '>', $tab_name or die "Cannot write " . $tab_name . ": $!";

# Copy the leading comment block of the .pl file as a C comment.
{
    my $script_name = lc($id) . ".pl";
    open my $self, '<', $script_name
        or die "Cannot read " . $script_name . ": $!";
    my $first = 1;
    while (my $line = <$self>)
    {
        if ($line =~ /^\#!.*$/)
        {
            # The interpreter line is not copied.
        }
        elsif ($line =~ /^\#(\*.*)$/)
        {
            if ($first == 1)
            {
                # Opening rule: "#***" becomes "/***".
                print {$out} "/", $1, "\n";
                $first = 0;
            }
            else
            {
                # Closing rule: drop the last '*' and terminate with "/".
                print {$out} " ", substr($1, 0, length($1) - 1), "/\n";
            }
        }
        elsif ($line =~ /^\# (.*)$/)
        {
            print {$out} " *", $1, "\n";
        }
        elsif ($line =~ /^\#(.*)$/)
        {
            print {$out} " *", $1, "\n";
        }
        else
        {
            # First non-comment line ends the header.
            last;
        }
    }
    close $self;
}

print {$out} "\n",
    "#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H\n",
    "#include \"convertgb18030.h\"\n",
    "#endif\n",
    "\n",
    "#ifndef _SAL_TYPES_H_\n",
    "#include \"sal/types.h\"\n",
    "#endif\n",
    "\n";

print {$out} "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n ";
for (my $gb_code = 0; $gb_code < $gb_map_2_count; ++$gb_code)
{
    printf {$out} "0x%04X,", $gb_map_2[$gb_code];
    # Eight entries per line, without a dangling indent after the last one.
    if ($gb_code % 8 == 7 && $gb_code != $gb_map_2_count - 1)
    {
        print {$out} "\n ";
    }
}
print {$out} "\n};\n\n";

print {$out} "static ImplGb180302000ToUnicodeRange const\n aImpl",
    $id,
    "ToUnicodeRanges[] = {\n";
for ($range_index = 0; $range_index < $range_count; ++$range_index)
{
    # The third field is one past the last linear index of the range.
    printf {$out} " { %d, %d, %d, 0x%04X },\n",
        $gb_nonrangedataindex[$range_index],
        $range_linear_first[$range_index],
        $range_linear_last[$range_index] + 1,
        $range_uni_first[$range_index];
}
print {$out} " { -1, 0, 0, 0 }\n};\n\n";

print {$out} "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n ";
my @uni_nonrangedataindex;
my $index = 0;
$range_index = 0;
$uni_nonrangedataindex[$range_index] = $index;
for (my $utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32)
{
    if (defined($uni_map[$utf32]))
    {
        # Six entries per line; the break is written before the next entry.
        if ($index > 0 && ($index - 1) % 6 == 5)
        {
            print {$out} "\n ";
        }
        my $bytes = $uni_map[$utf32];
        my $format = ($bytes <= 0xFFFF ? " 0x%04X," : "0x%08X,");
        printf {$out} $format, $bytes;
        ++$index;
    }
    else
    {
        # Every unmapped code point must be the start of the next range.
        ($range_index < $range_count
            && $utf32 == $range_uni_first[$range_index])
            or die "Bad input";
        $utf32 = $range_uni_last[$range_index];
        ++$range_index;
        $uni_nonrangedataindex[$range_index] = $index;
    }
}
($range_index == $range_count) or die "Bad input";
print {$out} "\n};\n\n";

print {$out} "static ImplUnicodeToGb180302000Range const\n aImplUnicodeTo",
    $id,
    "Ranges[] = {\n";
for ($range_index = 0; $range_index < $range_count; ++$range_index)
{
    printf {$out} " { %d, 0x%04X, 0x%04X, %d },\n",
        $uni_nonrangedataindex[$range_index],
        $range_uni_first[$range_index],
        $range_uni_last[$range_index],
        $range_linear_first[$range_index];
}
print {$out} "};\n";

# Buffered write errors only surface at close, so check it.
close $out or die "Cannot write " . $tab_name . ": $!";