1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 // MARKER(update_precomp.py): autogen include statement, do not remove 29 #include "precompiled_i18npool.hxx" 30 31 #include <stdio.h> 32 #include <string.h> 33 #include <stdlib.h> 34 #include <sal/main.h> 35 #include <sal/types.h> 36 #include <rtl/strbuf.hxx> 37 #include <rtl/ustring.hxx> 38 39 #include <vector> 40 41 using namespace ::rtl; 42 43 void make_hhc_char(FILE *sfp, FILE *cfp); 44 void make_stc_char(FILE *sfp, FILE *cfp); 45 void make_stc_word(FILE *sfp, FILE *cfp); 46 47 /* Main Procedure */ 48 49 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) 50 { 51 FILE *sfp, *cfp; 52 53 if (argc < 4) exit(-1); 54 55 56 sfp = fopen(argv[2], "rb"); // open the source file for read; 57 if (sfp == NULL) 58 { 59 printf("Open the dictionary source file failed."); 60 return -1; 61 } 62 63 // create the C source file to write 64 cfp = fopen(argv[3], "wb"); 65 if (cfp == NULL) { 66 fclose(sfp); 67 printf("Can't create the C source file."); 68 return -1; 69 } 70 71 fprintf(cfp, "/*\n"); 72 fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n"); 73 fprintf(cfp, " * All Rights Reserved.\n"); 74 fprintf(cfp, " */\n\n"); 75 fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!! */\n\n"); 76 fprintf(cfp, "#include <sal/types.h>\n"); 77 fprintf(cfp, "#include <textconversion.hxx>\n"); 78 fprintf(cfp, "\nextern \"C\" {\n"); 79 80 if (strcmp(argv[1], "hhc_char") == 0) 81 make_hhc_char(sfp, cfp); 82 else if (strcmp(argv[1], "stc_char") == 0) 83 make_stc_char(sfp, cfp); 84 else if (strcmp(argv[1], "stc_word") == 0) 85 make_stc_word(sfp, cfp); 86 87 fprintf (cfp, "}\n"); 88 89 fclose(sfp); 90 fclose(cfp); 91 92 return 0; 93 } // end of main 94 95 // Hangul/Hanja character conversion 96 void make_hhc_char(FILE *sfp, FILE *cfp) 97 { 98 sal_Int32 count, address, i, j, k; 99 sal_Unicode Hanja2HangulData[0x10000]; 100 for (i = 0; i < 0x10000; i++) { 101 Hanja2HangulData[i] = 0; 102 } 103 sal_uInt16 Hangul2HanjaData[10000][3]; 104 105 // generate main dict. data array 106 fprintf(cfp, "\nstatic const sal_Unicode Hangul2HanjaData[] = {"); 107 108 sal_Char Cstr[1024]; 109 count = 0; 110 address = 0; 111 while (fgets(Cstr, 1024, sfp)) { 112 // input file is in UTF-8 encoding (Hangul:Hanja) 113 // don't convert last new line character to Ostr. 114 OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8); 115 const sal_Unicode *Ustr = Ostr.getStr(); 116 sal_Int32 len = Ostr.getLength(); 117 118 Hangul2HanjaData[count][0] = Ustr[0]; 119 Hangul2HanjaData[count][1] = sal::static_int_cast<sal_uInt16>( address ); 120 Hangul2HanjaData[count][2] = sal::static_int_cast<sal_uInt16>( len - 2 ); 121 count++; 122 123 for (i = 2; i < len; i++) { 124 Hanja2HangulData[Ustr[i]] = Ustr[0]; 125 if (address++ % 16 == 0) 126 fprintf(cfp, "\n\t"); 127 fprintf(cfp, "0x%04x, ", Ustr[i]); 128 } 129 } 130 fprintf(cfp, "\n};\n"); 131 132 fprintf(cfp, "\nstatic const com::sun::star::i18n::Hangul_Index Hangul2HanjaIndex[] = {\n"); 133 for (i = 0; i < count; i++) 134 fprintf(cfp, "\t{ 0x%04x, 0x%04x, 0x%02x },\n", 135 Hangul2HanjaData[i][0], 136 Hangul2HanjaData[i][1], 137 Hangul2HanjaData[i][2]); 138 fprintf(cfp, "};\n"); 139 140 fprintf(cfp, "\nstatic const sal_uInt16 Hanja2HangulIndex[] = {"); 141 142 address=0; 143 for (i = 0; i < 0x10; i++) { 144 fprintf(cfp, "\n\t"); 145 for (j = 0; j < 0x10; j++) { 146 for (k = 0; k < 0x100; k++) { 147 if (Hanja2HangulData[((i*0x10)+j)*0x100+k] != 0) 148 break; 149 } 150 fprintf( 151 cfp, "0x%04lx, ", 152 sal::static_int_cast< unsigned long >( 153 k < 0x100 ? (address++)*0x100 : 0xFFFF)); 154 } 155 } 156 fprintf(cfp, "\n};\n"); 157 158 fprintf(cfp, "\nstatic const sal_Unicode Hanja2HangulData[] = {"); 159 160 for (i = 0; i < 0x100; i++) { 161 for (j = 0; j < 0x100; j++) { 162 if (Hanja2HangulData[i*0x100+j] != 0) 163 break; 164 } 165 if (j < 0x100) { 166 for (j = 0; j < 0x10; j++) { 167 fprintf(cfp, "\n\t"); 168 for (k = 0; k < 0x10; k++) { 169 sal_Unicode c = Hanja2HangulData[((i*0x10+j)*0x10)+k]; 170 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF); 171 } 172 } 173 } 174 } 175 fprintf(cfp, "\n};\n"); 176 177 // create function to return arrays 178 fprintf (cfp, "\tconst sal_Unicode* getHangul2HanjaData() { return Hangul2HanjaData; }\n"); 179 fprintf (cfp, "\tconst com::sun::star::i18n::Hangul_Index* getHangul2HanjaIndex() { return Hangul2HanjaIndex; }\n"); 180 fprintf (cfp, "\tsal_Int16 getHangul2HanjaIndexCount() { return sizeof(Hangul2HanjaIndex) / sizeof(com::sun::star::i18n::Hangul_Index); }\n"); 181 fprintf (cfp, "\tconst sal_uInt16* getHanja2HangulIndex() { return Hanja2HangulIndex; }\n"); 182 fprintf (cfp, "\tconst sal_Unicode* getHanja2HangulData() { return Hanja2HangulData; }\n"); 183 } 184 185 // Simplified/Traditional Chinese character conversion 186 void make_stc_char(FILE *sfp, FILE *cfp) 187 { 188 sal_Int32 address, i, j, k; 189 sal_Unicode SChinese2TChineseData[0x10000]; 190 sal_Unicode SChinese2VChineseData[0x10000]; 191 sal_Unicode TChinese2SChineseData[0x10000]; 192 for (i = 0; i < 0x10000; i++) { 193 SChinese2TChineseData[i] = 0; 194 SChinese2VChineseData[i] = 0; 195 TChinese2SChineseData[i] = 0; 196 } 197 198 sal_Char Cstr[1024]; 199 while (fgets(Cstr, 1024, sfp)) { 200 // input file is in UTF-8 encoding (SChinese:TChinese) 201 // don't convert last new line character to Ostr. 202 OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8); 203 const sal_Unicode *Ustr = Ostr.getStr(); 204 sal_Int32 len = Ostr.getLength(); 205 if (Ustr[1] == sal_Unicode('v')) 206 SChinese2VChineseData[Ustr[0]] = Ustr[2]; 207 else { 208 SChinese2TChineseData[Ustr[0]] = Ustr[2]; 209 if (SChinese2VChineseData[Ustr[0]] == 0) 210 SChinese2VChineseData[Ustr[0]] = Ustr[2]; 211 } 212 for (i = 2; i < len; i++) 213 TChinese2SChineseData[Ustr[i]] = Ustr[0]; 214 } 215 216 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2T[] = {"); 217 218 address=0; 219 for (i = 0; i < 0x10; i++) { 220 fprintf(cfp, "\n\t"); 221 for (j = 0; j < 0x10; j++) { 222 for (k = 0; k < 0x100; k++) { 223 if (SChinese2TChineseData[((i*0x10)+j)*0x100+k] != 0) 224 break; 225 } 226 fprintf( 227 cfp, "0x%04lx, ", 228 sal::static_int_cast< unsigned long >( 229 k < 0x100 ? (address++)*0x100 : 0xFFFF)); 230 } 231 } 232 fprintf(cfp, "\n};\n"); 233 234 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2T[] = {"); 235 236 for (i = 0; i < 0x100; i++) { 237 for (j = 0; j < 0x100; j++) { 238 if (SChinese2TChineseData[i*0x100+j] != 0) 239 break; 240 } 241 if (j < 0x100) { 242 for (j = 0; j < 0x10; j++) { 243 fprintf(cfp, "\n\t"); 244 for (k = 0; k < 0x10; k++) { 245 sal_Unicode c = SChinese2TChineseData[((i*0x10+j)*0x10)+k]; 246 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF); 247 } 248 } 249 } 250 } 251 fprintf(cfp, "\n};\n"); 252 253 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2V[] = {"); 254 255 address=0; 256 for (i = 0; i < 0x10; i++) { 257 fprintf(cfp, "\n\t"); 258 for (j = 0; j < 0x10; j++) { 259 for (k = 0; k < 0x100; k++) { 260 if (SChinese2VChineseData[((i*0x10)+j)*0x100+k] != 0) 261 break; 262 } 263 fprintf( 264 cfp, "0x%04lx, ", 265 sal::static_int_cast< unsigned long >( 266 k < 0x100 ? (address++)*0x100 : 0xFFFF)); 267 } 268 } 269 fprintf(cfp, "\n};\n"); 270 271 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2V[] = {"); 272 273 for (i = 0; i < 0x100; i++) { 274 for (j = 0; j < 0x100; j++) { 275 if (SChinese2VChineseData[i*0x100+j] != 0) 276 break; 277 } 278 if (j < 0x100) { 279 for (j = 0; j < 0x10; j++) { 280 fprintf(cfp, "\n\t"); 281 for (k = 0; k < 0x10; k++) { 282 sal_Unicode c = SChinese2VChineseData[((i*0x10+j)*0x10)+k]; 283 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF); 284 } 285 } 286 } 287 } 288 fprintf(cfp, "\n};\n"); 289 290 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_T2S[] = {"); 291 292 address=0; 293 for (i = 0; i < 0x10; i++) { 294 fprintf(cfp, "\n\t"); 295 for (j = 0; j < 0x10; j++) { 296 for (k = 0; k < 0x100; k++) { 297 if (TChinese2SChineseData[((i*0x10)+j)*0x100+k] != 0) 298 break; 299 } 300 fprintf( 301 cfp, "0x%04lx, ", 302 sal::static_int_cast< unsigned long >( 303 k < 0x100 ? (address++)*0x100 : 0xFFFF)); 304 } 305 } 306 fprintf(cfp, "\n};\n"); 307 308 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_T2S[] = {"); 309 310 for (i = 0; i < 0x100; i++) { 311 for (j = 0; j < 0x100; j++) { 312 if (TChinese2SChineseData[i*0x100+j] != 0) 313 break; 314 } 315 if (j < 0x100) { 316 for (j = 0; j < 0x10; j++) { 317 fprintf(cfp, "\n\t"); 318 for (k = 0; k < 0x10; k++) { 319 sal_Unicode c = TChinese2SChineseData[((i*0x10+j)*0x10)+k]; 320 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF); 321 } 322 } 323 } 324 } 325 fprintf(cfp, "\n};\n"); 326 327 // create function to return arrays 328 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2T() { return STC_CharIndex_S2T; }\n"); 329 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2T() { return STC_CharData_S2T; }\n"); 330 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2V() { return STC_CharIndex_S2V; }\n"); 331 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2V() { return STC_CharData_S2V; }\n"); 332 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_T2S() { return STC_CharIndex_T2S; }\n"); 333 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_T2S() { return STC_CharData_T2S; }\n"); 334 } 335 336 337 typedef struct { 338 sal_uInt16 address; 339 sal_Int32 len; 340 sal_Unicode *data; 341 } Index; 342 343 extern "C" { 344 int Index_comp(const void* s1, const void* s2) 345 { 346 Index *p1 = (Index*)s1, *p2 = (Index*)s2; 347 int result = p1->len - p2->len; 348 for (int i = 0; result == 0 && i < p1->len; i++) 349 result = *(p1->data+i) - *(p2->data+i); 350 return result; 351 } 352 } 353 354 // Simplified/Traditional Chinese word conversion 355 void make_stc_word(FILE *sfp, FILE *cfp) 356 { 357 sal_Int32 count, i, length; 358 sal_Unicode STC_WordData[0x10000]; 359 std::vector<Index> STC_WordEntry_S2T(0x10000); 360 std::vector<Index> STC_WordEntry_T2S(0x10000); 361 sal_Int32 count_S2T = 0, count_T2S = 0; 362 sal_Int32 line = 0, char_total = 0; 363 sal_Char Cstr[1024]; 364 365 while (fgets(Cstr, 1024, sfp)) { 366 // input file is in UTF-8 encoding (SChinese:TChinese) 367 // don't convert last new line character to Ostr. 368 OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8); 369 sal_Int32 len = Ostr.getLength(); 370 if (char_total + len + 1 > 0xFFFF) { 371 fprintf(stderr, "Word Dictionary stc_word.dic is too big (line %ld)", sal::static_int_cast< long >(line)); 372 return; 373 } 374 sal_Int32 sep=-1, eq=-1, gt=-1, lt=-1; 375 if (((sep = eq = Ostr.indexOf(sal_Unicode('='))) > 0) || 376 ((sep = gt = Ostr.indexOf(sal_Unicode('>'))) > 0) || 377 ((sep = lt = Ostr.indexOf(sal_Unicode('<'))) > 0)) { 378 379 if (eq > 0 || gt > 0) { 380 STC_WordEntry_S2T[count_S2T].address = sal::static_int_cast<sal_uInt16>( char_total ); 381 STC_WordEntry_S2T[count_S2T].len = sep; 382 STC_WordEntry_S2T[count_S2T++].data = &STC_WordData[char_total]; 383 } 384 if (eq > 0 || lt > 0) { 385 STC_WordEntry_T2S[count_T2S].address = sal::static_int_cast<sal_uInt16>( char_total + sep + 1 ); 386 STC_WordEntry_T2S[count_T2S].len = len - sep - 1; 387 STC_WordEntry_T2S[count_T2S++].data = &STC_WordData[char_total + sep + 1]; 388 } 389 for (i = 0; i < len; i++) 390 STC_WordData[char_total++] = (i == sep) ? 0 : Ostr[i]; 391 STC_WordData[char_total++] = 0; 392 } else { 393 fprintf(stderr, "Invalid entry in stc_word.dic (line %ld)", sal::static_int_cast< long >(line)); 394 return; 395 } 396 line++; 397 } 398 399 if (char_total > 0) { 400 fprintf(cfp, "\nstatic const sal_Unicode STC_WordData[] = {"); 401 for (i = 0; i < char_total; i++) { 402 if (i % 32 == 0) fprintf(cfp, "\n\t"); 403 fprintf(cfp, "0x%04x, ", STC_WordData[i]); 404 } 405 fprintf(cfp, "\n};\n"); 406 407 fprintf(cfp, "\nstatic sal_Int32 STC_WordData_Count = %ld;\n", sal::static_int_cast< long >(char_total)); 408 409 // create function to return arrays 410 fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = STC_WordData_Count; return STC_WordData; }\n"); 411 } else { 412 fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = 0; return NULL; }\n"); 413 } 414 415 sal_uInt16 STC_WordIndex[0x100]; 416 417 if (count_S2T > 0) { 418 qsort(&STC_WordEntry_S2T[0], count_S2T, sizeof(Index), Index_comp); 419 420 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_S2T[] = {"); 421 count = 0; 422 length = 0; 423 for (i = 0; i < count_S2T; i++) { 424 if (i % 32 == 0) fprintf(cfp, "\n\t"); 425 fprintf(cfp, "0x%04x, ", STC_WordEntry_S2T[i].address); 426 if (STC_WordEntry_S2T[i].len != length) { 427 length = STC_WordEntry_S2T[i].len; 428 while (count <= length) 429 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i); 430 } 431 } 432 fprintf(cfp, "\n};\n"); 433 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i); 434 435 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_S2T[] = {"); 436 for (i = 0; i < count; i++) { 437 if (i % 16 == 0) fprintf(cfp, "\n\t"); 438 fprintf(cfp, "0x%04x, ", STC_WordIndex[i]); 439 } 440 fprintf(cfp, "\n};\n"); 441 442 fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_S2T_Count = %ld;\n", sal::static_int_cast< long >(length)); 443 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return STC_WordEntry_S2T; }\n"); 444 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = STC_WordIndex_S2T_Count; return STC_WordIndex_S2T; }\n"); 445 } else { 446 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return NULL; }\n"); 447 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = 0; return NULL; }\n"); 448 } 449 450 if (count_T2S > 0) { 451 qsort(&STC_WordEntry_T2S[0], count_T2S, sizeof(Index), Index_comp); 452 453 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_T2S[] = {"); 454 count = 0; 455 length = 0; 456 for (i = 0; i < count_T2S; i++) { 457 if (i % 32 == 0) fprintf(cfp, "\n\t"); 458 fprintf(cfp, "0x%04x, ", STC_WordEntry_T2S[i].address); 459 if (STC_WordEntry_T2S[i].len != length) { 460 length = STC_WordEntry_T2S[i].len; 461 while (count <= length) 462 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i); 463 } 464 } 465 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i); 466 fprintf(cfp, "\n};\n"); 467 468 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_T2S[] = {"); 469 for (i = 0; i < count; i++) { 470 if (i % 16 == 0) fprintf(cfp, "\n\t"); 471 fprintf(cfp, "0x%04x, ", STC_WordIndex[i]); 472 } 473 fprintf(cfp, "\n};\n"); 474 475 fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_T2S_Count = %ld;\n\n", sal::static_int_cast< long >(length)); 476 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return STC_WordEntry_T2S; }\n"); 477 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = STC_WordIndex_T2S_Count; return STC_WordIndex_T2S; }\n"); 478 } else { 479 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return NULL; }\n"); 480 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = 0; return NULL; }\n"); 481 } 482 } 483 484