1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <fcntl.h>
31 #include <errno.h>
32 #include <string.h>
33 #include <unistd.h>
34 #include <ctype.h>
35 #include <sal/alloca.h>
36 
37 #include <rtl/ustring.hxx>
38 
39 #include <map>
40 #include <string>
41 
42 /*****************************************************************************
43  * typedefs
44  *****************************************************************************/
45 
46 typedef std::map< const std::string, rtl_TextEncoding > EncodingMap;
47 
48 struct _pair {
49     const char *key;
50     rtl_TextEncoding value;
51 };
52 
53 static int _pair_compare (const char *key, const _pair *pair);
54 static const _pair* _pair_search (const char *key, const _pair *base, unsigned int member );
55 
56 
57 const _pair _ms_encoding_list[] = {
58     { "0",       RTL_TEXTENCODING_UTF8        },
59     { "1250",    RTL_TEXTENCODING_MS_1250     },
60     { "1251",    RTL_TEXTENCODING_MS_1251     },
61     { "1252",    RTL_TEXTENCODING_MS_1252     },
62     { "1253",    RTL_TEXTENCODING_MS_1253     },
63     { "1254",    RTL_TEXTENCODING_MS_1254     },
64     { "1255",    RTL_TEXTENCODING_MS_1255     },
65     { "1256",    RTL_TEXTENCODING_MS_1256     },
66     { "1257",    RTL_TEXTENCODING_MS_1257     },
67     { "1258",    RTL_TEXTENCODING_MS_1258     },
68     { "874",     RTL_TEXTENCODING_MS_874      },
69     { "932",     RTL_TEXTENCODING_MS_932      },
70     { "936",     RTL_TEXTENCODING_MS_936      },
71     { "949",     RTL_TEXTENCODING_MS_949      },
72     { "950",     RTL_TEXTENCODING_MS_950      }
73 };
74 
75 
76 /*****************************************************************************
77  * fgets that work with unix line ends on Windows
78  *****************************************************************************/
79 
/* Reads one line from fp into s, storing at most n-1 characters followed by
   a terminating '\0'.  Reading stops after a '\n' (which is kept in s) or at
   end of file.  Returns s, or NULL if no character could be stored. */
char * my_fgets(char *s, int n, FILE *fp)
{
    int len = 0;

    while( len < n-1 )
    {
        const int ch = getc(fp);
        if( ch == EOF )
            break;

        s[len] = (char) ch;
        ++len;

        /* the line end is stored, then reading stops */
        if( s[len-1] == '\n' )
            break;
    }

    if( len <= 0 )
        return NULL;

    s[len] = '\0';
    return s;
}
109 
110 /*****************************************************************************
111  * compare function for binary search
112  *****************************************************************************/
113 
114 static int
115 _pair_compare (const char *key, const _pair *pair)
116 {
117     int result = rtl_str_compareIgnoreAsciiCase( key, pair->key );
118     return result;
119 }
120 
121 /*****************************************************************************
122  * binary search on encoding tables
123  *****************************************************************************/
124 
125 static const _pair*
126 _pair_search (const char *key, const _pair *base, unsigned int member )
127 {
128     unsigned int lower = 0;
129     unsigned int upper = member;
130     unsigned int current;
131     int comparison;
132 
133     /* check for validity of input */
134     if ( (key == NULL) || (base == NULL) || (member == 0) )
135         return NULL;
136 
137     /* binary search */
138     while ( lower < upper )
139     {
140         current = (lower + upper) / 2;
141         comparison = _pair_compare( key, base + current );
142         if (comparison < 0)
143             upper = current;
144         else
145         if (comparison > 0)
146             lower = current + 1;
147         else
148             return base + current;
149     }
150 
151     return NULL;
152 }
153 
154 
155 /************************************************************************
156  * read_encoding_table
157  ************************************************************************/
158 
159 void read_encoding_table(char * file, EncodingMap& aEncodingMap)
160 {
161     FILE * fp = fopen(file, "r");
162     if ( ! fp  ) {
163         fprintf(stderr, "ulfconv: %s %s\n", file, strerror(errno));
164         exit(2);
165     }
166 
167     char buffer[512];
168     while ( NULL != my_fgets(buffer, sizeof(buffer), fp) ) {
169 
170         // strip comment lines
171         if ( buffer[0] == '#' )
172             continue;
173 
174         // find end of language string
175         char * cp;
176         for ( cp = buffer; ! isspace(*cp); cp++ )
177             ;
178         *cp = '\0';
179 
180         // find start of codepage string
181         for ( ++cp; isspace(*cp); ++cp )
182             ;
183         char * codepage = cp;
184 
185         // find end of codepage string
186         for ( ++cp; ! isspace(*cp); ++cp )
187             ;
188         *cp = '\0';
189 
190         // find the correct mapping for codepage
191         const unsigned int members = sizeof( _ms_encoding_list ) / sizeof( _pair );
192         const _pair *encoding = _pair_search( codepage, _ms_encoding_list, members );
193 
194         if ( encoding != NULL ) {
195             const std::string language(buffer);
196             aEncodingMap.insert( EncodingMap::value_type(language, encoding->value) );
197         }
198     }
199 
200     fclose(fp);
201 }
202 
203 /************************************************************************
204  * print_legacy_mixed
205  ************************************************************************/
206 
207 void print_legacy_mixed(
208     FILE * ostream,
209     const rtl::OUString& aString,
210     const std::string& language,
211     EncodingMap& aEncodingMap)
212 {
213     EncodingMap::iterator iter = aEncodingMap.find(language);
214 
215     if ( iter != aEncodingMap.end() ) {
216         fputs(OUStringToOString(aString, iter->second).getStr(), ostream);
217     } else {
218         fprintf(stderr, "ulfconv: WARNING: no legacy encoding found for %s\n", language.c_str());
219     }
220 }
221 
222 /************************************************************************
223  * print_java_style
224  ************************************************************************/
225 
226 void print_java_style(FILE * ostream, const rtl::OUString& aString)
227 {
228     int imax = aString.getLength();
229     for (int i = 0; i < imax; i++) {
230         sal_Unicode uc = aString[i];
231         if ( uc < 128 ) {
232             fprintf(ostream, "%c", (char) uc);
233         } else {
234             fprintf(ostream, "\\u%2.2x%2.2x", uc >> 8, uc & 0xFF );
235         }
236     }
237 }
238 
239 /************************************************************************
240  * main
241  ************************************************************************/
242 
243 int main( int argc, char * const argv[] )
244 {
245     EncodingMap aEncodingMap;
246 
247     FILE *istream = stdin;
248     FILE *ostream = stdout;
249 
250     char *outfile = NULL;
251 
252     int errflg = 0;
253     int argi;
254 
255     for( argi=1; argi < argc; argi++ )
256     {
257         if( argv[argi][0] == '-' && argv[argi][2] == '\0' )
258         {
259             switch(argv[argi][1]) {
260             case 'o':
261                 if (argi+1 >= argc || argv[argi+1][0] == '-')
262                 {
263                     fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
264                     errflg++;
265                     break;
266                 }
267 
268                 ++argi;
269                 outfile = argv[argi];
270                 break;
271             case 't':
272                 if (argi+1 >= argc || argv[argi+1][0] == '-')
273                 {
274                     fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
275                     errflg++;
276                     break;
277                 }
278 
279                 read_encoding_table(argv[++argi], aEncodingMap);
280                 break;
281             default:
282                 fprintf(stderr, "Unrecognized option: -%c\n", argv[argi][1]);
283                 errflg++;
284             }
285         }
286         else
287         {
288             break;
289         }
290     }
291 
292     if (errflg) {
293       fprintf(stderr, "Usage: ulfconv [-o <output file>] [-t <encoding table>] [<ulf file>]\n");
294       exit(2);
295     }
296 
297     /* assign input file to stdin */
298     if ( argi < argc )
299     {
300         istream = fopen(argv[argi], "r");
301         if ( istream  == NULL ) {
302             fprintf(stderr, "ulfconv: %s : %s\n", argv[argi], strerror(errno));
303             exit(2);
304         }
305     }
306 
307 	/* open output file if any */
308 	if ( outfile )
309 	{
310         ostream = fopen(outfile, "w");
311         if ( ostream == NULL ) {
312             fprintf(stderr, "ulfconv: %s : %s\n", outfile, strerror(errno));
313             fclose(istream);
314             exit(2);
315         }
316 	}
317 
318     /* read line by line from stdin */
319     char buffer[65536];
320     while ( NULL != fgets(buffer, sizeof(buffer), istream) ) {
321 
322         /* only handle lines containing " = " */
323         char * cp = strstr(buffer, " = \"");
324         if ( cp ) {
325             rtl::OUString aString;
326 
327             /* find end of lang string */
328             int n;
329             for ( n=0; ! isspace(buffer[n]); n++ )
330                 ;
331 
332             std::string line = buffer;
333             std::string lang(line, 0, n);
334 
335             cp += 4;
336             rtl_string2UString( &aString.pData, cp, strrchr(cp, '\"') - cp,
337                 RTL_TEXTENCODING_UTF8, OSTRING_TO_OUSTRING_CVTFLAGS );
338 
339             fprintf(ostream, "%s = \"", lang.c_str());
340 
341             if ( aEncodingMap.empty() ) {
342                 print_java_style(ostream, aString);
343             } else {
344                 print_legacy_mixed(ostream, aString, lang, aEncodingMap);
345             }
346 
347             fprintf(ostream, "\"\n");
348 
349 
350         } else {
351             fputs(buffer, ostream);
352         }
353     }
354 
355     fclose(ostream);
356     fclose(istream);
357 }
358