1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <fcntl.h>
27 #include <errno.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <ctype.h>
31 #include <sal/alloca.h>
32
33 #include <rtl/ustring.hxx>
34
35 #include <map>
36 #include <string>
37
38 /*****************************************************************************
39 * typedefs
40 *****************************************************************************/
41
// Maps a language identifier (the first column of the encoding table read by
// read_encoding_table) to the legacy text encoding to use for that language.
typedef std::map< const std::string, rtl_TextEncoding > EncodingMap;
43
// One entry of a static lookup table: a textual key and the text encoding
// associated with it.
struct _pair {
    const char *key;          // lookup key, e.g. a code page name such as "1252"
    rtl_TextEncoding value;   // encoding selected by key
};
48
// Comparison helper used by _pair_search (defined below).
static int _pair_compare (const char *key, const _pair *pair);
// Binary search for key in the first `member` entries of the table `base`;
// returns NULL if there is no match (defined below).
static const _pair* _pair_search (const char *key, const _pair *base, unsigned int member );
51
52
// Code page name -> text encoding table searched by read_encoding_table.
//
// NOTE: _pair_search does a binary search, so the entries must stay in
// ascending order of key as seen by _pair_compare.  That is a string
// comparison, not a numeric one, which is why "874" follows "1258".
const _pair _ms_encoding_list[] = {
    { "0", RTL_TEXTENCODING_UTF8 },
    { "1250", RTL_TEXTENCODING_MS_1250 },
    { "1251", RTL_TEXTENCODING_MS_1251 },
    { "1252", RTL_TEXTENCODING_MS_1252 },
    { "1253", RTL_TEXTENCODING_MS_1253 },
    { "1254", RTL_TEXTENCODING_MS_1254 },
    { "1255", RTL_TEXTENCODING_MS_1255 },
    { "1256", RTL_TEXTENCODING_MS_1256 },
    { "1257", RTL_TEXTENCODING_MS_1257 },
    { "1258", RTL_TEXTENCODING_MS_1258 },
    { "874", RTL_TEXTENCODING_MS_874 },
    { "932", RTL_TEXTENCODING_MS_932 },
    { "936", RTL_TEXTENCODING_MS_936 },
    { "949", RTL_TEXTENCODING_MS_949 },
    { "950", RTL_TEXTENCODING_MS_950 }
};
70
71
72 /*****************************************************************************
 * fgets replacement that works with Unix line ends on Windows
74 *****************************************************************************/
75
/*
 * Reads one line from fp into s.
 *
 * At most n-1 characters are stored, followed by a terminating NUL.
 * Reading stops after a '\n' (which is kept in s) or at end of file.
 *
 * Returns s if at least one character was stored, NULL otherwise.
 */
char * my_fgets(char *s, int n, FILE *fp)
{
    int count = 0;

    while( count < n-1 )
    {
        const int ch = getc(fp);
        if( ch == EOF )
            break;

        s[count++] = (char) ch;

        /* the newline belongs to the line and ends it */
        if( ch == '\n' )
            break;
    }

    /* nothing stored: end of file, or no room for any character */
    if( count <= 0 )
        return NULL;

    s[count] = '\0';
    return s;
}
105
106 /*****************************************************************************
107 * compare function for binary search
108 *****************************************************************************/
109
110 static int
_pair_compare(const char * key,const _pair * pair)111 _pair_compare (const char *key, const _pair *pair)
112 {
113 int result = rtl_str_compareIgnoreAsciiCase( key, pair->key );
114 return result;
115 }
116
117 /*****************************************************************************
118 * binary search on encoding tables
119 *****************************************************************************/
120
121 static const _pair*
_pair_search(const char * key,const _pair * base,unsigned int member)122 _pair_search (const char *key, const _pair *base, unsigned int member )
123 {
124 unsigned int lower = 0;
125 unsigned int upper = member;
126 unsigned int current;
127 int comparison;
128
129 /* check for validity of input */
130 if ( (key == NULL) || (base == NULL) || (member == 0) )
131 return NULL;
132
133 /* binary search */
134 while ( lower < upper )
135 {
136 current = (lower + upper) / 2;
137 comparison = _pair_compare( key, base + current );
138 if (comparison < 0)
139 upper = current;
140 else
141 if (comparison > 0)
142 lower = current + 1;
143 else
144 return base + current;
145 }
146
147 return NULL;
148 }
149
150
151 /************************************************************************
152 * read_encoding_table
153 ************************************************************************/
154
read_encoding_table(char * file,EncodingMap & aEncodingMap)155 void read_encoding_table(char * file, EncodingMap& aEncodingMap)
156 {
157 FILE * fp = fopen(file, "r");
158 if ( ! fp ) {
159 fprintf(stderr, "ulfconv: %s %s\n", file, strerror(errno));
160 exit(2);
161 }
162
163 char buffer[512];
164 while ( NULL != my_fgets(buffer, sizeof(buffer), fp) ) {
165
166 // strip comment lines
167 if ( buffer[0] == '#' )
168 continue;
169
170 // find end of language string
171 char * cp;
172 for ( cp = buffer; ! isspace(*cp); cp++ )
173 ;
174 *cp = '\0';
175
176 // find start of codepage string
177 for ( ++cp; isspace(*cp); ++cp )
178 ;
179 char * codepage = cp;
180
181 // find end of codepage string
182 for ( ++cp; ! isspace(*cp); ++cp )
183 ;
184 *cp = '\0';
185
186 // find the correct mapping for codepage
187 const unsigned int members = sizeof( _ms_encoding_list ) / sizeof( _pair );
188 const _pair *encoding = _pair_search( codepage, _ms_encoding_list, members );
189
190 if ( encoding != NULL ) {
191 const std::string language(buffer);
192 aEncodingMap.insert( EncodingMap::value_type(language, encoding->value) );
193 }
194 }
195
196 fclose(fp);
197 }
198
199 /************************************************************************
200 * print_legacy_mixed
201 ************************************************************************/
202
print_legacy_mixed(FILE * ostream,const rtl::OUString & aString,const std::string & language,EncodingMap & aEncodingMap)203 void print_legacy_mixed(
204 FILE * ostream,
205 const rtl::OUString& aString,
206 const std::string& language,
207 EncodingMap& aEncodingMap)
208 {
209 EncodingMap::iterator iter = aEncodingMap.find(language);
210
211 if ( iter != aEncodingMap.end() ) {
212 fputs(OUStringToOString(aString, iter->second).getStr(), ostream);
213 } else {
214 fprintf(stderr, "ulfconv: WARNING: no legacy encoding found for %s\n", language.c_str());
215 }
216 }
217
218 /************************************************************************
219 * print_java_style
220 ************************************************************************/
221
print_java_style(FILE * ostream,const rtl::OUString & aString)222 void print_java_style(FILE * ostream, const rtl::OUString& aString)
223 {
224 int imax = aString.getLength();
225 for (int i = 0; i < imax; i++) {
226 sal_Unicode uc = aString[i];
227 if ( uc < 128 ) {
228 fprintf(ostream, "%c", (char) uc);
229 } else {
230 fprintf(ostream, "\\u%2.2x%2.2x", uc >> 8, uc & 0xFF );
231 }
232 }
233 }
234
235 /************************************************************************
236 * main
237 ************************************************************************/
238
main(int argc,char * const argv[])239 int main( int argc, char * const argv[] )
240 {
241 EncodingMap aEncodingMap;
242
243 FILE *istream = stdin;
244 FILE *ostream = stdout;
245
246 char *outfile = NULL;
247
248 int errflg = 0;
249 int argi;
250
251 for( argi=1; argi < argc; argi++ )
252 {
253 if( argv[argi][0] == '-' && argv[argi][2] == '\0' )
254 {
255 switch(argv[argi][1]) {
256 case 'o':
257 if (argi+1 >= argc || argv[argi+1][0] == '-')
258 {
259 fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
260 errflg++;
261 break;
262 }
263
264 ++argi;
265 outfile = argv[argi];
266 break;
267 case 't':
268 if (argi+1 >= argc || argv[argi+1][0] == '-')
269 {
270 fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
271 errflg++;
272 break;
273 }
274
275 read_encoding_table(argv[++argi], aEncodingMap);
276 break;
277 default:
278 fprintf(stderr, "Unrecognized option: -%c\n", argv[argi][1]);
279 errflg++;
280 }
281 }
282 else
283 {
284 break;
285 }
286 }
287
288 if (errflg) {
289 fprintf(stderr, "Usage: ulfconv [-o <output file>] [-t <encoding table>] [<ulf file>]\n");
290 exit(2);
291 }
292
293 /* assign input file to stdin */
294 if ( argi < argc )
295 {
296 istream = fopen(argv[argi], "r");
297 if ( istream == NULL ) {
298 fprintf(stderr, "ulfconv: %s : %s\n", argv[argi], strerror(errno));
299 exit(2);
300 }
301 }
302
303 /* open output file if any */
304 if ( outfile )
305 {
306 ostream = fopen(outfile, "w");
307 if ( ostream == NULL ) {
308 fprintf(stderr, "ulfconv: %s : %s\n", outfile, strerror(errno));
309 fclose(istream);
310 exit(2);
311 }
312 }
313
314 /* read line by line from stdin */
315 char buffer[65536];
316 while ( NULL != fgets(buffer, sizeof(buffer), istream) ) {
317
318 /* only handle lines containing " = " */
319 char * cp = strstr(buffer, " = \"");
320 if ( cp ) {
321 rtl::OUString aString;
322
323 /* find end of lang string */
324 int n;
325 for ( n=0; ! isspace(buffer[n]); n++ )
326 ;
327
328 std::string line = buffer;
329 std::string lang(line, 0, n);
330
331 cp += 4;
332 rtl_string2UString( &aString.pData, cp, strrchr(cp, '\"') - cp,
333 RTL_TEXTENCODING_UTF8, OSTRING_TO_OUSTRING_CVTFLAGS );
334
335 fprintf(ostream, "%s = \"", lang.c_str());
336
337 if ( aEncodingMap.empty() ) {
338 print_java_style(ostream, aString);
339 } else {
340 print_legacy_mixed(ostream, aString, lang, aEncodingMap);
341 }
342
343 fprintf(ostream, "\"\n");
344
345
346 } else {
347 fputs(buffer, ostream);
348 }
349 }
350
351 fclose(ostream);
352 fclose(istream);
353 }
354