xref: /trunk/main/i18npool/source/isolang/langid.pl (revision cdf0e10c)
1: # -*- perl -*-  vim: ft=perl
2eval 'exec perl -w -S $0 ${1+"$@"}'
3if 0;
4#*************************************************************************
5#
6# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7#
8# Copyright 2000, 2010 Oracle and/or its affiliates.
9#
10# OpenOffice.org - a multi-platform office productivity suite
11#
12# This file is part of OpenOffice.org.
13#
14# OpenOffice.org is free software: you can redistribute it and/or modify
15# it under the terms of the GNU Lesser General Public License version 3
16# only, as published by the Free Software Foundation.
17#
18# OpenOffice.org is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU Lesser General Public License version 3 for more details
22# (a copy is included in the LICENSE file that accompanied this code).
23#
24# You should have received a copy of the GNU Lesser General Public License
25# version 3 along with OpenOffice.org.  If not, see
26# <http://www.openoffice.org/license.html>
27# for a copy of the LGPLv3 License.
28#
29#*************************************************************************
30
31# See Usage() below or invoke without arguments for short instructions.
32# For long instructions use the source, Luke ;-)
33
34use strict;
35
36sub Usage()
37{
38    print STDERR
39        "\n",
40        "langid - a hackish utility to lookup lang.h language defines and LangIDs,\n",
41        "isolang.cxx ISO639/ISO3166 mapping, locale data files, langtab.src language\n",
42        "listbox entries, postset.mk, file_ooo.scp registry name, globals.pm and\n",
43        "msi-encodinglist.txt\n\n",
44
45        "Usage: $0 [--single] {language string} | {LangID} | {primarylanguage sublanguage} | {language-country}\n\n",
46
47        "A language string will be used as a generic string match in all searched files.\n",
48        "You may enclose the language string in word delimiters,\n",
49        "e.g. \\blanguage_german\\b for a specific match.\n",
50        "If the language string expression matches more than one define,\n",
51        "e.g. as in 'german', all matching defines will be processed.\n",
52        "If the language string does not match a define or an identifier in\n",
53        "langtab.src, a generic string match of the listbox entries will be tried.\n\n",
54
55        "Numeric values of LangID,primarylanguage,sublanguage can be given\n",
56        "decimal, hexagesimal (leading 0x), octal (leading 0) or binary (leading 0b).\n",
57        "The exact language_define of an exact match will be used in remaining lookups.\n\n",
58
59        "A language-country pair will lookup a xx-YY mapping from isolang.cxx,\n",
60        "for example: 'en-US' or 'de-' or '-CH',\n",
61        "xx and YY can be given case insensitive, will be lowered-uppered internally,\n",
62        "and xx and YY themselfs may be regular expressions.\n",
63        "Also here a list of matches will be processed.\n\n",
64
65        "If option --single is given, only the first match will be processed.\n\n";
66}
67
68my $SOLARVERSION = $ENV{"SOLARVERSION"};
69my $INPATH       = $ENV{"INPATH"};
70my $SRC_ROOT     = $ENV{"SRC_ROOT"};
71my $UPDMINOREXT  = $ENV{"UPDMINOREXT"};
72if (!defined($SOLARVERSION) || !defined($INPATH) || !defined($SRC_ROOT))
73{
74    print "\nNeed \$SOLARVERSION, \$INPATH and \$SRC_ROOT, please set your OOo environment!\n";
75    Usage();
76    exit 1;
77}
78if (!defined($UPDMINOREXT)) {
79    $UPDMINOREXT  = '';
80}
81my $SOLENVINC = "$SOLARVERSION/$INPATH/inc$UPDMINOREXT";
82
83my $LANGUAGE_MASK_PRIMARY = 0x03ff;
84
85sub getPrimaryLanguage($)
86{
87    my($lcid) = @_;
88    return $lcid & $LANGUAGE_MASK_PRIMARY;
89}
90
91sub getSubLanguage($)
92{
93    my($lcid) = @_;
94    return $lcid >> 10;
95}
96
97sub makeLangID($$)
98{
99    my( $sub, $pri) = @_;
100    return ($sub << 10) | $pri;
101}
102
103
104sub grepFile($$$$@)
105{
106    my( $regex, $path, $module, $name, @addregex) = @_;
107    my @result;
108    my $found = 0;
109    my $areopen = 0;
110    my $arecloser = '';
111    my $file;
112    # Try module under current working directory first to catch local
113    # modifications. A Not yet delivered lang.h is a special case.
114    if ("$path/$module/$name" eq "$SOLENVINC/i18npool/lang.h") {
115        $file = "./$module/inc/i18npool/lang.h"; }
116    else {
117        $file = "./$module/$name"; }
118    if (!($found = open( IN, $file)))
119    {
120        # Then with the given path.
121        $file = "$path/$module/$name";
122        if (!($found = open( IN, $file)))
123        {
124            print "No $file\n";
125            $file = "$path/$module.lnk/$name";
126            if (!($found = open( IN, $file))) {
127                print "No $file.\n";
128                $file = "$path/$module.link/$name";
129                if (!($found = open( IN, $file))) {
130                    print "No $file either.\n"; }
131            }
132        }
133    }
134    if ($found)
135    {
136        $found = 0;
137        while (my $line = <IN>)
138        {
139            if ($line =~ /$regex/)
140            {
141                if (!$found)
142                {
143                    $found = 1;
144                    print "$file:\n";
145                }
146                chomp( $line);
147                print "$line\n";
148                push( @result, $line);
149            }
150            elsif (@addregex)
151            {
152                # By convention first element is opener, second element is closer.
153                if (!$areopen)
154                {
155                    if ($line =~ /$addregex[0]/)
156                    {
157                        $areopen = 1;
158                        $arecloser = $addregex[1];
159                    }
160                }
161                if ($areopen)
162                {
163                    for (my $i = 2; $i < @addregex; ++$i)
164                    {
165                        if ($line =~ /$addregex[$i]/)
166                        {
167                            if (!$found)
168                            {
169                                $found = 1;
170                                print "$file:\n";
171                            }
172                            chomp( $line);
173                            print "$line\n";
174                            push( @result, $line);
175                        }
176                    }
177                    if ($line =~ /$arecloser/)
178                    {
179                        $areopen = 0;
180                    }
181                }
182            }
183        }
184        close( IN);
185    }
186    if (!$found) {
187        print "Not found in $file\n";
188        #print "Not found in $file for $regex @addregex\n";
189    }
190    return @result;
191}
192
193
194sub main()
195{
196    my( $lcid, @parts, $grepdef, $options, $single);
197    $grepdef = 0;
198    $single = 0;
199    for ($options = 0; $options < @ARGV && $ARGV[$options] =~ /^--/; ++$options)
200    {
201        if ($ARGV[$options] eq '--single') { $single = 1; }
202        else { print "Unknown option: $ARGV[$options]\n"; }
203    }
204    if (@ARGV == 1 + $options)
205    {
206        # 0x hex, 0b bin, 0 oct
207        if ($ARGV[$options] =~ /^0/) {
208            $lcid = oct( $ARGV[0]); }
209        elsif ($ARGV[$options] =~ /^[0-9]/) {
210            $lcid = $ARGV[$options]; }
211        else
212        {
213            $grepdef = $ARGV[$options];
214            $lcid = 0;
215        }
216        $parts[0] = getPrimaryLanguage( $lcid);
217        $parts[1] = getSubLanguage( $lcid);
218    }
219    elsif (@ARGV == 2 + $options)
220    {
221        for (my $i = $options; $i < 2 + $options; ++$i)
222        {
223            if ($ARGV[$i] =~ /^0/) {
224                $parts[$i] = oct( $ARGV[$i]); }
225            else {
226                $parts[$i] = $ARGV[$i]; }
227        }
228        $lcid = makeLangID( $parts[1], $parts[0]);
229    }
230    else
231    {
232        Usage();
233        return 1;
234    }
235    my $modifier = "(?i)";
236    my (@resultlist, @greplist, $result);
237    # If no string was given on the command line, but value(s) were, lookup the
238    # LangID value to obtain the define identifier.
239    if ($grepdef)
240    {
241        # #define LANGUAGE_AFRIKAANS                  0x0436
242        @resultlist = grepFile(
243            $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef,
244            $SOLENVINC, "i18npool", "lang.h", ());
245    }
246    else
247    {
248        printf( "LangID: 0x%04X (dec %d), primary: 0x%03x, sub 0x%02x\n", $lcid,
249                $lcid, $parts[0], $parts[1]);
250        my $buf = sprintf( "0x%04X", $lcid);
251        @resultlist = grepFile(
252            '^\s*#\s*define\s+\w+\s+' . $buf,
253            $SOLENVINC, "i18npool", "lang.h", ());
254    }
255    for $result (@resultlist)
256    {
257        # #define LANGUAGE_AFRIKAANS                  0x0436
258        if ($result =~ /^\s*#\s*define\s+(\w+)\s+(0x[0-9a-fA-F]+)/)
259        {
260            push( @greplist, '\b' . $1 . '\b');
261            $modifier = "";     # complete identifier now case sensitive
262            if ($single) {
263                last; }
264        }
265    }
266    # If the string given is of the form xx-yy lookup a language,country pair
267    # to obtain the define identifier. xx and yy themselfs may be regexps.
268    # xx- is a short form for 'xx-.*' and -yy a short form for '.*-yy'
269    if ($grepdef =~ /^(.*)-$/) {
270        $grepdef = $1 . "-.*"; }
271    if ($grepdef =~ /^-(.*)$/) {
272        $grepdef = ".*-" . $1; }
273    if ($grepdef =~ /^(.*)-(.*)$/)
274    {
275        my $lang = $1;
276        my $coun = $2;
277        $lang = lc($lang);
278        $coun = uc($coun);
279        #     { LANGUAGE_AFRIKAANS,                   "af", "ZA" },
280        @resultlist = grepFile(
281            '^\s*\{\s*\w+\s*,\s*\"' . $lang . '\"\s*,\s*\"'  . $coun . '\"\s*\}\s*,',
282            "$SRC_ROOT", "i18npool", "source/isolang/isolang.cxx", ());
283        for $result (@resultlist)
284        {
285            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*\"\w+\"\s*,\s*\"(\w+)?\"\s*\}\s*,/)
286            {
287                push( @greplist, '\b' . $1 . '\b');
288                $modifier = "";     # complete identifier now case sensitive
289                if ($single) {
290                    last; }
291            }
292        }
293        $grepdef = 0;
294    }
295    if (!@greplist && $grepdef) {
296        push( @greplist, $grepdef); }
297    for $grepdef (@greplist)
298    {
299        print "\nUsing: " . $grepdef . "\n";
300
301        # Decimal LCID, was needed for Langpack.ulf but isn't used anymore,
302        # keep just in case we'd need it again.
303        # #define LANGUAGE_AFRIKAANS                  0x0436
304        @resultlist = grepFile(
305            $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef,
306            $SOLENVINC, "i18npool", "lang.h", ());
307        my @lcidlist;
308        for $result (@resultlist)
309        {
310            # #define LANGUAGE_AFRIKAANS                  0x0436
311            if ($result =~ /^\s*#\s*define\s+(\w+)\s+(0x[0-9a-fA-F]+)/)
312            {
313                push( @lcidlist, oct( $2));
314            }
315        }
316
317        #     { LANGUAGE_AFRIKAANS,                   "af", "ZA" },
318        @resultlist = grepFile(
319            $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*\".*\"\s*,\s*\".*\"\s*\}\s*,',
320            "$SRC_ROOT", "i18npool", "source/isolang/isolang.cxx", ());
321
322        my @langcoungreplist;
323        for $result (@resultlist)
324        {
325            if ($result =~ /^\s*\{\s*\w+\s*,\s*\"(\w+)\"\s*,\s*\"(\w+)?\"\s*\}\s*,/)
326            {
327                my $lang = $1;
328                my $coun = $2;
329                my $loca;
330                if ($coun)
331                {
332                    $loca = $lang . "_" . $coun;
333                    push( @langcoungreplist, '\b' . $lang . '\b(-' . $coun . ')?');
334                }
335                else
336                {
337                    $loca = $lang;
338                    $coun = "";
339                    push( @langcoungreplist, '\b' . $lang . '\b');
340                }
341                my $file = "$SRC_ROOT/i18npool/source/localedata/data/$loca.xml";
342                my $found;
343                if (!($found = open( LD, $file)))
344                {
345                    $file = "$SRC_ROOT/i18npool.lnk/source/localedata/data/$loca.xml";
346                    if (!($found = open( LD, $file)))
347                    {
348                        $file = "$SRC_ROOT/i18npool.link/source/localedata/data/$loca.xml";
349                        $found = open( LD, $file);
350                    }
351                }
352                if ($found)
353                {
354                    print "Found $file:\n";
355                    my $on = 0;
356                    while (my $line = <LD>)
357                    {
358                        if ($line =~ /<(Language|Country)>/) {
359                            $on = 1; }
360                        if ($on) {
361                            print $line; }
362                        if ($line =~ /<\/(Language|Country)>/) {
363                            $on = 0; }
364                    }
365                    close( LD);
366                }
367                else {
368                    print "No $SRC_ROOT/i18npool/source/localedata/data/$loca.xml\n"; }
369            }
370        }
371
372        #         case LANGUAGE_ARABIC:
373        grepFile(
374            $modifier . '^\s*case\s*.*' . $grepdef . '.*\s*:',
375            "$SRC_ROOT", "i18npool", "source/isolang/mslangid.cxx", ());
376
377        # With CWS 'langstatusbar' the language listbox resource file gets a new location.
378        my $module = "svx";
379        my $name = "source/dialog/langtab.src";
380        if (!(-e "$SRC_ROOT/$module/$name")) {
381            $module = "svtools";
382            $name = "source/misc/langtab.src";
383        }
384        #         < "Afrikaans" ; LANGUAGE_AFRIKAANS ; > ;
385        # lookup define
386        @resultlist = grepFile(
387            $modifier . '^\s*<\s*\".*\"\s*;\s*.*' . $grepdef . '.*\s*;\s*>\s*;',
388            "$SRC_ROOT", $module, $name, ());
389        # lookup string
390        if (!@resultlist) {
391            grepFile(
392                $modifier . '^\s*<\s*\".*' . $grepdef . '.*\"\s*;\s*.*\s*;\s*>\s*;',
393                "$SRC_ROOT", $module, $name, ()); }
394
395        for my $langcoun (@langcoungreplist)
396        {
397            # Name (xxx) = "/registry/spool/org/openoffice/Office/Common-ctl.xcu";
398            grepFile(
399                '^\s*Name\s*\(' . $langcoun . '\)\s*=',
400                "$SRC_ROOT", "scp2", "source/ooo/file_ooo.scp", ());
401
402            # completelangiso=af ar as-IN ... zu
403            grepFile(
404                '^\s*completelangiso\s*=\s*(\s*([a-z]{2,3})(-[A-Z][A-Z])?)*' . $langcoun . '',
405                "$SRC_ROOT", "solenv", "inc/postset.mk",
406                # needs a duplicated pair of backslashes to produce a literal \\
407                ('^\s*completelangiso\s*=', '^\s*$', '^\s*' . $langcoun . '\s*\\\\*$'));
408
409            # @noMSLocaleLangs = ( "br", "bs", ... )
410            grepFile(
411                '^\s*@noMSLocaleLangs\s*=\s*\(\s*(\s*"([a-z]{2,3})(-[A-Z][A-Z])?"\s*,?)*' . $langcoun . '',
412                "$SRC_ROOT", "solenv", "bin/modules/installer/globals.pm",
413                ('^\s*@noMSLocaleLangs\s*=', '\)\s*$', '"' . $langcoun . '"'));
414
415            # af    1252  1078   # Afrikaans
416            grepFile(
417                '^\s*' . $langcoun . '',
418                "$SRC_ROOT", "setup_native", "source/win32/msi-encodinglist.txt", ());
419        }
420    }
421    return 0;
422}
423
424main();
425