xref: /trunk/main/i18npool/source/isolang/lcid.awk (revision 5b501c92)
1cdf0e10cSrcweir#!/usr/bin/awk -f
2*5b501c92SAndrew Rist# *************************************************************
3*5b501c92SAndrew Rist#
4*5b501c92SAndrew Rist#  Licensed to the Apache Software Foundation (ASF) under one
5*5b501c92SAndrew Rist#  or more contributor license agreements.  See the NOTICE file
6*5b501c92SAndrew Rist#  distributed with this work for additional information
7*5b501c92SAndrew Rist#  regarding copyright ownership.  The ASF licenses this file
8*5b501c92SAndrew Rist#  to you under the Apache License, Version 2.0 (the
9*5b501c92SAndrew Rist#  "License"); you may not use this file except in compliance
10*5b501c92SAndrew Rist#  with the License.  You may obtain a copy of the License at
11*5b501c92SAndrew Rist#
12*5b501c92SAndrew Rist#    http://www.apache.org/licenses/LICENSE-2.0
13*5b501c92SAndrew Rist#
14*5b501c92SAndrew Rist#  Unless required by applicable law or agreed to in writing,
15*5b501c92SAndrew Rist#  software distributed under the License is distributed on an
16*5b501c92SAndrew Rist#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17*5b501c92SAndrew Rist#  KIND, either express or implied.  See the License for the
18*5b501c92SAndrew Rist#  specific language governing permissions and limitations
19*5b501c92SAndrew Rist#  under the License.
20*5b501c92SAndrew Rist#
21*5b501c92SAndrew Rist# *************************************************************
22cdf0e10cSrcweir#
23cdf0e10cSrcweir# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
24cdf0e10cSrcweir# Run in i18npool/source/isolang
25cdf0e10cSrcweir#
26cdf0e10cSrcweir# outputs new #define LANGUAGE_... 0x... and also some commented out substrings
27cdf0e10cSrcweir# that were matched in already existing defines.
28cdf0e10cSrcweir#
# ATTENTION! The sed filter in the command line examples below ensures that a
# '|' border is drawn by html2text in data tables, and nowhere else, on which
# this awk script relies. This script also heavily relies on the column layout
# encountered. Should MS decide to change their layout or their CSS names
# ("data..."), this would probably break. Should html2text decide that the last
# border="..." attribute encountered wins instead of the first, this may break
# also.
36cdf0e10cSrcweir#
37cdf0e10cSrcweir# sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
38cdf0e10cSrcweir#
# After html2text it is best to clean up the file so that it contains _only_
# the table entries, but this is not strictly necessary because entries are
# filtered. Check the output.
41cdf0e10cSrcweir#
42cdf0e10cSrcweir# Expects input from the saved page of one of
43cdf0e10cSrcweir#
44cdf0e10cSrcweir# (1)
45cdf0e10cSrcweir# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
46cdf0e10cSrcweir# filtered through ``html2text -nobs ...'', generated table:
47cdf0e10cSrcweir# blank,name,hex,dec,blank fields:
48cdf0e10cSrcweir#    |Afrikaans_-_South_Africa___|0436___|1078___|
49cdf0e10cSrcweir#
50cdf0e10cSrcweir# complete command line:
51cdf0e10cSrcweir# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
52cdf0e10cSrcweir#
53cdf0e10cSrcweir#
54cdf0e10cSrcweir# (2)
55cdf0e10cSrcweir# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
56cdf0e10cSrcweir# filtered through ``html2text -nobs ...'', generated table:
57cdf0e10cSrcweir# blank,name,hex,dec,inputlocales,collection,blank fields:
58cdf0e10cSrcweir#    |Afrikaans   |0436   |1078   |0436:00000409,   |Basic   |
59cdf0e10cSrcweir#
60cdf0e10cSrcweir# complete command line:
61cdf0e10cSrcweir# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
62cdf0e10cSrcweir#
63cdf0e10cSrcweir#
64cdf0e10cSrcweir# (3)
65cdf0e10cSrcweir# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
66cdf0e10cSrcweir# filtered through ``html2text -nobs ...'', generated table:
67cdf0e10cSrcweir# blank,hex,locale,name,blank  fields:
68cdf0e10cSrcweir#   |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
69cdf0e10cSrcweir#
70cdf0e10cSrcweir# complete command line:
71cdf0e10cSrcweir# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
72cdf0e10cSrcweir#
73cdf0e10cSrcweir# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
74cdf0e10cSrcweir#
75cdf0e10cSrcweir
# Load the LANGUAGE_* definitions already present in lang.h into lang[] and
# set up the per-input-format column layout used by the rules below.
BEGIN {
    langh = "../../inc/i18npool/lang.h"
    # getline yields 1 per line read, 0 at end of file and -1 on error. The
    # default whitespace field splitting is still in effect here, so $2 is the
    # macro name and $3 its value.
    while ((rc = (getline < langh)) > 0)
    {
        if ($0 ~ /^#define[ \t]*LANGUAGE_[_A-Za-z0-9]*[ \t]*0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME, HEX without the leading "0x", both upper-cased
            lang[toupper(substr($3,3))] = toupper($2)
        }
    }
    if (rc < 0)
    {
        # Without lang.h nothing is "already defined", so the report would
        # list every input entry as new; tell the user why.
        print "Cannot read " langh ", all entries will be reported as new." >>"/dev/stderr"
    }
    else
        close(langh)

    # html2text table follows; a single-character FS other than blank is
    # taken literally, so no escaping is needed (and "\|" is not a defined
    # string escape sequence).
    FS = "|"

    # Input format identifiers; 0 means "not detected yet".
    filetype = 0
    lcid_all = 1
    xp_lcid  = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid]  = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"

    # Field numbers of the name, hex and locale columns for each format.
    # Field 1 is the empty text before the first border. A locale field of 0
    # means the format has no locale column.
    namefield[lcid_all] = 2
    namefield[xp_lcid]  = 2
    namefield[nls_238z] = 4
    hexfield[lcid_all]  = 3
    hexfield[xp_lcid]   = 3
    hexfield[nls_238z]  = 2
    locfield[lcid_all]  = 0
    locfield[xp_lcid]   = 0
    locfield[nls_238z]  = 3
}
106cdf0e10cSrcweir
# Rows with fewer than five '|'-separated fields cannot be table rows of any
# supported layout; ignore them.
(NF < 5) { next }

# As long as the input format is unknown, try to recognize it from the
# current row. Rows seen before a format is recognized are dropped; the row
# that reveals the format falls through to the rules below.
!filetype {
    if (NF == 7)
        filetype = xp_lcid
    else if (NF == 5 && $2 ~ /^0x/)
        filetype = nls_238z
    else if (NF == 5 && $2 ~ /^Afrikaans/)
        filetype = lcid_all

    if (!filetype)
        next

    # Remember which field numbers hold the name, hex value and locale.
    hex = hexfield[filetype]
    loc = locfield[filetype]
    name = namefield[filetype]
}
125cdf0e10cSrcweir
# Tidy up the columns of interest in every remaining row.
{
    # Name column: drop everything up to and including the first ':' and
    # everything from the first '.' onwards. Both patterns can match at most
    # once, so a single substitution is sufficient.
    sub( /^[^:]*:/, "", $name)
    sub( /\..*/, "", $name)

    # Strip the blank / underscore padding html2text puts around cell text.
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    if (loc)
    {
        # only formats with a locale column have a non-zero loc
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
    }
}
134cdf0e10cSrcweir
# Drop a leading "0x" so the value can be compared with the keys of lang[].
($hex ~ /^0x/) { $hex = substr( $hex, 3) }

# Left-pad short hex values to the four digits used in lang.h, e.g. 464 ->
# 0464 and 7F -> 007F. Only cells that consist solely of hex digits are
# padded: an empty cell must not turn into a bogus "0" LCID, and header text
# should reach the filter below unmodified.
($hex ~ /^[0-9a-fA-F][0-9a-fA-F]*$/) {
    while (length($hex) < 4)
        $hex = "0" $hex
}

# Anything that is not a plain hex number is not a table entry; keep it for
# the "filtered" section of the report and stop processing the row.
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

# Record the entry under its upper-cased hex value.
{
    lcid = toupper($hex)

    # all[HEX]=string
    all[lcid] = $name

    # locale column, if the format has one, becomes a trailing C comment
    if (loc)
        comment[lcid] = "  /* " $loc " */"

    # new hex: newlang[HEX]=string
    if (!(lcid in lang))
        newlang[lcid] = $name
}
149cdf0e10cSrcweir
# Print the report: proposed #defines for LCIDs missing from lang.h, then the
# lang.h entries that were not found in the input, then the rows that were
# rejected by the hex filter.
END {
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"

    # every new language
    for (hexid in newlang)
    {
        rawname = newlang[hexid]
        # marker line showing the name exactly as found in the input
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", rawname, hexid, comment[hexid])

        # Build an identifier from the alphanumeric words of the name,
        # upper-cased and joined by '_'.
        nwords = split(rawname, word, /[^A-Za-z0-9]/)
        ident = ""
        for (w = 1; w <= nwords; ++w)
        {
            if (!length(word[w]))
                continue
            upword = toupper(word[w])
            ident = (ident ? ident "_" : "") upword

            # list existing definitions whose name contains this word
            for (known in lang)
            {
                if (lang[known] ~ upword)
                    printf( "// %-50s %s\n", word[w] ": " lang[known], known)
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", ident, hexid)
    }

    print "\n// --- reverse check follows ----------------------------------\n"
    for (known in lang)
    {
        if (known in all)
            continue
        print "// not in input file:   " known "  " lang[known]
    }

    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (rejected in filtered)
        print "// filtered:   " rejected "  " filtered[rejected]
}
192