source/isolang/lcid.awk

#!/usr/bin/awk -f
# *************************************************************
#
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.
#
# *************************************************************
#
# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
# Run in i18npool/source/isolang
#
# outputs new #define LANGUAGE_... 0x... and also some commented out substrings
# that were matched in already existing defines.
#
# ATTENTION! The sed filter in the command line examples below assures that a
# '|' border is drawn by html2text in data tables, and nowhere else, on which
# this awk script relies. This script also heavily relies on the column layout
# encountered. Should MS decide to change their layout or their CSS names
# ("data..."), this would probably break. Should html2text decide that the last
# border="..." attribute encountered wins instead of the first, this may break
# also.
#
# sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
#
# After html2text it is best to clean up the file so that it contains _only_
# the table entries. This is not strictly necessary, though, because other
# lines are filtered out. Check the output.
#
# Expects input from the saved page of one of
#
# (1)
# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,blank fields:
#    |Afrikaans_-_South_Africa___|0436___|1078___|
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (2)
# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,inputlocales,collection,blank fields:
#    |Afrikaans   |0436   |1078   |0436:00000409,   |Basic   |
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (3)
# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
# filtered through ``html2text -nobs ...'', generated table:
# blank,hex,locale,name,blank  fields:
#   |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
#
# complete command line:
# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
#

# Load the LANGUAGE_* definitions already present in lang.h into lang[] and
# set up the field separator and the per-file-type column layout tables used
# by the rules below.
BEGIN {
    langfile = "../../inc/i18npool/lang.h"
    # FS is still the default here, so $2 and $3 are the whitespace separated
    # name and value of the #define.
    while ((status = (getline < langfile)) > 0)
    {
        if ($0 ~ /^#define[ \t]+LANGUAGE_[_A-Za-z0-9]*[ \t]+0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME, HEX without the leading 0x
            lang[toupper(substr($3,3))] = toupper($2)
            #print substr($3,3) "=" $2
        }
    }
    if (status < 0)
    {
        # Keep going (best effort), but tell the user why every entry of the
        # input will be reported as new.
        print "Cannot read " langfile ", all entries will be reported as new." >>"/dev/stderr"
    }
    else
        close(langfile)
    # html2text table follows
    # A single character FS (other than a blank) is taken literally, so the
    # bar needs no escaping; "\|" is an undefined escape sequence in awk
    # string literals and makes gawk emit a warning.
    FS = "|"
    # Known input layouts; 0 means "not detected yet".
    filetype = 0
    lcid_all = 1
    xp_lcid  = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid]  = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"
    # Field numbers of the name, hex and locale columns for each layout.
    # Field 1 is the blank in front of the first '|'; 0 means the layout has
    # no such column.
    namefield[lcid_all] = 2
    namefield[xp_lcid]  = 2
    namefield[nls_238z] = 4
    hexfield[lcid_all]  = 3
    hexfield[xp_lcid]   = 3
    hexfield[nls_238z]  = 2
    locfield[lcid_all]  = 0
    locfield[xp_lcid]   = 0
    locfield[nls_238z]  = 3
}

# Records with fewer than 5 '|' separated fields are never table rows.
NF < 5 { next }

# As long as the layout is unknown, try to derive it from the current record:
# 7 fields mean xp_lcid; 5 fields mean nls_238z if the second field starts
# with 0x, or lcid_all if it starts with Afrikaans. Records that do not
# settle the question are skipped. Once the layout is known, name, hex and
# loc hold the field numbers of the respective columns.
filetype == 0 {
    if (NF == 7)
        filetype = xp_lcid
    else if (NF == 5 && $2 ~ /^0x/)
        filetype = nls_238z
    else if (NF == 5 && $2 ~ /^Afrikaans/)
        filetype = lcid_all
    if (filetype == 0)
        next
    loc = locfield[filetype]
    hex = hexfield[filetype]
    name = namefield[filetype]
}

# Tidy up the columns of interest in place, for every record that got here.
{
    # hex column (and locale column, if the layout has one): strip leading
    # and trailing blanks and underscores
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)

    # name column: remove everything up to and including the first colon,
    # cut the name off at the first dot, then strip the padding as above
    sub( /^[^:]*:/, "", $name)
    sub( /\..*/, "", $name)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
}

# Normalize the value of the hex column, set aside records that do not carry
# a hex number, and record the entry:
#   all[HEX]=string      every entry of the input
#   comment[HEX]=string  locale column as a C comment, if the layout has one
#   newlang[HEX]=string  entries whose HEX is not defined in lang.h
{
    # strip an optional 0x (or 0X) prefix
    if ($hex ~ /^0[xX]/)
        $hex = substr( $hex, 3)

    # Validate before padding, so that an empty column does not turn into
    # the bogus value "0" and slip through as a valid entry.
    if ($hex !~ /^[0-9a-fA-F]+$/)
    {
        filtered[$hex] = $0
        next
    }

    # if only 464 (or 7F) instead of 0464 (007F), make it match lang.h
    while (length($hex) < 4)
        $hex = "0" $hex

    lcid = toupper($hex)
    all[lcid] = $name
    if (loc)
        comment[lcid] = "  /* " $loc " */"
    if (!(lcid in lang))
        newlang[lcid] = $name
}

# Report: proposed #defines for all entries not found in lang.h, followed by
# the lang.h definitions missing from the input and the records that were
# set aside because their hex column did not hold a hex number.
END {
    if (filetype == 0)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"

    # one proposal per new language
    for (code in newlang)
    {
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[code], code, comment[code])
        ident = ""
        nwords = split(newlang[code], word, /[^A-Za-z0-9]/)
        for (w = 1; w <= nwords; w++)
        {
            # adjacent separators yield empty pieces, skip them
            if (word[w] == "")
                continue
            # the identifier is made of the upper-cased words joined by '_'
            upper = toupper(word[w])
            ident = (ident == "" ? upper : ident "_" upper)
            # list existing definitions whose name contains this word
            for (known in lang)
                if (lang[known] ~ upper)
                    printf( "// %-50s %s\n", word[w] ": " lang[known], known)
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", ident, code)
    }

    print "\n// --- reverse check follows ----------------------------------\n"
    for (code in lang)
        if (!(code in all))
            print "// not in input file:   " code "  " lang[code]

    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (key in filtered)
        print "// filtered:   " key "  " filtered[key]
}