#!/usr/bin/awk -f
# *************************************************************
#
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.
#
# *************************************************************
#
# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
# Run in i18npool/source/isolang
#
# outputs new #define LANGUAGE_... 0x... and also some commented out substrings
# that were matched in already existing defines.
#
# ATTENTION! The sed filter in the command line examples below assures that a
# '|' border is drawn by html2text in data tables, and nowhere else, on which
# this awk script relies. This script also heavily relies on the column layout
# encountered. Should MS decide to change their layout or their CSS names
# ("data..."), this would probably break. Should html2text decide that the last
# border="..." attribute encountered wins instead of the first, this may break
# also.
#
# sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
#
# After html2text best if file cleaned up to _only_ contain the table entries,
# but not necessary, entries are filtered. Check output.
#
# Expects input from the saved page of one of
#
# (1)
# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,blank fields:
# |Afrikaans_-_South_Africa___|0436___|1078___|
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (2)
# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,inputlocales,collection,blank fields:
# |Afrikaans |0436 |1078 |0436:00000409, |Basic |
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (3)
# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
# filtered through ``html2text -nobs ...'', generated table:
# blank,hex,locale,name,blank fields:
# |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
#
# complete command line:
# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
#

# Load the existing LANGUAGE_ definitions and set up the per-file-type
# column layout tables.
BEGIN {
    # lang.h is read while FS still has its default value, so $2 is the
    # macro name and $3 the 0x... value of a #define line.
    langfile = "../../inc/i18npool/lang.h"
    while ((status = (getline < langfile)) > 0)
    {
        if ($0 ~ /^#define[ ]*LANGUAGE_[_A-Za-z0-9]*[ ]*0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME, HEX without the leading "0x", both upper case
            lang[toupper(substr($3,3))] = toupper($2)
            #print substr($3,3) "=" $2
        }
    }
    if (status < 0)
    {
        # getline yields -1 if the file cannot be read. Without the
        # definitions every input entry ends up being reported as new, so
        # at least say why.
        print "Cannot read " langfile ", every entry will be reported as new." >>"/dev/stderr"
    }
    else
        close(langfile)

    # html2text table follows; a single '|' is taken literally as separator
    FS = "|"

    # Input file types, 0 meaning not yet recognized.
    filetype = 0
    lcid_all = 1
    xp_lcid  = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid]  = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"

    # Field numbers of the name, hex and locale columns per file type;
    # a locale field of 0 means the table has no such column.
    namefield[lcid_all] = 2
    namefield[xp_lcid]  = 2
    namefield[nls_238z] = 4
    hexfield[lcid_all]  = 3
    hexfield[xp_lcid]   = 3
    hexfield[nls_238z]  = 2
    locfield[lcid_all]  = 0
    locfield[xp_lcid]   = 0
    locfield[nls_238z]  = 3
}

# Anything with less than 5 fields can not be a table row of interest.
(NF < 5) { next }

# Detect the file type from the first row that looks like one of the known
# layouts; rows are skipped until a type is recognized.
!filetype {
    if (NF == 5)
    {
        if ($2 ~ /^0x/)
            filetype = nls_238z
        else if ($2 ~ /^Afrikaans/)
            filetype = lcid_all
    }
    else if (NF == 7)
        filetype = xp_lcid
    if (!filetype)
        next
    name = namefield[filetype]
    hex = hexfield[filetype]
    loc = locfield[filetype]
}

# Clean up the fields of interest: drop everything up to and including a
# colon and everything from the first dot on in the name, then strip leading
# and trailing blanks and underscores.
{
    gsub( /^[^:]*:/, "", $name)
    gsub( /\..*/, "", $name)
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}

($hex ~ /^0x/) { $hex = substr( $hex, 3) }

# if only 464 instead of 0464, make it match lang.h
# Only a field that already consists of hex digits is padded, and always up
# to 4 digits, so that an empty cell or a heading does not get turned into
# something that passes for a valid code.
($hex ~ /^[0-9a-fA-F][0-9a-fA-F]*$/) {
    while (length($hex) < 4)
        $hex = "0" $hex
}

# Rows without a hex code are remembered for the report and otherwise ignored.
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

# all[HEX]=string
{ all[toupper($hex)] = $name }

# The locale, if the table has one, is appended as a comment in the output.
(loc) { comment[toupper($hex)] = " /* " $loc " */" }

# new hex: newlang[HEX]=string
!(toupper($hex) in lang) { newlang[toupper($hex)] = $name }

# Report: proposed defines for new codes, codes of lang.h missing in the
# input, and the table rows that were filtered out.
END {
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"
    # every new language
    for (x in newlang)
    {
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[x], x, comment[x])
        # Build the identifier from the alphanumeric words of the name.
        n = split(newlang[x],arr,/[^A-Za-z0-9]/)
        def = ""
        for (i=1; i<=n; ++i)
        {
            if (length(arr[i]))
            {
                # each identifier word of the language name
                if (def)
                    def = def "_"
                aup = toupper(arr[i])
                def = def aup
                for (h in lang)
                {
                    # contained in already existing definitions?
                    if (lang[h] ~ aup)
                        printf( "// %-50s %s\n", arr[i] ": " lang[h], h)
                }
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", def, x)
    }
    print "\n// --- reverse check follows ----------------------------------\n"
    for (x in lang)
    {
        if (!(x in all))
            print "// not in input file: " x " " lang[x]
    }
    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (x in filtered)
        print "// filtered: " x " " filtered[x]
}