1*ddc94e00SAriel Constenla-Haile--- misc/mythes-1.2.0.orig/th_gen_idx.pl 2010-02-27 12:52:58.000000000 -0300 2*ddc94e00SAriel Constenla-Haile+++ misc/build/mythes-1.2.0/th_gen_idx.pl 2012-01-12 04:13:15.149371123 -0300 3*ddc94e00SAriel Constenla-Haile@@ -1,11 +1,26 @@ 4cdf0e10cSrcweir-#!/usr/bin/perl 5cdf0e10cSrcweir- 6cdf0e10cSrcweir-# perl program to take a thesaurus structured text data file 7cdf0e10cSrcweir-# and create the proper sorted index file (.idx) 8cdf0e10cSrcweir+: 9cdf0e10cSrcweir+eval 'exec perl -wS $0 ${1+"$@"}' 10cdf0e10cSrcweir+ if 0; 11e76eebc6SAndrew Rist+#************************************************************** 12*ddc94e00SAriel Constenla-Haile+# 13e76eebc6SAndrew Rist+# Licensed to the Apache Software Foundation (ASF) under one 14e76eebc6SAndrew Rist+# or more contributor license agreements. See the NOTICE file 15e76eebc6SAndrew Rist+# distributed with this work for additional information 16e76eebc6SAndrew Rist+# regarding copyright ownership. The ASF licenses this file 17e76eebc6SAndrew Rist+# to you under the Apache License, Version 2.0 (the 18e76eebc6SAndrew Rist+# "License"); you may not use this file except in compliance 19e76eebc6SAndrew Rist+# with the License. You may obtain a copy of the License at 20*ddc94e00SAriel Constenla-Haile+# 21e76eebc6SAndrew Rist+# http://www.apache.org/licenses/LICENSE-2.0 22*ddc94e00SAriel Constenla-Haile # 23*ddc94e00SAriel Constenla-Haile-# typcially invoked as follows: 24*ddc94e00SAriel Constenla-Haile-# cat th_en_US_new.dat | ./th_gen_idx.pl > th_en_US_new.idx 25e76eebc6SAndrew Rist+# Unless required by applicable law or agreed to in writing, 26e76eebc6SAndrew Rist+# software distributed under the License is distributed on an 27e76eebc6SAndrew Rist+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 28e76eebc6SAndrew Rist+# KIND, either express or implied. See the License for the 29e76eebc6SAndrew Rist+# specific language governing permissions and limitations 30e76eebc6SAndrew Rist+# under the License. 31*ddc94e00SAriel Constenla-Haile # 32e76eebc6SAndrew Rist+#************************************************************** 33cdf0e10cSrcweir 34cdf0e10cSrcweir sub by_entry { 35cdf0e10cSrcweir my ($aent, $aoff) = split('\|',$a); 36*ddc94e00SAriel Constenla-Haile@@ -13,6 +28,27 @@ sub by_entry { 37cdf0e10cSrcweir $aent cmp $bent; 38cdf0e10cSrcweir } 39cdf0e10cSrcweir 40cdf0e10cSrcweir+#FIXME: someone may want "infile" or even parameter parsing 41cdf0e10cSrcweir+sub get_outfile { 42cdf0e10cSrcweir+ my $next_is_file = 0; 43cdf0e10cSrcweir+ foreach ( @ARGV ) { 44cdf0e10cSrcweir+ if ( $next_is_file ) { 45cdf0e10cSrcweir+ return $_ 46cdf0e10cSrcweir+ } 47cdf0e10cSrcweir+ if ( $_ eq "-o" ) { 48cdf0e10cSrcweir+ $next_is_file = 1; 49cdf0e10cSrcweir+ } 50cdf0e10cSrcweir+ } 51cdf0e10cSrcweir+ return ""; 52cdf0e10cSrcweir+} 53cdf0e10cSrcweir+ 54cdf0e10cSrcweir+sub usage { 55cdf0e10cSrcweir+ print "usage:\n"; 56cdf0e10cSrcweir+ print "$0 -o outfile < input\n"; 57cdf0e10cSrcweir+ 58cdf0e10cSrcweir+ exit 99; 59cdf0e10cSrcweir+} 60cdf0e10cSrcweir+ 61cdf0e10cSrcweir # main routine 62cdf0e10cSrcweir my $ne = 0; # number of entries in index 63cdf0e10cSrcweir my @tindex=(); # the index itself 64*ddc94e00SAriel Constenla-Haile@@ -24,6 +60,10 @@ my $nm=0; # number of meaning fo 65cdf0e10cSrcweir my $meaning=""; # current meaning and synonyms 66cdf0e10cSrcweir my $p; # misc uses 67cdf0e10cSrcweir my $encoding; # encoding used by text file 68cdf0e10cSrcweir+my $outfile = ""; 69cdf0e10cSrcweir+ 70cdf0e10cSrcweir+$outfile = get_outfile(); 71cdf0e10cSrcweir+usage() if ( $outfile eq "" ); 72cdf0e10cSrcweir 73cdf0e10cSrcweir # top line of thesaurus provides encoding 74cdf0e10cSrcweir $encoding=<STDIN>; 75*ddc94e00SAriel Constenla-Haile@@ -51,9 +91,13 @@ while ($rec=<STDIN>){ 76cdf0e10cSrcweir # now we have all of the information 77cdf0e10cSrcweir # so sort it and then output the encoding, count and index data 78cdf0e10cSrcweir @tindex = sort by_entry @tindex; 79cdf0e10cSrcweir-print STDOUT "$encoding\n"; 80cdf0e10cSrcweir-print STDOUT "$ne\n"; 81cdf0e10cSrcweir+ 82cdf0e10cSrcweir+print "$outfile\n"; 83cdf0e10cSrcweir+open OUTFILE, ">$outfile" or die "ERROR: Can't open $outfile for writing!"; 84cdf0e10cSrcweir+print OUTFILE "$encoding\n"; 85cdf0e10cSrcweir+print OUTFILE "$ne\n"; 86cdf0e10cSrcweir foreach $one (@tindex) { 87cdf0e10cSrcweir- print STDOUT "$one\n"; 88cdf0e10cSrcweir+ print OUTFILE "$one\n"; 89cdf0e10cSrcweir } 90cdf0e10cSrcweir+close OUTFILE; 91cdf0e10cSrcweir 92