1*ddc94e00SAriel Constenla-Haile--- misc/mythes-1.2.0.orig/th_gen_idx.pl	2010-02-27 12:52:58.000000000 -0300
2*ddc94e00SAriel Constenla-Haile+++ misc/build/mythes-1.2.0/th_gen_idx.pl	2012-01-12 04:13:15.149371123 -0300
3*ddc94e00SAriel Constenla-Haile@@ -1,11 +1,26 @@
4cdf0e10cSrcweir-#!/usr/bin/perl
5cdf0e10cSrcweir-
6cdf0e10cSrcweir-# perl program to take a thesaurus structured text data file
7cdf0e10cSrcweir-# and create the proper sorted index file (.idx)
8cdf0e10cSrcweir+:
9cdf0e10cSrcweir+eval 'exec perl -wS $0 ${1+"$@"}'
10cdf0e10cSrcweir+    if 0;
11e76eebc6SAndrew Rist+#**************************************************************
12*ddc94e00SAriel Constenla-Haile+#
13e76eebc6SAndrew Rist+#  Licensed to the Apache Software Foundation (ASF) under one
14e76eebc6SAndrew Rist+#  or more contributor license agreements.  See the NOTICE file
15e76eebc6SAndrew Rist+#  distributed with this work for additional information
16e76eebc6SAndrew Rist+#  regarding copyright ownership.  The ASF licenses this file
17e76eebc6SAndrew Rist+#  to you under the Apache License, Version 2.0 (the
18e76eebc6SAndrew Rist+#  "License"); you may not use this file except in compliance
19e76eebc6SAndrew Rist+#  with the License.  You may obtain a copy of the License at
20*ddc94e00SAriel Constenla-Haile+#
21e76eebc6SAndrew Rist+#    http://www.apache.org/licenses/LICENSE-2.0
22*ddc94e00SAriel Constenla-Haile #
23*ddc94e00SAriel Constenla-Haile-# typcially invoked as follows:
24*ddc94e00SAriel Constenla-Haile-# cat th_en_US_new.dat | ./th_gen_idx.pl > th_en_US_new.idx
25e76eebc6SAndrew Rist+#  Unless required by applicable law or agreed to in writing,
26e76eebc6SAndrew Rist+#  software distributed under the License is distributed on an
27e76eebc6SAndrew Rist+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
28e76eebc6SAndrew Rist+#  KIND, either express or implied.  See the License for the
29e76eebc6SAndrew Rist+#  specific language governing permissions and limitations
30e76eebc6SAndrew Rist+#  under the License.
31*ddc94e00SAriel Constenla-Haile #
32e76eebc6SAndrew Rist+#**************************************************************
33cdf0e10cSrcweir
34cdf0e10cSrcweir sub by_entry {
35cdf0e10cSrcweir     my ($aent, $aoff) = split('\|',$a);
36*ddc94e00SAriel Constenla-Haile@@ -13,6 +28,27 @@ sub by_entry {
37cdf0e10cSrcweir     $aent cmp $bent;
38cdf0e10cSrcweir }
39cdf0e10cSrcweir
40cdf0e10cSrcweir+#FIXME: input is only read from STDIN; someone may want an "infile" option or even proper parameter parsing
41cdf0e10cSrcweir+sub get_outfile {
42cdf0e10cSrcweir+	my $next_is_file = 0;
43cdf0e10cSrcweir+	foreach ( @ARGV ) {
44cdf0e10cSrcweir+		if ( $next_is_file ) {
45cdf0e10cSrcweir+			return $_
46cdf0e10cSrcweir+		}
47cdf0e10cSrcweir+		if ( $_ eq "-o" ) {
48cdf0e10cSrcweir+			$next_is_file = 1;
49cdf0e10cSrcweir+		}
50cdf0e10cSrcweir+	}
51cdf0e10cSrcweir+	return "";
52cdf0e10cSrcweir+}
53cdf0e10cSrcweir+
54cdf0e10cSrcweir+sub usage {
55cdf0e10cSrcweir+	print "usage:\n";
56cdf0e10cSrcweir+	print "$0 -o outfile < input\n";
57cdf0e10cSrcweir+
58cdf0e10cSrcweir+	exit 99;
59cdf0e10cSrcweir+}
60cdf0e10cSrcweir+
61cdf0e10cSrcweir # main routine
62cdf0e10cSrcweir my $ne = 0;       # number of entries in index
63cdf0e10cSrcweir my @tindex=();    # the index itself
64*ddc94e00SAriel Constenla-Haile@@ -24,6 +60,10 @@ my $nm=0;         # number of meaning fo
65cdf0e10cSrcweir my $meaning="";   # current meaning and synonyms
66cdf0e10cSrcweir my $p;            # misc uses
67cdf0e10cSrcweir my $encoding;     # encoding used by text file
68cdf0e10cSrcweir+my $outfile = "";
69cdf0e10cSrcweir+
70cdf0e10cSrcweir+$outfile = get_outfile();
71cdf0e10cSrcweir+usage() if ( $outfile eq "" );
72cdf0e10cSrcweir
73cdf0e10cSrcweir # top line of thesaurus provides encoding
74cdf0e10cSrcweir $encoding=<STDIN>;
75*ddc94e00SAriel Constenla-Haile@@ -51,9 +91,13 @@ while ($rec=<STDIN>){
76cdf0e10cSrcweir # now we have all of the information
77cdf0e10cSrcweir # so sort it and then output the encoding, count and index data
78cdf0e10cSrcweir @tindex = sort by_entry @tindex;
79cdf0e10cSrcweir-print STDOUT "$encoding\n";
80cdf0e10cSrcweir-print STDOUT "$ne\n";
81cdf0e10cSrcweir+
82cdf0e10cSrcweir+print "$outfile\n";
83cdf0e10cSrcweir+open OUTFILE, ">$outfile" or die "ERROR: Can't open $outfile for writing!";
84cdf0e10cSrcweir+print OUTFILE "$encoding\n";
85cdf0e10cSrcweir+print OUTFILE "$ne\n";
86cdf0e10cSrcweir foreach $one (@tindex) {
87cdf0e10cSrcweir-    print STDOUT "$one\n";
88cdf0e10cSrcweir+    print OUTFILE "$one\n";
89cdf0e10cSrcweir }
90cdf0e10cSrcweir+close OUTFILE;
91cdf0e10cSrcweir
92