Sample WAIS indexing script

The following Perl script was used to index the University of Melbourne Undergraduate Handbook using freeWAIS 0.3.


#!/usr/local/bin/perl
#
# Perl script to index the contents of a www tree. This is derived from a csh
# script that Kevin Hughes of EIT constructed for indexing files.
#
# #### Customised for Handbook 95 by davm/ss 29-Nov-94
#

# Root directory of the Web tree you want to index.
$rootdir = '/home/servers/cern/unimelb/HB';     #### mod davm

# Name and location of the index to be created.
$index = '/home/servers/wais/db/HB';            #### mod davm

# Full pathname of the waisindex program.
$indexprog = '/usr/local/bin/waisindex';

# Main URL for your Web. No slash at the end!
$url = 'http://www.unimelb.edu.au/HB';          ###### ss: added /HB

# Running totals, reported when the script finishes: $num counts calls to
# waisindex, $numdir counts the directories that were examined.
$num    = 0;
$numdir = 0;

# Generate a list of directory names, then for each directory, generate an
# array of all the filenames in that directory except for . and .. . Sort this
# list so that if there is an .htaccess file in that directory, it comes near
# the front of the list. We assume that if you've bothered to put special
# access controls into a directory, then maybe you don't want these files
# indexed in a general index. You of course can remove this restriction if you
# want. Then we separate all the files in the directory into two lists: the
# first holds the filetypes for which it is appropriate to index the contents
# of the files, and the second holds the filetypes for which we don't want to
# index the contents, just the filename (gif, for instance). Then, if there
# are any files in either of these lists, we call waisindex to index
# them. The first time we index, we do not include the -a flag, so that the
# index replaces the current one. Every subsequent call to waisindex includes
# the -a flag so that we then add to the new index we are building. We include
# the -nopairs option on all waisindex calls, because this saves a lot of
# unused info from being put into the index.

# If this is run by cron, redirect print statements to file (or /dev/null).
# Probably want to add a "-l 0" option to the waisindex call also.
#open (LOGIT, ">>/tmp/waisindex.run");
#select LOGIT;

# Put in the appropriate path on your system to each of the commands
# "du", "cut" and "tail", in case you want to run this from a cronjob and
# these commands are not in the default path. Note that "du" will not follow
# symbolic links out of this "tree".

# Main indexing pass.
#
# The pipeline asks "du" for the directories under $rootdir, keeps only the
# pathname column ("cut -f2") and reverses the order of the lines ("tail -r").
# Every directory that survives the filters below is scanned, and its files
# are handed to waisindex in at most two calls: one for files whose contents
# are indexed (@contents) and one for files indexed by name only (@nocontents).
#
# A successful open only means the pipeline was started; a failing du, cut or
# tail is reported when the handle is closed at the end of the script.
open (PATHNAMES,"/usr/bin/du $rootdir | /usr/bin/cut -f2 | /usr/ucb/tail -r | ")
                || die "Can't start directory listing for $rootdir: $!\n";
DO_PATH: while ( $pathname = <PATHNAMES> ) {
        chomp $pathname;                # strip the newline only, never a path character
        next DO_PATH if $pathname eq '';        # nothing to open on a blank line

        # The following are "path patterns" that we don't want to
        # follow (subdirectories whose files we do not want to index).
        # Add or subtract from this list as appropriate. These may
        # be directories you don't want to index at all, or directories
        # for which you want to build their own separate index.
        next DO_PATH if $pathname =~ /uiucnet/i;
        #next DO_PATH if $pathname =~ /demopict/i;
        next DO_PATH if $pathname =~ /images/i;
        next DO_PATH if $pathname =~ /testdir/i;
        next DO_PATH if $pathname =~ /00ADMIN/i;

        print "Current pathname is: $pathname\n";
        $numdir++;
        @contents = @nocontents = ();
        opendir(CURRENT_DIR, "$pathname")
                        || die "Can't open directory $pathname: $!\n";
        # Everything except "." and "..", sorted so that dot-files such as
        # .htaccess are seen before ordinary names.
        @allfiles = sort (grep(!/^\.\.?$/, readdir(CURRENT_DIR)));
        closedir(CURRENT_DIR);

        DO_FILE: foreach $file (@allfiles) {
                        # skip directories that contain a .htaccess file
                        # note this is NOT smart enough to be recursive (if a
                        # directory below this does not itself contain an
                        # .htaccess file, it WILL be indexed).
                        #
                        # NOTE(review): every test below abandons the whole
                        # directory (next DO_PATH), not just the matching
                        # file. That matches the .htaccess description above;
                        # confirm it is also what is wanted for the
                        # Handbook-specific names (index.html, help.html, ...).
                next DO_PATH if $file eq '.htaccess';
                next DO_PATH if $file eq 'index.html';
                next DO_PATH if $file eq 'help.html';
                next DO_PATH if $file eq 'helpsearch.html';
                next DO_PATH if $file eq 'GeneralInfo.html';
                next DO_PATH if $file =~ m/swish/;
                next DO_PATH if $file =~ m/00ADMIN/;

                        # filetypes for which we want to index contents
                $file =~ /\.html?$/i && 
                   do { push(@contents, "$pathname/$file"); next DO_FILE;};
                $file =~ /\.te?xt$/i &&
                   do { push(@contents, "$pathname/$file"); next DO_FILE;};
                $file =~ /\.pdf$/i &&
                   do { push(@contents, "$pathname/$file"); next DO_FILE;};
                #$file =~ /\.ps$/i &&
                   #do { push(@contents, "$pathname/$file"); next DO_FILE;};

                        # filetypes for which we DON'T want to index contents
                $file =~ /\.gif$/i && 
                   do { push(@nocontents, "$pathname/$file"); next DO_FILE;};
                #$file =~ /\.au$/i &&
                   #do { push(@nocontents, "$pathname/$file"); next DO_FILE;};
                #$file =~ /\.mpg$/i &&
                   #do { push(@nocontents, "$pathname/$file"); next DO_FILE;};
                #$file =~ /\.hqx$/i &&
                   #do { push(@nocontents, "$pathname/$file"); next DO_FILE;};
        # Comment out the above lines to your liking, depending on what
        # filetypes you are actually interested in indexing.
#       For instance, if the ".mpg" line is commented out, then
#       MPEG files will *not* be indexed into the database (and thus
#       won't be searchable by others).
        } # end DO_FILE loop

        # The first waisindex call of the run omits "-a" so that the index is
        # rebuilt from scratch; every later call appends to it. The list form
        # of system() is used, so filenames are never seen by a shell.
        if ($#contents >= 0) {          # Index if any files in list.
                @waisflags = ("-a", "-nopairs");
                @waisflags = ("-nopairs") if $num == 0;
                $num ++;
                system($indexprog, "-d", $index, @waisflags, "-t", "URL",
                                $rootdir, $url, @contents) == 0
                        || warn "waisindex failed for $pathname (status $?)\n";
        }
        if ($#nocontents >= 0) {        # Index if any files in list.
                @waisflags = ("-a", "-nopairs");
                @waisflags = ("-nopairs") if $num == 0;
                $num ++;
                system($indexprog, "-d", $index, @waisflags, "-t", "URL",
                                $rootdir, $url, "-nocontents", @nocontents) == 0
                        || warn "waisindex (-nocontents) failed for $pathname (status $?)\n";
                # note that "-nocontents" flag must follow any -T or -t option
        }
} # end DO_PATH loop

# Closing a pipe handle also collects the pipeline's exit status, so this is
# where a missing or failing du/cut/tail becomes visible.
close(PATHNAMES)
        || warn "Directory listing pipeline did not finish cleanly (status $?)\n";
print "Waisindex called $num times.\n";
print "Tried indexing $numdir directories.\n";
# end of script