The following script was used to index the University of Melbourne Undergraduate Handbook using freeWAIS 0.3.
#!/usr/local/bin/perl
#
# Perl script to index the contents of a www tree. This is derived from a csh
# script that Kevin Hughes of EIT constructed for indexing files.
#
#### Customised for Handbook 95 by davm/ss 29-Nov-94
#

use strict;
use warnings;

my $rootdir = "/home/servers/cern/unimelb/HB";   #### mod davm
# This is the root directory of the Web tree you want to index

my $index = "/home/servers/wais/db/HB";          #### mod davm
# This is the name and location of the index to be created

my $indexprog = "/usr/local/bin/waisindex";
# The full pathname of the waisindex program

my $url = "http://www.unimelb.edu.au/HB";        ###### ss: added /HB
# The main URL for your Web. No slash at the end!

my $du_prog = "/usr/bin/du";
# The full pathname of the "du" program, in case you want to run this from a
# cronjob and it is not in the default path. Note that "du" will not follow
# symbolic links out of this "tree".

my $numdir = 0;   # number of directories we tried to index
my $num    = 0;   # number of times waisindex has been called

# Generate a list of directory names, then for each directory, generate an
# array of all the filenames in that directory except for . and .. . Sort this
# list so that if there is an .htaccess file in that directory, it comes near
# the front of the list. We assume that if you've bothered to put special
# access controls into a directory, then maybe you don't want these files
# indexed in a general index. You of course can remove this restriction if you
# want. Then we separate all the files in the directory into two lists: one
# list is those filetypes for which it is appropriate to index the contents of
# the files, and the second list are those whose filetypes are such we don't
# want to index the contents, just the filename (gif, for instance). Then
# if there are any files in either of these lists, we call waisindex to index
# them. The first time we index, we do not include the -a flag, so that the
# index replaces the current one. Every subsequent call to waisindex includes
# the -a flag so that we then add to the new index we are building. We include
# the -nopairs option on all waisindex calls, because this saves a lot of
# unused info from being put into the index.

# If this is run by cron, redirect print statements to file (or /dev/null).
# Probably want to add a "-l 0" option to the waisindex call also.
#open (LOGIT, ">>/tmp/waisindex.run");
#select LOGIT;

# Files whose mere presence causes the WHOLE directory to be skipped.
# Note this is NOT smart enough to be recursive (if a directory below this
# does not itself contain one of these files, it WILL be indexed).
# NOTE(review): for index.html, help.html, etc. the intent may have been to
# skip only that one file rather than the whole directory -- confirm. The
# original behaviour (skip the directory) is preserved here.
my %skips_directory = map { $_ => 1 } (
    '.htaccess',
    'index.html',
    'help.html',
    'helpsearch.html',
    'GeneralInfo.html',
);

# Call waisindex once for a list of files.
#   $files       - reference to the array of full pathnames to index
#   @extra_flags - flags placed after the "-t URL ..." option (for example
#                  "-nocontents", which must follow any -T or -t option)
# Returns the value of system(); a warning is printed if the call failed.
sub run_waisindex {
    my ($files, @extra_flags) = @_;

    # The very first call replaces the existing index; later calls append.
    my @waisflags = $num == 0 ? ("-nopairs") : ("-a", "-nopairs");
    $num++;

    # List form of system() bypasses the shell, so odd filenames are safe.
    my $status = system($indexprog, "-d", $index, @waisflags,
                        "-t", "URL", $rootdir, $url, @extra_flags, @$files);
    if ($status == -1) {
        warn "Can't run $indexprog: $!\n";
    }
    elsif ($status != 0) {
        warn "$indexprog exited with wait status $status\n";
    }
    return $status;
}

# Ask "du" for every directory under $rootdir. Each output line is
# "<size><TAB><pathname>"; we keep only the pathname. The list form of the
# pipe open runs du directly, without a shell.
open(my $du_fh, '-|', $du_prog, $rootdir)
    or die "Can't run $du_prog on $rootdir: $!\n";
my @pathnames;
while (my $line = <$du_fh>) {
    chomp $line;
    my ($size, $path) = split /\t/, $line, 2;
    next unless defined $path && length $path;
    push @pathnames, $path;
}
# du exits non-zero if it could not read some directory; report it but carry
# on with whatever it did list.
close($du_fh)
    or warn "$du_prog reported a problem (wait status $?)\n";

# du lists subdirectories before their parents; reverse the list so that
# $rootdir comes first (this replaces the old "tail -r" stage).
@pathnames = reverse @pathnames;

DO_PATH:
foreach my $pathname (@pathnames) {

    # The following are "path patterns" that we don't want to
    # follow (subdirectories whose files we do not want to index).
    # Add or subtract from this list as appropriate. These may
    # be directories you don't want to index at all, or directories
    # for which you want to build their own separate index.
    next DO_PATH if $pathname =~ /uiucnet/i;
    #next DO_PATH if $pathname =~ /demopict/i;
    next DO_PATH if $pathname =~ /images/i;
    next DO_PATH if $pathname =~ /testdir/i;
    next DO_PATH if $pathname =~ /00ADMIN/i;

    print "Current pathname is: $pathname\n";
    $numdir++;

    my @contents   = ();   # files whose contents get indexed
    my @nocontents = ();   # files for which only the filename gets indexed

    opendir(my $dir_handle, $pathname)
        or die "Can't open directory $pathname: $!\n";
    my @allfiles = sort grep { !/^\.\.?$/ } readdir($dir_handle);
    closedir($dir_handle);

    DO_FILE:
    foreach my $file (@allfiles) {

        # skip directories that contain a .htaccess file (or any of the
        # other marker files listed in %skips_directory above)
        next DO_PATH if $skips_directory{$file};
        next DO_PATH if $file =~ m/swish/;
        next DO_PATH if $file =~ m/00ADMIN/;

        # filetypes for which we want to index contents
        if ($file =~ /\.html?$/i) { push(@contents, "$pathname/$file"); next DO_FILE; }
        if ($file =~ /\.te?xt$/i) { push(@contents, "$pathname/$file"); next DO_FILE; }
        if ($file =~ /\.pdf$/i)   { push(@contents, "$pathname/$file"); next DO_FILE; }
        #if ($file =~ /\.ps$/i)   { push(@contents, "$pathname/$file"); next DO_FILE; }

        # filetypes for which we DON'T want to index contents
        if ($file =~ /\.gif$/i)   { push(@nocontents, "$pathname/$file"); next DO_FILE; }
        #if ($file =~ /\.au$/i)   { push(@nocontents, "$pathname/$file"); next DO_FILE; }
        #if ($file =~ /\.mpg$/i)  { push(@nocontents, "$pathname/$file"); next DO_FILE; }
        #if ($file =~ /\.hqx$/i)  { push(@nocontents, "$pathname/$file"); next DO_FILE; }

        # Comment out the above lines to your liking, depending on what
        # filetypes you are actually interested in indexing.
        # For instance, if the ".mpg" line is commented out, then
        # MPEG files will *not* be indexed into the database (and thus
        # won't be searchable by others).

    } # end DO_FILE loop

    # Index if any files in list.
    run_waisindex(\@contents) if @contents;

    # Index if any files in list.
    # note that "-nocontents" flag must follow any -T or -t option
    run_waisindex(\@nocontents, "-nocontents") if @nocontents;

} # end DO_PATH loop

print "Waisindex called $num times.\n";
print "Tried indexing $numdir directories.\n";

# end of script