#!/usr/local/bin/perl
 
#  #####  print_hit_bold.pl ######
# expects a URL of the form:
# http://your.www.server/cgibindir/print_hit_bold.pl/path/to/file?terms+to+hilite#first_hit
# where the "path/to/file" is relative to your document root directory

# NOTE: This routine uses the Environment Variable 'PATH_TRANSLATED' to
# indicate the fully qualified path to the file that the user wishes to see
# highlighted. If there are no terms following a '?' in the url, then the
# 'Location:' directive will be used to return the original file.
# The only files that can be accessed thru this script are those which can be
# reached thru the Docs tree of your server.
###################
# VERY IMPORTANT
###################
# This routine READS the file directly -- it does not go thru the Web server
# to obtain the file. Thus if you have directories in your main Web docs
# tree that are restricted by ip/domain address or user authentication, then
# either don't install this script, or modify it to not allow it to access
# any files in such directories.
# EXAMPLE if you want to exclude directories/files -- Uncomment the following
# line and set its value appropriately
# $DO_NOT_ACCESS = 'directory1/|directory2/|directory3/';

# print_hit_bold.pl -- Display documents selected by a WAIS search interface.
#     This attempts to "BOLD" every string which matches one of the search
#     words. It skips over anything within <>'s so that it hopefully doesn't
#     screw up any html stuff. It also skips over "non-formatted" sections
#     (LISTING, XMP, PLAINTEXT) and over the TITLE section.
#     It presents text files within <PRE> ... </PRE> tags.
# Michael A. Grady, UIUC/CCSO June 1994 (m-grady@uiuc.edu)
# <A HREF="http://ljordal.cso.uiuc.edu/">Mike Grady</A>
# modified on 7/12/94 to fix a bug with the "first_hit" anchor when the first
# term that gets bolded is the "text" for a link. Small change on 7/20/94 to
# exempt TITLE field from being bolded. Change on 7/21/94 to have file read
# work in line mode rather than paragraph mode -- paragraph mode appears to
# be "dangerous" with lots of the WWW type files (few blank lines in
# documents). Change on 8/15/94 to add BASE tag to front so relative URLs from
# this document DO NOT go back thru print_hit_bold.
# Change on 01/18/95 to fix occasional problem with placing of "first_hit"
# anchor. Added a "buffer" (variable $buffer_it) to hold the anchor/link text
# until we are sure we don't need to place the 'name="first_hit"' into it.
# 09/19/95 -- Added some security comments to the top of the script, and some
# code just in case a given WWW server allows a '..' in the additional
# PATH_INFO part of a url. Also put in place code that can be used to restrict
# this script from reading files in a particular directory.
#
# 06/6/96 -- Modified by William F. Maton:  Included the option to have this
# script retrieve the document via the server, instead of doing a direct open
# through the filesystem, which could get nasty.  Uses a bit o' magic from
# Jeffrey Friedl's www.pl script.
# 06/7/96 -- Modified by William F. Maton:  Added <BLINK></BLINK> tags around
# hits for greater visibility
#

require 'network.pl';
require 'www.pl';

## Fetch via server?  Set to 0 if not.
$UseServer = 1;

## If your document is behind an HTTP authenticated system, supply a login
## and password.
$login = '';
$password = '';

# Specify the url for this WWW server -- same as in "kidofwais.pl" script
#  (this is the same string you added when you waisindexed (using -t url))
$serverURL = "http://your.host.com/~user/some/place/";

# For DEBUG purposes, specify the fully-qualified name of a file in the
# logging directory for your server. Then if you turn on the $DEBUG flag,
# all the lines that this script sends to the HTTPD server will instead be
# sent into this logfile, except for the beginning "Content-type" lines and
# the pointer to the original document before it has been mucked with by this
# script. The match structure built by "prepare_matching_program()" is also
# written into the log (actually will precede the "real file").
$debugLOG = "/var/info/www/httpd/logs/cgi_log";
$DEBUG = 0;	# set to 1 to turn on debugging code

# Specify the "first hit anchor" to be used with hiliting,
# should match up with anchor value in "kidofwais.pl". See &hilite.
# Also need to add </A> to end of hit to end anchor.
$anchor = ' name="first_hit"';

# who maintains this service?
$maintainer = '<A HREF="mailto:wmaton@miredespa.com">William F. Maton</A>';

# you shouldn't have to edit anything below this line

sub send_file {
    print "Location: /$partial_name\n\n";
}

sub prepare_matching_program {
    # prepare the query word matching patterns (will use eval on this to
    # hopefully make it faster). Creates "$match_terms" and initializes
    # "$looking_for_first" to "on" (=1). This sets up the terms that we will
    # search and substitute for, adding BOLD tags around. The bold tags we
    # add are <B > ... </B >. <B > was chosen (with cap and space between B
    # and >) so as to be unlikely to be the same as any of the original BOLD
    # tags in the document. This enables us, on the "first match", to find
    # the position in the string of the "first match" and add the anchor tag
    # in front of that (so the document is positioned to the first hit when
    # presented to the user).
    local($searchterm);
    $looking_for_first = 1;
    $match_terms = "{study;\n";
    foreach $searchterm (@query) {
	# remove parens, quotes, words: and or not
	$searchterm =~ tr/\(\)\'\"//d;

	$searchterm =~ /^and$/i && next;
	$searchterm =~ /^or$/i  && next;
	$searchterm =~ /^not$/i && next;
	$searchterm =~ /^\s*$/  && next;

	$searchterm =~ /^.+\*$/ && $searchterm =~ tr/*//d &&
	  ($match_terms .=
	    "\$matched = 1 if s!\\b($searchterm)!<B ><BLINK>\$1</BLINK></B >!ig;\n")
			&& next;
	#default case
	$searchterm =~ tr/*//d;
	$match_terms .=
	  "\$matched = 1 if s!\\b($searchterm)\\b!<B ><BLINK>\$1</BLINK></B >!ig;\n";
    } #end foreach searchterm
    $match_terms .= "}";
    $DEBUG && print LOG "$match_terms\n";
    return;
}

sub hilite {
	# Search for the query words and place BOLD tags around each if found.
	# If the "first hit" in this document, put in the "first_hit" anchor.
	# Also need to add </A> to end of hit to end anchor.
	local($_) = @_;		# text to hilite
	local($position,$end_position);
	$matched = 0;
	eval ($match_terms);	# do the matching/replacing
	if ($looking_for_first && $matched) { # something matched for the 1st
		$looking_for_first = 0;	      # time, try to place an anchor
		$position = index( $_, "<B ><BLINK>" );
				# put anchor text into string if above found
		if ($position >= 0) {
		    # If text that has been bolded first is already within a
		    # <a> .., </a> construct, then we can't put another such
		    # construct around it. So instead stick the "name=" string
		    # into the <a ...> construct that is already there. If it
		    # already has a name= clause, not sure which will be
		    # recognized.
		    if ($buffer_it && $in_anchor) {
			$position = index( $buffer_it, ">" );
			substr($buffer_it, $position, 0) = $anchor;
		    } elsif (! $in_anchor) {
		    	substr($_, $position, 0) = "<A $anchor>";
		    	$end_position = index( $_, "</BLINK></B >", $position);
		    	($end_position >= 0) &&
				(substr($_, $end_position + 13, 0) = "</A>");
		    }
		}
	}
	&print_it ( $_ );
	return;
}

sub print_it {	# This routine does the actual "printing" of the document
	# If we are in an anchor string, and have not yet found the
	# "first_hit", then save the text to be printed in a buffer. This will
	# allow us to put the 'name="first_hit"' into the anchor/link
	# construct itself if the first hit happens to be in the text that the
	# anchor/link is "around".

	local($to_print) = @_;
	if ($in_anchor && $looking_for_first) {
		$buffer_it .= $to_print;
	} else {
		print $buffer_it if $buffer_it;
		print $to_print;
		$buffer_it = '';
	}
}

sub process_file {

    # if 'PATH_INFO'/'PATH_TRANSLATED' has a non-null value, then 
    # 'PATH_TRANSLATED' should be the fully qualified path to the file to
    # to be hilited. If these environment variables are null, then someone
    # has constructed an incorrect url
    $partial_name = $ENV{'PATH_INFO'};
    $full_name    = $ENV{'PATH_TRANSLATED'};

    # The servers I'm familiar with don't allow a '..' in the 'PATH_INFO',
    # but just in case some server does, check for it.
    # Also make sure the file requested is NOT in a restricted directory.
    if (($partial_name eq "") || ($partial_name =~ /\.\./) ||
	($DO_NOT_ACCESS && ($partial_name =~ m"$DO_NOT_ACCESS"))) {
    	print "Content-type: text/html\n\n";
	print "<HEAD><TITLE>Incorrect url specified</TITLE></HEAD>\n";
	print "<BODY><H2>Incorrect url specified</H2>\n";
	print "<p>";
	print "This script has been referenced by an incorrect url. Please ";
	print "contact $maintainer if you have any questions.<p>\n";
	print "<A HREF=\"$serverURL\">Main page for this server.</A></BODY>\n";
	return;
    }

    $partial_name = substr($partial_name,1); # remove beginning slash
    TYPE: for ($partial_name) {
	/.html$/	&& do { $type = 'html'; $ok = 1; last TYPE };
	/.txt$/		&& do { $type = 'text'; $ok = 1; last TYPE };
	/.TEXT$/	&& do { $type = 'text'; $ok = 1; last TYPE };
	$ok = 0;
    }

    do { &send_file; return; } unless defined @ARGV; # nothing to hilite
	# probably can't try to hilite without messing up file unless html/txt
    do { &send_file; return; } unless $ok;

    local(@query) = @ARGV;
    local($pquery) = join(" ", @query);
    # NCSA's HTTPD puts backslashes in front of "funny" or "dangerous"
    # characters in the input supplied thru argv. In the case of search terms
    # for WAIS, this can screw up the search (parens and "*" get backslashed
    # and then don't work correctly). So remove the backslashes, AND the
    # potentially "dangerous" characters ( ; ` ! ).
    $pquery =~ tr/!\;\`\\//d;           # just in case, get rid of ;`! and \
    @query = split(' ',$pquery);        # and recreate query word array

    # DEBUG code - write stuff into file
    $DEBUG && do { open (LOG, ">>$debugLOG") || die "can't open log";};

    &prepare_matching_program;  # create $match_terms string

    # just send file if there are no words left to hilite
    do { &send_file; return; } if ($match_terms eq "{study;\n}");

    # start the html document to "return"
    print "Content-type: text/html\n\n";
    print "<BASE HREF=\"$serverURL$partial_name\">\n";
    print "<A HREF=\"$serverURL$partial_name\">Get the original (un<b>BOLD</b>ed) document.</A><p>\n";

    $DEBUG && select (LOG); # write rest of stuff to file for DEBUG

    # try and open file for reading
    if ($UseServer == 1) {
	## Build the user/password if they exist and pass them.
	if (defined $password || defined $login) {
    	    local($auth) = join(':', ($login || ''), ($password || ''));
	    push(@options, "authorization=$auth");
	}
	push(@options, 'quiet');

	## Use a function from Jeffrey Friedl's bag o' tricks!
	## (http://www.wg.omron.co.jp/~jfriedl/perl/index.html)
	($status, $memo) = &www'open_http_url(*FP, $serverURL.$partial_name, @options);
	die "$memo\n" if $status ne 'ok';
    } else {
	open (FP, "$full_name") || print "File $full_name can't be read. Please contact $maintainer." && return;
    }

    if ($type eq 'text') {	# Start document for "text"
    	print "<HEAD><TITLE>File: $partial_name</TITLE></HEAD>\n";
	print "<BODY BGCOLOR=\"#FFFFFF\"><p>\n<PRE>";
    }

    local($in_tag,$skip_till_endtag,$endtag,$line_left);
    $in_tag = 0;
    $skip_till_endtag = $in_anchor = 0;
    $endtag = $buffer_it = '';
    READ_LINE: while (<FP>) {
	$line_left = $_;
	PROCESS_LINE: for ($line_left) {

		($line_left eq "") && next READ_LINE;	# nothing left of this
							# paragraph
		$skip_till_endtag && $line_left =~ /$endtag/i && do {
			   $line_left = $';
			   &print_it ( "$`$endtag" );
			   $skip_till_endtag = 0;
			   $endtag = "";
			   redo PROCESS_LINE;
			};
		$skip_till_endtag && do {  # endtag must be in next paragraph
			   &print_it ( $line_left );
			   next READ_LINE;
			};
		! $in_tag && $line_left =~ /</  && do {
			   $line_left = $';
			   &hilite($`) if ($` ne "");
			   &print_it ( "<" );
			   if ($line_left =~
				 /^(PLAINTEXT|XMP|LISTING|TITLE)/i) {
				$line_left = $';
				$endtag = "</$1";
				&print_it ( $1 );
				$skip_till_endtag = 1;
			   } else {
				($line_left =~ /^\/A/i) && ($in_anchor = 0);
				($line_left =~ /^A\s+/i) && ($in_anchor = 1);
				$in_tag = 1;
			   }
			   redo PROCESS_LINE;
			};
		$in_tag   && $line_left =~ />/  && do {
			   $line_left = $';
			   &print_it ( "$`>" );
			   $in_tag = 0;
			   redo PROCESS_LINE;
			};
		$in_tag   && do {	# ending ">" must be in next paragraph
			   &print_it ( $line_left );
			   next READ_LINE;
			};
		&hilite($line_left);  # default case: hilite rest of paragraph
		next READ_LINE;
	} # end line_left foreach
    }
    print $buffer_it if $buffer_it; # print out "leftover stuff" if any
    if ($UseServer == 0) {
    	close(FP);
    }

    print "\n</PRE>\n</BODY>\n" if ($type eq 'text'); # end document for "text"
    return;
}

open (STDERR,"> /dev/null");
eval '&process_file';

