#!/usr/bin/perl -w
#######################################################################
#
#                        O N S E A R C H . P L
#
#             Copyright (C) 1998 by Jaroslav Benkovsky, PVT a.s.
#                     e-mail: benkovsk@pha.pvt.cz
#
#                           $Revision: 1.3 $
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
# ONSEARCH - on-line searching of documents on WWW site
#
# Please, do mail me any bug you experience or find, particularly
# if it can compromise website security. Thanks.
#
# RCS: $Header: /home/benkovsk/worx/onsearch/RCS/onsearch,v 1.3 1999/02/18 13:46:53 benkovsk Exp benkovsk $
#
#######################################################################

unshift (@INC, '/usr/internet/ns-home/cgi-lib');
#unshift (@INC, '/home/benkovsk/worx/lib/perl');

use strict;
use CGI;
require 'nls.pl';

my $UserRootDir       = "/www/doc_root/www";
my $PrivRootDir       = "/www/doc_root/pvtnet";
my $UserCfgSubdir     = "config";
my $PrivCfgSubdir     = "config";
my $DefaultConfigFile = "search.cfg";

my $EnablePrivUser    = 1;

my $METHOD            = 'GET'; # method used in built-in search form

my $ContextSize       = 80;    # ctx extends cca 80 chars to both sides
my $ContextMargin     = 20;    # but can be up to 20 chars shorter/longer

my $LeftMargin        = ' ';   # preferred point of context cutting on the left
my $RightMargin       = ' ';   #    ditto  on the right

my $CONFIG_MAGIC      = "# OnSearch config file"; # mandatory first line in cfg

# Error messages
my $ER_OPENFILE       = "Can't open file '%s'";
my $ER_TOOBIG         = "File is too big at line %d";
my $ER_TOOLONG        = "Line %d is too long";
my $ER_FILENAME       = "Invalid file name '%s'";
my $ER_DIRNAME        = "Invalid dir name '%s'";
my $ER_USERNAME       = "Invalid user name '%s'";
my $ER_NOMAGIC        = "Invalid config file '%s'";
my $ER_NOWORD         = "Nothing to search for";

my $STRIP_NO          = 0;
my $STRIP_AUTO        = 1;
my $STRIP_YES         = 2;

my ($query, $user, $config_file, $word, $in_charset);
my (@FileList, %Results, $config_dirfile, $UserHomeDir, $priv_mode);
my ($stat_time, $stat_files_ok, $stat_files_match, $stat_matches,
    $stat_bytes, $stat_speed);

my ($use_case, $use_whole, $use_regexp, $use_ascii, $use_strip, $use_first,
    $use_all, $use_alpha, $use_notintags);

my $XlatWord;                          # xlat of the search word
my $XlatHtml;                          # xlat of searched docs
my $XlatIntr;                          # xlat of the internal encoding
my $XlatFind;                          # xlat to denature compared (html/word)

# templates
my %ConfigParams = (
		    'header' => <<EOT,
<HTML><HEAD>
<TITLE>!!! title !!!</TITLE>
</HEAD><BODY>
<H1>!!! title !!!</H1>
EOT

		    'title' => <<EOT,
Search results in !!! user !!!
EOT

		    'footer' => <<EOT,
<HR><CENTER><FONT SIZE=-2>OnSearch script: Copyright 1998 by Jaroslav Benkovsk</FONT></CENTER></BODY></HTML>
EOT

		    'item' => <<EOT,
<HR><P>
<A HREF="http://$ENV{SERVER_NAME}/www/!!! user !!!/!!! dirfile !!!">!!! heading !!!</A> (!!! matches !!!)<BR>
&nbsp;&nbsp;&nbsp;&nbsp;<I>!!! context !!!</I>
</P>
EOT

		    'empty_item' => <<EOT,
<HR><P>
Vraz '!!! word !!!' nebyl nalezen. Mete zkusit zadat slovo bez koncovky
nebo zvolit jin vraz.
</P>
EOT

		    'stat' => <<EOT,
time: !!! stat_time !!!<BR>
bytes: !!! stat_bytes !!!<BR>
speed: !!! stat_speed !!!<BR>
EOT

		    'file_on' => '',
		    'file_off' => '',
		    'em_on' => '<FONT COLOR="#FF0000"><B>',
		    'em_off' => '</B></FONT>',
		    'context_size' => $ContextSize,
		    'context_margin' => $ContextMargin,
		    'left_margin' => $LeftMargin,
		    'right_margin' => $RightMargin
		    );

#############################################################
sub Warning {
    my ($msg, @args) = @_;

    print "Warning: " . sprintf ($msg, @args) . "<BR>\n";
}

#############################################################
sub Error {
    my ($msg, @args) = @_;

    print "Error: " . sprintf ($msg, @args) . "<BR>\n";
    exit (1);
}

#############################################################
sub CheckFileName {
    my ($file) = @_;

    return 0 if ($file !~ /^[a-zA-Z0-9][a-zA-Z0-9\.\-\_]*$/);
    return 1;
}


#############################################################
sub CheckDirName {
    my ($dir) = @_;

    return 0 if ($dir !~ /^([a-zA-Z0-9][a-zA-Z0-9\_\-\.]*(\/?))*$/o
		 || $dir =~ /(^|\/)\./o 
		 || $dir =~ m|^/|o 
		 || $dir =~ m|//|o);
    return 1;
}

#############################################################
sub CheckDirFileName  {
    my ($dirfile) = @_;
    my ($dir, $file);

    if ($dirfile =~ /^(.+\/)?([^\/]+)$/o) {
	$dir = $1;
	$file = $2;

	return 0 if (!&CheckFileName ($file) || !&CheckDirName ($dir));
    }
    else { return 0; }

    return 1;
}

#############################################################
sub CheckUserName {
    my ($user) = @_;

    if ($user !~ /^[a-zA-Z0-9][a-zA-Z0-9\.\_]*$/) {
	&Error ($ER_USERNAME, $user);
    }
}

#############################################################
sub ReadConfigFile {
    my ($dirfile) = @_;
    my ($line, $linecnt);
    local (*FILE, *TMP);

    open (FILE, $dirfile) || &Error ($ER_OPENFILE, $dirfile);

    $linecnt=1;
    while (defined ($line = <FILE>)) {
	if ($linecnt > 200) { &Error ($ER_TOOBIG, $linecnt); }
	if (length ($line) > 200) { &Error ($ER_TOOLONG, $linecnt); }

	chomp ($line);
	$line =~ s/^\s+//go;
	$line =~ s/\s+$//go;
	if ($linecnt == 1 && $line ne $CONFIG_MAGIC) { 
	    &Error ($ER_NOMAGIC, $dirfile); 
	}
	if ($line =~ /^\#/ || $line eq "") {
	    $linecnt++;
	    next;
	}

	$line = &nls::XlatString ($line, $XlatHtml, $XlatIntr);

# operatory  :, =, <<, <, ^=, $=, *=
#  : prirazeni 
#  = 
#  << here document
#  <  ze souboru
#  ^=, $=  pripojit doleva n. doprava
#  *= interpolovat

	if ($line =~ /^([A-Za-z0-9_\-]+)\s*(:|<<|<|[\^\$\*]?\=)\s*(.*)$/o) {
	    my $key = $1;
	    my $op = $2;
	    my $value = $3;

	    if ($op eq '<') {
		if (!&CheckDirFileName ($value)) {
		    &Error ($ER_FILENAME, $value);
		}
		open (TMP, "<$UserHomeDir/$value") || &Error ($ER_OPENFILE, $value);
		$value = &nls::XlatString (join ('', <TMP>), $XlatHtml, $XlatIntr);
		close (TMP);
	    }
	    if ($op eq '^=') { $value = $value . $ConfigParams{$key}; }
	    if ($op eq '$=') { $value = $ConfigParams{$key} . $value; }

#	    elsif ($op eq '<<') {}
#	    elsif ($op eq '=') {}
#	    elsif ($op eq '*=') {}
#	    elsif ($op eq ':') {}

	    $ConfigParams{$key} = $value;

	    # we have to process 'charset' immediately ...
	    if ($key eq 'charset') {
		$XlatHtml = &nls::GetXlat ($value);
		if ($XlatHtml eq $XlatIntr) { $XlatHtml = undef; }
	    }
	}
	else {
	    push (@FileList, $line);
	}
	$linecnt++;
    }

    close (FILE);
}

#############################################################
sub SearchFile {
    my ($dir, $file) = @_;
    my ($matches) = 0;
    my ($dirfile, $line, $save_line, $do_strip, $is_html);
    my ($context, $heading, $netto_size, $bytes_match, $match_ratio);
    local (*FILE);

    $dirfile = $dir . '/' .$file;

    if (!-f $dirfile || !open (FILE, $dirfile)) {
	&Warning ($ER_OPENFILE, $file);
	return 0;
    }

    $is_html = $file =~ /\.(.?)HTM(L?)$/io;

    if ($use_strip == $STRIP_AUTO) { $do_strip = $is_html; }
    else { $do_strip = $use_strip; }

    $line = join ('', <FILE>);
    close (FILE);
    $stat_bytes += length ($line);

    if ($is_html) {
	$line =~ /<TITLE>(.+?)<\/TITLE>/ois;
	$heading = $1;
	if (!defined ($heading) || $heading eq '') {
	    $line =~ /<H1[^>]*>(.+?)<\/H1/ois;
	    $heading = $1;
	}
    }
    if (!defined ($heading) || ($heading eq '')) {
	$heading = $file;
    }

    $line = &StripTags ($line) if ($do_strip);
    $line =~ tr/\ \t\r\n/\ /s;

    $save_line = &nls::XlatString ($line, $XlatHtml, $XlatIntr);
    $line = &nls::XlatString ($save_line, $XlatIntr, $XlatFind);

    $heading =~ tr/\ \t\r\n/\ /s;
    $heading = &nls::XlatString ($heading, $XlatHtml, $XlatIntr);

    $netto_size = length ($line);
    $bytes_match = 0;

    while ($line =~ /$word/go) {
	my $a = substr ($save_line, 0, length ($`));
	my $b = substr ($save_line, -length ($'));
	my $c = substr ($save_line, length ($a), length ($&));

#    my $px = 0;
#    while (($px = index ($line, $word, $px)) >= 0) {
#	my $a = substr ($save_line, 0, $px);
#	my $b = substr ($save_line, $px + length ($word));
#	my $c = $word;
#	$px += length ($word);

	$bytes_match += length ($c);

	if (!$matches || $use_all) {
	    my ($pos, $p1, $p2);

	    $pos = length ($a) - $ContextSize;
	    if ($pos < 0) { $pos = 0; };

	    $p1 = rindex ($a, $LeftMargin, $pos);
	    $p2 = index ($a, $LeftMargin, $pos);

	    if ($p2 == -1) { $p2 = length ($a); }

	    if ((($pos - $p1 <= $p2 - $pos) || ($p2 == length ($a)))
		&& ($p1 > $pos - $ContextMargin)) { 
		$pos = $p1;
		$pos += length ($LeftMargin) if ($p1 >= 0);
	    }
	    elsif ($p2 < $pos + $ContextMargin) { 
		$pos = $p2;
		$pos += length ($LeftMargin) if ($p2 < length ($a));
	    }

	    $a = substr ($a, $pos);


	    $pos = $ContextSize;
	    if ($pos >= length ($b)) { $pos = length ($b) - 1; }
	    $p1 = rindex ($b, $RightMargin, $pos);
	    $p2 = index ($b, $RightMargin, $pos);
	    if ($p2 == -1) { $p2 = length ($b); }

	    if ((($p2 - $pos <= $pos - $p1) || ($p1 == -1)) 
		&& ($p2 < $pos + $ContextMargin)) { 
		$pos = $p2;
	    }
	    elsif ($p1 > $pos - $ContextMargin) { 
		$pos = $p1;
	    }

	    if ($pos == -1) { $pos = 0; }
	    $b = substr ($b, 0, $pos);

	    $context = $a . '!!! em_on !!!' . $c . '!!! em_off !!!' . $b;
	    if (!$is_html) {
		$context = &EscapeHTML ($context);
	    }

	    $Results{"$file:" . ($matches+1)} = {
			      'matches' => 0,
			      'context' => $context,
			      'heading' => $heading,
			      'match_ratio' => 0
			      };



	}
	$matches++;
	last if ($use_first);
    }

    $match_ratio = ($netto_size>0) ? ($bytes_match / $netto_size) : '0';

    if ($matches) {
	$Results{"$file:1"}{'matches'} = $matches;
	$Results{"$file:1"}{'match_ratio'} = $match_ratio;
    }

    return $matches;
}

#############################################################
sub ReplaceText {
    my ($text) = @_;
    my $rcnt = 0;
    while ($rcnt<10 && $text =~ s/!!! ([^!]+) !!!/$ConfigParams{$1}/g) { 
	$rcnt++; 
    }

    return $text;
}

#############################################################
sub StripTags {
    my ($line) = @_;

    $line =~ s/<\!\-\-.*?\-\->//gos;
    $line =~ s/<[^>]*>//gos;

    return $line;
}

#############################################################
sub EscapeHTML {
    my ($line) = @_;

#    $line =~ s/\&/\&amp;/go;
    $line =~ s/\</\&lt;/go;
    $line =~ s/\>/\&gt;/go;

    return $line;
}

#############################################################
sub XlatPrint {
    my ($text) = @_;

    print $text;
}

#############################################################
sub PageIntro {
    print <<EOT;
<HTML><HEAD><TITLE>Search</TITLE></HEAD>
<BODY><H1>Search</H1>
<FORM ACTION="$ENV{SCRIPT_NAME}" METHOD=$METHOD>
IDU:<BR><INPUT TYPE=TEXT NAME="user"><BR>
Search for:<BR><INPUT TYPE=TEXT NAME="word"><BR>
Config file:<BR><INPUT TYPE=TEXT NAME="file" VALUE="$DefaultConfigFile"><BR>
<INPUT TYPE=CHECKBOX NAME="case">Case sensitive<BR>
<INPUT TYPE=CHECKBOX NAME="regexp">Regular expressions<BR>
<INPUT TYPE=CHECKBOX NAME="whole">Whole words only<BR>
<INPUT TYPE=CHECKBOX NAME="ascii">Compare in ASCII<BR>
<INPUT TYPE=CHECKBOX NAME="strip" CHECKED>Strip tags<BR>
<INPUT TYPE=CHECKBOX NAME="first">Find first only<BR>
<INPUT TYPE=CHECKBOX NAME="all">Report all appearances<BR>

<INPUT TYPE=SUBMIT NAME="Search">
<INPUT TYPE=RESET NAME="Defaults">
</FORM>
<HR><CENTER><FONT SIZE=-2>OnSearch script: Copyright 1998 by Jaroslav Benkovsk</FONT></CENTER></BODY></HTML>
EOT
    exit (0);
}

#############################################################
sub GetBoolParam {
    my ($name, $default) = @_;
    my ($result);

    $result = $query->param ($name);

    if (!defined ($result)) { $result = $default; }
    elsif ($result =~ /^ON|T|Y|A$/oi) { $result = 1; }
    elsif ($result =~ /^OFF|F|N$/oi) { $result = 0; }
    # else leave it untouched

    return $result;
}

#############################################################
sub sort_fn {
    my ($a, $b) = @_;
    my ($af, $ai) = split (/:/, $a, 2);
    my ($bf, $bi) = split (/:/, $b, 2);
    my ($rc);


    if ($use_alpha) {
	$rc = ($af cmp $bf);
	if (!$rc) {
	    $rc = ($ai <=> $bi);
	}
    } else {
	if ($af eq $bf) {
	    $rc = ($ai <=> $bi);
	} else {
	    $rc = ($Results{"$bf:1"}{'matches'} <=> $Results{"$af:1"}{'matches'});
	    if (!$rc) {
		$rc = ($af cmp $bf);
	    }
	}
    }

    return $rc;
}
#############################################################
#print "Content-type: text/html\n";
#print "Content-encoding: iso-8859-2\n\n";

print "Content-type: text/html; charset=iso-8859-2\n\n";


$query = new CGI;

&PageIntro ()  if (!$query->param () );

$user = $query->param ('user');
$config_file = $query->param ('file');
$word = $query->param ('word');

$in_charset = $query->param ('charset');
$in_charset = $ENV{'INPUT_CHARSET'}  if (! defined ($in_charset));

$use_case = &GetBoolParam ('case', 0);
$use_strip = &GetBoolParam ('strip', $STRIP_AUTO);
$use_regexp = &GetBoolParam ('regexp', 0);
$use_whole = &GetBoolParam ('whole', 0);
$use_ascii = &GetBoolParam ('ascii', 0);
$use_first = &GetBoolParam ('first', 0);
$use_all = &GetBoolParam ('all', 0);
$use_alpha = &GetBoolParam ('alpha', 0);
$use_notintags = &GetBoolParam ('notintags', 0);

$priv_mode = 0;
if ($EnablePrivUser && (!defined ($user) || $user eq '')) {
    $user = 'www';
    $priv_mode = 1;
}

if (!defined ($word) || $word eq '') { &Error ($ER_NOWORD); }
if (!defined ($config_file) || $config_file eq '') { $config_file = $DefaultConfigFile; }

&CheckUserName ($user);
&CheckFileName ($config_file) || &Error ($ER_FILENAME, $config_file);

if ($priv_mode) {
    $UserHomeDir = "$PrivRootDir";
    $config_dirfile = "$UserHomeDir/$PrivCfgSubdir/$config_file";
}
else {
    $UserHomeDir = "$UserRootDir/$user";
    $config_dirfile = "$UserHomeDir/$UserCfgSubdir/$config_file";
}


$XlatIntr = &nls::GetXlat ('ISO-8859-2');
$XlatHtml = undef;                 # set in &ReadConfigFile()
$XlatWord = &nls::GetXlat ($in_charset);

if ($use_ascii) { $XlatFind = &nls::GetXlat ('ASCII'); }
else            { $XlatFind = &nls::GetXlat ('ISO-8859-2'); }

if (!$use_case) { 
    $XlatFind = &nls::XlatString ($XlatFind, $XlatFind, &nls::GetXlat ('ISO-8859-2-UP')); 
}

if ($XlatWord eq $XlatIntr) { $XlatWord = undef; }
if ($XlatIntr eq $XlatFind) { $XlatFind = undef; }

&ReadConfigFile ($config_dirfile);

$ContextSize = $ConfigParams{'context_size'};
$ContextMargin = $ConfigParams{'context_margin'};
$LeftMargin = $ConfigParams{'left_margin'};
$RightMargin = $ConfigParams{'right_margin'};

$word = &nls::XlatString ($word, $XlatWord, $XlatIntr);
$ConfigParams{'word'} = &EscapeHTML ($word);
$ConfigParams{'user'} = $user;

$word = &nls::XlatString ($word, $XlatIntr, $XlatFind);

if (!$use_regexp) {
    $word = quotemeta ($word);
#   $word =~ s/([^A-Za-z0-9])/\\$1/g;
}

if ($use_whole) {
    $word = '\b' . $word . '\b';
#    $word = '(^|[^\w])' . $word . '($|[^\w])';
}

if ($use_notintags) {
    $word = $word . '(?=[^>]*(<|$))';
}

$stat_time = time ();
$stat_files_ok = 0;
$stat_files_match = 0;
$stat_matches = 0;
$stat_bytes = 0;

foreach my $dirfile (@FileList) {
    if (!&CheckDirFileName ($dirfile)) {
	&Warning ($ER_FILENAME, $dirfile);
	next;
    }

    $stat_files_ok++;
    my $matches  = &SearchFile ($UserHomeDir, $dirfile);
    if ($matches) {
	$stat_files_match++;
        $stat_matches += $matches;
    }
}
$stat_time = time () - $stat_time;
$ConfigParams{'stat_time'} = $stat_time;
$ConfigParams{'stat_files'} = $#FileList - 1;
$ConfigParams{'stat_files_ok'} = $stat_files_ok;
$ConfigParams{'stat_files_match'} = $stat_files_match;
$ConfigParams{'stat_matches'} = $stat_matches;
$ConfigParams{'stat_bytes'} = $stat_bytes;
$ConfigParams{'stat_speed'} = ($stat_time>0) ? int ($stat_bytes/$stat_time) : 'INF';

&XlatPrint (&ReplaceText ($ConfigParams{'header'}));

foreach my $key (sort { &sort_fn ($a, $b); } keys %Results) {
    my ($dirfile, $i) = split (/:/, $key, 2);
    my ($value) = $Results{$key};

    if ($i == 1) {
	$ConfigParams{'dirfile'} = $dirfile;
	$ConfigParams{'matches'} = $$value{'matches'};
	$ConfigParams{'heading'} = $$value{'heading'};
	$ConfigParams{'stat_match_ratio'} = $$value{'match_ratio'};

	&XlatPrint (&ReplaceText ($ConfigParams{'file_on'}) . "\n");
    }

    $ConfigParams{'context'} = $$value{'context'};

    &XlatPrint (&ReplaceText ($ConfigParams{'item'}) . "\n");

    if ($ConfigParams{'matches'} == $i) {
	&XlatPrint (&ReplaceText ($ConfigParams{'file_off'}) . "\n");
    }
}


if (!$stat_matches) {
    &XlatPrint (&ReplaceText ($ConfigParams{'empty_item'}) . "\n");
}

&XlatPrint (&ReplaceText ($ConfigParams{'footer'}));

#############################################################
# End of file onsearch
