#!/usr/bin/perl require 5; use Socket; # Fluid Dynamics Search Engine, Version 2.x # Copyright 1997-2000 by Fluid Dynamics. Please adhere to the copyright # notice and conditions of use, described in the attached help file and # hosted at the URL below. For the latest version and help files, visit: # http://www.xav.com/scripts/search/ # ___________________________________________________________________________ my ($CryptPassword, $AllowSetPassword, $AllowAnonAdd); # Lines 17 and 18: $CryptPassword = '$ZR.QKQhGFFB6'; $AllowSetPassword = 0; # 1 -> YES; 0 -> NO $AllowAnonAdd = 0; # This search engine is managed from the web, and it comes with a password to # keep it secure. # # You will be given a password when you first visit this script using the # special "Mode=Admin" query string - for example: # # http://my.host.com/search.pl?Mode=Admin # # On your first visit, the script gives instructions on how to set the # password variable below. # ___________________________________________________________________________ # # Security Settings: my $AllowClearTextAuth = 0; my $AllowDebug = 1; my $VERSION = '2.0.0.0001'; my $Timeout = 50; # seconds. my ($pq, $O); my $SetSaveLinks = 1; my $bUseClearTextAuth = 0; my (@HITS, @SearchTerms, %RAW, %SessionCookie, @GlobalSavedLinks, %GlobalSpiderResults) = (); my %FORM = ReadInput(); my $LimitSite = ''; my $WildCard = 'thewildcardisaveryspecialcharacter'; my $WildSearch = '([^\s+]{0,4})'; # ___________________________________________________________________________ # # Robot and Search Settings: my $SCRIPT_NAME = ($ENV{'SCRIPT_NAME'} || 'search.pl'); my $SearchTipsPage = $SCRIPT_NAME; my $REQUEST_METHOD = 'POST'; my %Rules = (); $Rules{'Max Index File Size'} = 10000000; # 10mb default $Rules{'Hits Per Page'} = 10; $Rules{'Multiplier: Title'} = 10; $Rules{'Multiplier: Keyword'} = 10; $Rules{'Multiplier: Description'} = 4; $Rules{'Minimum Page Size'} = 128; # bytes $Rules{'Max Characters: URL'} = 128; $Rules{'Max Characters: Title'} = 96; $Rules{'Max Characters: Description'} = 384; $Rules{'Max Characters: Auto Description'} = 150; $Rules{'Max Characters: Keywords'} = 256; $Rules{'Max Characters: File'} = 64000; $Rules{'Forbid All Cap Titles'} = 1; # 1 -> YES; 0 -> NO $Rules{'Forbid All Cap Descriptions'} = 1; $Rules{'Crawler: Minimum WhiteSpace'} = 0.01; $Rules{'Crawler: Max Pages Per Batch'} = 12; $Rules{'Crawler: Max Redirects'} = 6; $Rules{'Crawler: Days Til Refresh'} = 30; $Rules{'Crawler: User Agent'} = 'Mozilla/4.0 (compatible: FDSE robot)'; $Rules{'Crawler: Follow Query Strings'} = 0; # 1 -> YES; 0 -> NO # This is a pipe-delimited list of lowercase file extensions. Links to files # with these extensions will not be treated as document-type links by the # crawler; using this list will make crawl sessions faster, and will keep the # pending pages file small: $Rules{'Crawler: Ignore Links To'} = 'gif|jpg|js|css|mp3|wav|zip|exe|doc|xls|pdf'; # When the crawler hits a page, it gathers a list of all the links on the page # for future searching. This list can be overwhelming. To reduce the size of # the list, set Follow Offsite Links to 0, and then crawler will only remember # links to the same host: $Rules{'Crawler: Follow Offsite Links'} = 1; # 0 -> No, 1 -> Yes $Rules{'Crawler: Rogue'} = 0; # 1 -> Disregard robots exclusion rules $Rules{'Index ALT Text'} = 1; $Rules{'Index Links'} = 0; # The following words will be ignored when entered as part of a query: my @IgnoredWords = ( 'your', 'you', 'www', 'with', 'will', 'why', 'who', 'which', 'where', 'when', 'what', 'web', 'we', 'was', 'want', 'w', 'used', 'use', 'two', 'to', 'this', 'they', 'these', 'there', 'then', 'then', 'them', 'their', 'the', 'that', 'than', 't', 'so', 'site', 'should', 'see', 's', 're', 'quot', 'page', 'our', 'other', 'org', 'or', 'only', 'one', 'on', 'of', 'now', 'not', 'no', 'new', 'net', 'nbsp', 'name', 'n', 'my', 'ms', 'mrs', 'mr', 'most', 'more', 'me', 'may', 'lt', 'like', 'just', 'its', 'it', 'is', 'in', 'if', 'i', 'http', 'how', 'he', 'have', 'has', 'gt', 'get', 'from', 'for', 'find', 'ed', 'do', 'd', 'com', 'can', 'by', 'but', 'been', 'be', 'b', 'at', 'as', 'are', 'any', 'and', 'an', 'amp', 'also', 'all', 'after', 'about', 'a', '5', '2', '1', '0', ); # ___________________________________________________________________________ # # File Control Section: # The writable folder where all data files are stored. Use the path relative # to this script: my $DataFilesDir = 'searchdata'; # List the URLs or paths that you do not want searched. Use all forward # slashes: my @ForbidSites = ( 'http://www.umsl.edu/studentlife/current/forums', '/webstuff/htdocs/studentlife/current/forums', 'http://www.umsl.edu/studentlife/current/forums/student/messages', '/webstuff/htdocs/studentlife/current/forums/student/messages', 'http://www.umsl.edu/studentlife/current/forums/issues/messages', '/webstuff/htdocs/studentlife/current/forums/issues/messages', 'http://www.umsl.edu/studentlife/current/wwwboard/messages', '/webstuff/htdocs/studentlife/current/wwwboard/messages', 'http://www.umsl.edu/studentlife/current/css_web_design', '/webstuff/htdocs/studentlife/current/css_web_design', ); # List URLs which should receive higher ranking (cannot use paths here): my @PromoteSites = ( 'http://www.xav.com/', 'http://www.microsoft.com/', ); # Enter the rank multiplier for PromoteSites (values 2 through 99): $Rules{'Promote Value'} = 20; # Local files only - enter the file extensions to be searched. Separate list # of extensions by space: my $EXT = ' htm html '; # Local files only - specify whether to allow non-text files. This requires # both the variable below and the extension listed in $EXT above: my $AllowBinaryFiles = 1; # 1 -> YES; 0 -> NO # See http://www.xav.com/scripts/search/admin_help.html#symlink my $AllowSymbolicLinks = 1; # 1 -> Yes; 0 -> NO my $TrustSymbolicLinks = 0; # 1 -> Yes; 0 -> NO # End File Control Section. # # See also the $Exclude{'Realm'} options below to set ForbidSites file # exclusions on a per-Realm basis. # ___________________________________________________________________________ # # Realms Section: # All the Realm information is stored in a |-delimited file named 'realms': my (%IndexFile, %BaseDir, %BaseURL, %Exclude) = (); my (%HashIP) = (); my $RealmFile = 'search.realms.txt'; # ___________________________________________________________________________ # # This HTML text appears at the top of every page. It's a good place to # declare custom styles like background color and font type: my $Header = <<"EOM"; The Current Online - Search Engine
Departments
Back to the Front Page
Web Exclusives
Student Forum
The SGA President Under Fire
Search
Other Stuff
Cartoons
Legal
Advertising
Feedback
EOM # ___________________________________________________________________________ # # The document footer appears at the bottom of every page shown to end users. # Please don't remove the copyright notice from your output unless you've # paid the shareware fee: sub PrintFooter { local $_; print <<"EOM";

Search Tips - EOM if ($AllowAnonAdd) { print 'Add URL -'; } print <<"EOM"; Main Page


Fluid Dynamics Search Engine, © 1997-99 by Fluid Dynamics.
Page design and format, © 1999-2000 The Current
EOM } # ___________________________________________________________________________ # # The SearchForm uses a nested table structure - the outer table has a light # grey background (color #aaaaaa) with a 1-pixel padding. This outer table # contains only the main table, with a light yellow background (color # #ffffcc). The effect is a grey 1-pixel border around the yellow search # table. Customize as needed... sub SearchForm { local $_; my $DisplayTerms = $FORM{'Terms'}; $DisplayTerms =~ tr[A-Z][a-z]; $DisplayTerms =~ s!\"!\"!g; my $SelectMatch = ''; my $SelectRealm = ''; print <<"EOM";
The Current Online Search Engine
Match $SelectMatch in Search Index: $SelectRealm
EOM } # ___________________________________________________________________________ # # The HTML to give searchers who can't seem to find anything. Usually # this should include yet another link back to the tips page, and, if # you're a business that is not overly annoyed by people sending you # email, a mailto: link for them to request information from a human # would be in order here: my $No_Documents_Found = <<"EOM";
Results: No documents were found.
EOM # ___________________________________________________________________________ # my $SearchTipsText = <<"EOM";

Search Rules

This search engine helps you find documents on this website and related sites. Here's how it works: you tell the search service what you're looking for by typing in keywords, phrases, or questions in the search box. The search service responds by giving you a list of all the Web pages in our index relating to those topics. The most relevant content will appear at the top of your results.

How To Use:

  1. Type your keywords in the search box.
  2. Press the Search button to start your search.

Here's an example:

  1. Type recipe oatmeal raisin cookies in the search box.
  2. Press the Search button or press the Enter key.
  3. The Results page will show you numerous pages on the Web about recipes for oatmeal raisin cookies.

Tip: Don't worry if you find a large number of results. In fact, use more than a couple of words when searching. Even though the number of results will be large, the most relevant content will always appear at the top of the result pages.

More Basics - An Overview

Here's a quick overview of the rest of our Basic Help. Just click on the links to jump to these sections.

What is an 'Index'?
What is a word?
What is a phrase?
Simple Tips for More Exact Searches
Fancy Features for Typical Searches

What is an Index?

Webster's dictionary describes an "index" as a sequential arrangement of material. Our index is a large, growing, organized collection of Web pages and discussion group pages from around the world. The 'index' becomes larger every day as people send us the addresses for new Web pages. We also have technology that crawls the Web looking for links to new pages. When you use our search service, you search the entire collection using keywords or phrases.

What is a word?

When searching, think of a word as a combination of letters and numbers. The search service needs to know how to separate words and numbers to find exactly what you want on the Internet. You can separate words using white space and tabs.

What is a phrase?

You can link words and numbers together into phrases if you want specific words or numbers to appear together in your result pages. If you want to find an exact phrase, use "double quotation marks" around the phrase when you enter words in the search box.

Example #1: To find lyrics by the King, type "you ain't nothing but a hound dog" in the search box. You can also create phrases using punctuation or special characters such as dashes, underscore lines, commas, slashes, or dots.

Example #2: Try searching for 1-800-999-9999 instead of 1 800 999 9999. The dashes link the numbers together as a phrase.

Simple Tips for More Exact Searches

All searches are case insensitive and accent insensitive. Searching for "Fur" will match the lowercase "fur", uppercase "FUR", and German "für".

Including or excluding words:

To make sure that a specific word is always included in your search topic, place the plus (+) symbol before the key word in the search box. To make sure that a specific word is always excluded from your search topic, place a minus (-) sign before the keyword in the search box.

Example: To find recipes for cookies with oatmeal but without raisins, try recipe cookie +oatmeal -raisin.

Expand your search using wildcards (*):

By typing an * at the end of a keyword, you can search for the word with multiple endings.

Example: Try wish*, to find wish, wishes, wishful, wishbone, and wishy-washy.

Fancy Features for Typical Searches

You can search more than just text. Here are all of the other ways you can search on the net:

link:address Finds pages that link to the specified address, or a substring of it. Use link:microsoft.com to find all pages linking to Microsoft sites. Note: this feature is not implemented on all search engines.
text:text Finds pages that contain the specified text in any part of the page other than an image tag, link, or URL. The search text:cow9 would find all pages with the term cow9 in them.
title:text Finds pages that contain the specified word or phrase in the page title (which appears in the title bar of most browsers). The search title:Elvis would find pages with Elvis in the title.
url:text Finds pages with a specific word or phrase in the URL. Use url:altavista to find all pages on all servers that have the word altavista in the host name, path, or filename - the complete URL, in other words.
EOM # Files - below are the four files used. They will exist in $DataFilesDir. # They will be created if they don't already exist. You don't need to change # anything: my $LogFile = 'search.log.txt'; # Log of search terms, users my $TempFile = 'search.temp.txt'; # writable temp file for paging large indexes my $BackFile = 'search.back.txt'; # second temp file for paging my $PendingPagesFile = 'search.pending.txt'; # Holds URLs to be searched later my $FileMask = 0766; my @MonthNames = ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'); # Done setting variables. # ___________________________________________________________________________ # my %NewWords = (); foreach (sort @IgnoredWords) { my $word = RawTranslate($_); $word =~ tr[A-Z][a-z]; $NewWords{$word}++; } my $StripIgnoreWords = "s' (".join('|', sort keys %NewWords).") ' 'og;"; # Protect for leading and trailing space: $EXT = ' '.$EXT.' '; my ($HiddenPass,$StringPass) = ('','',''); # This will increment to tell how many pages were searched total: my $NumPagesSearched = 0; my $Is_IIS = (($ENV{'SERVER_SOFTWARE'}) && ($ENV{'SERVER_SOFTWARE'} =~ m!IIS!i))?1:0; my $bExitScript = 0; $|=1; # buffer on unless (Authenticate()) { $bExitScript = 1; } unless ($bExitScript) { print "Content-Type: text/html\r\n\r\n"; $|=0; # buffer off # This manually sets the current working directory to the directory that # contains this script, if the web server is IIS. IIS does not do this # for you, unlike every other web server out there. This is necessary in # case people have used a relative path to the $DataFilesDir. chdir($1) if (($Is_IIS) && ($0 =~ m!^(.*)\\!)); # Next we chdir() into the $DataFilesDir. All lookups on data files will # use their relative names so we have to be able to access that directory. # If chdir fails then this script will fail right away. unless (chdir($DataFilesDir)) { $bExitScript = 1; print <<"EOM"; $Header

Data Folder Required

EOM unless ((-e $DataFilesDir) and (-d $DataFilesDir)) { # if don't even exist! print <<"EOM";

This script requires a writable folder named "$DataFilesDir".

Please create a folder with this name (and give it write permissions).

Need help? Visit http://www.xav.com/scripts/search/admin_help.html.

EOM } else { print <<"EOM";

This script requires a writable folder named "$DataFilesDir".

A folder exists with that name, but it isn't readable and writable.

Give this folder RWX permissions for Everyone. Your ISP can usually assist with this.

Need help? Visit http://www.xav.com/scripts/search/admin_help.html.

EOM } print "
"; } } &LoadRealms; if ($bExitScript) { # do nothing; } elsif ($FORM{'NextLink'}) { # security re-director from admin screen (prevents query-string-based # password from showing up in referer logs of remote systems: print <<"EOM"; $FORM{'NextLink'} EOM } elsif (($FORM{'AddSite'}) && ($AllowAnonAdd)) { print $Header; &AnonAdd($FORM{'AddSite'}); &SearchForm; &PrintFooter; } elsif (($FORM{'Mode'} ne 'Admin') and (!$FORM{'Terms'})) { print $Header; &SearchForm; print $SearchTipsText; &PrintFooter; } elsif ($FORM{'Mode'} ne 'Admin') { # Anonymous search engine code: # Idea: add all non-forbidden terms as a string and add that as a phrase # with highest priority. Multiply hit relevance by length of string to # give longer search terms and phrases more weight. # This controls the display, so these extra terms aren't shown to the user: my $IgnoreQuotedTerms = 0; my $terms = $FORM{'Terms'}; $terms =~ tr[A-Z][a-z]; if (!$FORM{'Match'}) { if ($terms !~ m!\W!) { # don't already contain a phrase if ($terms =~ m! !) { # do contain a space (multiple terms) if ($terms !~ m!(\-)|(not )|(and )|(or )!i) { # no forbidden terms or special keywords $terms = "\"$terms\" $terms"; $IgnoreQuotedTerms = 1; } } } } my $Rank = $FORM{'Rank'} ? $FORM{'Rank'} : 1; # Search terms have been provided. Format them: # Buffer with blank spaces, then collapse multiple whitespaces onto # one white space: $terms = ' '.$terms.' '; $terms =~ s'\s+' 'g; # 'dogs AND "big fences"' becomes: # ' dogs AND "big fences" ' # Now bind double-quoted phrases together: my ($i, $ProcTerms) = (0, ''); foreach (split(/\"/,$terms)) { tr! !_! if $i; $i = $i ? 0 : 1; $ProcTerms .= $_; } # ' dogs AND "big fences" ' becomes: # ' dogs AND big_fences ' # Collapse multiple asterisks onto one placeholder, and translate # literal Boolean switches into command switches: $ProcTerms =~ s' not ' -'ig; $ProcTerms =~ s' and ' +'ig; $ProcTerms =~ s' or ' \|'ig; # ' dogs AND big_fences ' becomes: # ' dogs +big_fences ' my $bTermsExist = 0; my $Ignored_Terms = ''; my $Important_Terms = ''; my ($EvalForbid, $EvalRequired, $EvalOptional, $EvalExtraRequired, $EvalExtraOptional) = ('', '', '', '', ''); my $tm = $Rules{'Multiplier: Title'}; my $km = $Rules{'Multiplier: Keyword'}; my $dm = $Rules{'Multiplier: Description'}; Term: foreach (split(m!\s+!, $ProcTerms)) { next unless $_; my $UserFormatWord = Trim($_); $UserFormatWord = $2 if ($UserFormatWord =~ m!^(\||\-|\+)(.*)$!); push(@SearchTerms, $UserFormatWord); # Perform language translations: $_ = RawTranslate($_); # Ignore common words ("the","what","where"...) # Next line due to fix by John # Thanks! # $QMTerm = quotemeta($_); my $IWord; foreach $IWord (@IgnoredWords) { if ($_ eq $IWord) { $Ignored_Terms .= ', '.$_; next Term; } } # Remove wildcard-enchanced terms with less than 3 characters: if ((m!\*!) && ((length($_) - (s/\*/\*/g)) < 3)) { $Ignored_Terms .= ', '.$_; next; } # Remove the underscores that are binding the phrases together: s!_! !g; # Handle each term differently based on whether it is required, optional, # or forbidden. Unmarked search terms go to required or optional based # on the $Range input variable: my $Term = ''; if (m!^-!) { if (m! !) { $Ignored_Terms .= ', "'.$_.'"'; } else { $Ignored_Terms .= ', '.$_; } # See comments in sub Format_Term for exactly how these are being # handled... $Term = Format_Term($_); if ($Term !~ m!^\s*$!) { $EvalForbid .= "\tlast SearchBlock if m!$Term!o;\n"; } } elsif ((m!^\|!) || ((!$FORM{'Match'}) && (!(m!^\+!)))) { if (m! !) { $Important_Terms .= ', "'.$_.'"' unless $IgnoreQuotedTerms; } else { $Important_Terms .= ', '.$_; } $Term = Format_Term($_); if ($Term !~ m!^\s*$!) { $EvalOptional .= "\t\$WordMatches += scalar (\@WordCount = m!$Term!og);\n"; $EvalExtraOptional .= "\t\$WordMatches += $tm * (\@WordCount = (\$t =~ m!$Term!og));\n" if $tm; $EvalExtraOptional .= "\t\$WordMatches += $dm * (\@WordCount = (\$d =~ m!$Term!og));\n" if $dm; $EvalExtraOptional .= "\t\$WordMatches += $km * (\@WordCount = (\$k =~ m!$Term!og));\n" if $km; $bTermsExist = 1; } } else { if (m! !) { $Important_Terms .= ', "'.$_.'"' unless $IgnoreQuotedTerms; } else { $Important_Terms .= ', '.$_; } $Term = Format_Term($_); if ($Term !~ m!^\s*$!) { $EvalRequired .= "\t\$WordMatches += (scalar (\@WordCount = m!$Term!og)) || (last SearchBlock);\n"; $EvalExtraRequired .= "\t\$WordMatches += $tm * (\@WordCount = (\$t =~ m!$Term!og));\n" if $tm; $EvalExtraRequired .= "\t\$WordMatches += $dm * (\@WordCount = (\$d =~ m!$Term!og));\n" if $dm; $EvalExtraRequired .= "\t\$WordMatches += $km * (\@WordCount = (\$k =~ m!$Term!og));\n" if $km; $bTermsExist = 1; } } } # next term # The strings below will be used in the summary output: if ($Ignored_Terms =~ /^, (.*)$/) { $Ignored_Terms = $1; $Ignored_Terms =~ s/ ("?)-/ $1/eg; } if ($Important_Terms =~ /^, (.*)$/) { $Important_Terms = $1; } my $DocSearch = <<"EOM"; SearchBlock: { \$NumPagesSearched++; \$WordMatches = 0; $EvalForbid $EvalRequired $EvalOptional last SearchBlock unless \$WordMatches; last SearchBlock unless m!^(.*?)uM=.*?uT=(.*?)uD=(.*?)uK=(.*?)h=!o; (\$hdr, \$t, \$d, \$k) = (\$1, \$2, \$3, \$4); $EvalExtraRequired $EvalExtraOptional push(\@HITS,((\$WordMatches*substr(\$_,0,2))+10E6).'.'.\$hdr); } EOM my $RealmSearch = <<"EOM"; my \@WordCount = (); my \$WordMatches = 0; my (\$t, \$d, \$k, \$hdr); Record: while () { $DocSearch } EOM my $Realm = $FORM{'Realm'} ? $FORM{'Realm'} : 'All'; if ($bTermsExist) { # Search terms have been formatted. Now search the database(s): # each sub populates @HITS as needed. # If Realm is specific, search it - otherwise search all: if ($Realm ne 'All') { if ($IndexFile{$Realm} eq 'RUNTIME') { &SearchRunTime($Realm, $DocSearch); } else { &SearchIndexFile($IndexFile{$Realm}, $RealmSearch); } } else { foreach (sort keys %IndexFile) { if ($IndexFile{$_} eq 'RUNTIME') { &SearchRunTime($_, $DocSearch); } else { &SearchIndexFile($IndexFile{$_}, $RealmSearch); } } } } # Hits have been found. Now print output: my $HitCount = scalar @HITS; my ($PerPage, $Remaining, $Next, $New_Rank, $summary); # Get the number of hits to show per page: if (($FORM{'maxhits'} =~ m!^(\d+)$!) && ($FORM{'maxhits'} > 0)) { $PerPage = $1; } elsif ($Rules{'Hits Per Page'} =~ m!^(\d+)$!) { $PerPage = $1; } else { $PerPage = 10; } $Remaining = $HitCount - $Rank - $PerPage + 1; if ($Remaining >= $PerPage) { $Next = $PerPage; $New_Rank = $Rank + $PerPage; } elsif ($Remaining > 0) { $Next = $Remaining % $PerPage; $New_Rank = $Rank + $PerPage; } else { $New_Rank = $HitCount + 1; } my $RangeLower = $Rank; my $RangeUpper = ($New_Rank - 1); if ($Ignored_Terms) { $summary = "
Ignored: $Ignored_Terms.
"; } else { $summary = "
\n"; } $summary .= <<"EOM"; Your search for $Important_Terms found the following documents (of $NumPagesSearched documents searched):
Displaying documents $RangeLower-$RangeUpper of $HitCount, with best matches first.

EOM if (($LogFile) and (open(LOG,">>$LogFile"))) { binmode(LOG); print LOG "Time:\t", scalar localtime(time), "\n"; if ($ENV{'REMOTE_HOST'}) { print LOG "Host:\t$ENV{'REMOTE_HOST'}\n"; } elsif ($ENV{'REMOTE_ADDR'}) { print LOG "Host:\t$ENV{'REMOTE_ADDR'}\n"; } else { print LOG "Host:\tundefined\n"; } print LOG "Terms:\t$FORM{'Terms'}\n"; print LOG "Found:\t$HitCount\n"; print LOG "\n"; close(LOG); chmod($FileMask, $LogFile); } print $Header; print 'Search Results'; PrintHits: { print $summary; if ($HitCount < 1) { print $No_Documents_Found; last PrintHits; } print '
'; print "\n"; my $i = $RangeLower - 1; foreach ((reverse sort @HITS)[($RangeLower-1)..($RangeUpper-1)]) { $i++; next unless (m!^(\d+)\.(\d+) u= (.+) t= (.*?) d= (.*?) $!); my ($Rank,$URL,$Title,$Description) = ($1,$3,$4,$5); my ($DD,$MM,$YYYY,$FBYTES) = (unpack('A2A2A2A4A*',$2))[1..4]; my $Month = $MonthNames[$MM]; #print "

$Rank

"; print StandardVersion($i,$URL,$Title,$Description,$FBYTES,$DD,$Month,$YYYY); } print "Documents $RangeLower-$RangeUpper of $HitCount displayed.
\n"; print '
'; last PrintHits if ($Remaining < 1); print "

 Next "; ($Next == 1) ? print 'Match' : print "$Next Matches"; print ' 

'; } # End of PrintHit: control block. &SearchForm; &PrintFooter; } elsif ($FORM{'Mode'} eq 'Admin') { # Authenticated administration code: if ($bUseClearTextAuth) { $HiddenPass = ''; $StringPass = '&Password='.$RAW{'Password'}; } print $Header; # is the $DataFilesDir writable? unless ((-R '.') and (-W '.') and (-d '.')) { print <<"EOM";

Data Folder Required

This script requires a writable folder named "$DataFilesDir".

A folder exists with that name, but it isn't readable and writable.

Give this folder RWX permissions for Everyone. Your ISP can usually assist with this.

Need help? Visit http://www.xav.com/scripts/search/admin_help.html.

EOM } elsif (!$FORM{'Action'}) { &HTML_UI(); } elsif ($FORM{'Action'} =~ m!^Add\s?URL$!) { # support both "Add URL" and "AddURL" # allow for single URL, this will need to be cleaned up. if (defined $FORM{'NewURL'}) { $FORM{'AddLink0'} = $FORM{'NewURL'}; } # build array of AddressesToIndex my @AddressesToIndex = (); foreach (keys %FORM) { next unless (m!^AddLink!); # Add trailing slashes to basic domains, ie http://c2.net -> http://c2.net/ if ($FORM{$_} =~ m!^http://([^\/]+)/!) { push(@AddressesToIndex,$FORM{$_}); } else { push(@AddressesToIndex, Trim($FORM{$_}).'/'); } } if (($FORM{'EntireSite'}) and ('1' eq $FORM{'EntireSite'})) { $FORM{'StartTime'} = time - 15; $LimitSite = $FORM{'NewURL'}; # turns http://io.com to http://io.com/ $LimitSite .= '/' if ($LimitSite =~ m!^http://([^\/]+)$!i); # turns http://www.io.com/~bob to http://www.io.com/~bob/ $LimitSite .= '/' if ($LimitSite =~ m!/([^\/\.]+)$!i); # turns http://io.com/index.html to http://io.com/ $LimitSite = $1 if ($LimitSite =~ m!^(.*?)(\w+)\.(\w+)$!); $FORM{'Action'} = 'CrawlEntireSite'; $FORM{'LimitSite'} = $LimitSite; } &AddURL(2,@AddressesToIndex); } elsif ($FORM{'Action'} eq 'Build') { my $StartFile = 0; if (($FORM{'StartFile'}) and ($FORM{'StartFile'} =~ m!^\d+$!)) { $StartFile = $FORM{'StartFile'}; } &BuildIndex($FORM{'Realm'}, $StartFile); } elsif ($FORM{'Action'} eq 'Review') { &ReviewIndex($FORM{'Realm'}); } elsif ($FORM{'Action'} eq 'ReCrawlRealm') { unless ($FORM{'StartTime'}) { $FORM{'StartTime'} = time - 5; } &ReCrawlRealm($FORM{'Realm'}); } elsif ($FORM{'Action'} eq 'CrawlEntireSite') { &CrawlEntireSite($FORM{'Realm'}); } elsif ($FORM{'Action'} eq 'MaintainRealm') { &MaintainRealm($FORM{'Realm'}); } elsif ($FORM{'Action'} eq 'ViewLog') { &ViewLog($LogFile); } elsif ($FORM{'Action'} eq 'Edit') { &PrintEditRecordForm($FORM{'Realm'},$FORM{'URL'}); } elsif ($FORM{'Action'} eq 'SaveEditedRecord') { &SaveEditedRecord(); } elsif ($FORM{'Action'} eq 'DeleteRecord') { &DeleteRecord(); } elsif ($FORM{'Action'} eq 'forceUnLock') { &forceUnLock(); } elsif ($FORM{'Action'} eq 'CreateRealmForm') { &CreateRealmForm($FORM{'Realm'}); } elsif ($FORM{'Action'} eq 'CreateRealm') { print ""; &CreateRealm($FORM{'Realm'}, $FORM{'File'}, NixPath($FORM{'BaseDir'}), $FORM{'BaseURL'}); } elsif ($FORM{'Action'} eq 'DeleteRealm') { &DeleteRealm($FORM{'Realm'}); } else { &HTML_UI(); } &Admin_HTML_Footer(); } # End "if Mode eq Admin" else { print '

No action path followed; unhandled error

'; } # script exit. sub ViewLog { local $_; my ($LogFile) = @_; print <<"EOM";

The text-based log file for this search engine follows.

EOM if (open(LOG,"<$LogFile")) { print '
';
		while () {
			print;
			}
		close(LOG);
		print '
'; } else { print "

Error: could not read from LogFile $LogFile - $!

\n"; } print '
'; } sub ReadInput { local $_; my $InputString = ''; my ($Name,$Value); # Declare form input array, and initialize certain values to keep # those damn -w errors out of the way. my %LocalForm = ( 'Mode' => '', 'Terms' => '', 'Password' => '', 'SetPassword' => '', 'CL' => 0, 'maxhits' => 0, 'LimitIndexed' => 0, 'LimitFailed' => 0, 'LimitPending' => 0, ); $RAW{'Realm'} = ''; my @Pairs = (); #init name=value pairs as empty set if (($ENV{'REQUEST_METHOD'}) && ($ENV{'REQUEST_METHOD'} eq 'POST')) { read(STDIN,$InputString,$ENV{'CONTENT_LENGTH'}); @Pairs = split(m!\&!,$InputString); } elsif ($ENV{'QUERY_STRING'}) { $InputString = $ENV{'QUERY_STRING'}; @Pairs = split(m!\&!,$InputString); } else { # Be command-line friendly: @Pairs = @ARGV; } # The inclusion of @ARGV in this loop allows us to permit command-line # triggering of all the functions, which is useful for admin tasks. All # of the name pairs are parsed independently of the values so that we can # have a distinct $RAW{} array - that way url-encoded values can be put # back in the query string easily for future hits. foreach (@Pairs) { next unless (m!^(.*?)=(.*)$!); ($Name,$Value) = ($1,$2); $Name =~ s!\%([a-fA-F0-9][a-fA-F0-9])!pack('C',hex($1))!eg; $Name =~ tr!+! !; $RAW{$Name} = $Value; $Value =~ tr!+! !; $Value =~ s!\%([a-fA-F0-9][a-fA-F0-9])!pack('C',hex($1))!eg; $LocalForm{$Name} = $Value; } return %LocalForm; } sub Admin_HTML_Footer { local $_; print <<"EOM";


[ Search Page | Administrator's Page | Log Out | Help | Register ]
  © 1997-99 Fluid Dynamics

EOM } sub GetAbsoluteAddress { local $_; my ($Link,$URL) = @_; if (($Link =~ m!^\/!) && ($URL =~ m!^http\:\/\/([^\/]+)!i)) { # absolute link from top-level directory, prepend URL through # server address: $Link = "http://$1$Link"; } elsif (($Link =~ m!^(\w+)\:!) && ($1 !~ m!^http$!i)) { # not http protocol (mailto:, ftp:, etc) return ''; } elsif (($Link !~ m!^http\:\/\/!i) && ($URL =~ m!^(.*)\/!)) { # http: protocol not specified, some kind of internal link (?), but we know # it doesn't start with "/" so it's relative. $Link = $1.'/'.$Link; } # $Link is now an absolute URL. # Remove ./ and ../ intelligently, strip :80 if present, also internal # links. # strip # signs for internal links: if ($Link =~ m!^(.*?)\#!) { $Link = $1; } # Force http:// protocol lowercase: $Link =~ s!^(http)!http!io; # strip default port, ":80" if ($Link =~ m!^http://([^\/]+)\:80$!) { $Link = 'http://'.$1.'/'; } elsif ($Link =~ m!^http://([^\/]+)\:80/(.*)$!) { $Link = 'http://'.$1.'/'.$3; } # add / if none present if ($Link =~ m!^http://([^\/]+)$!) { $Link .= '/'; } # map "/./" to "/" $Link =~ s!/\./!/!g; # map "/folder/../" => "/" while ($Link =~ m!^http://([^\/]+)/(.*?)([^\/]+)/\.\.\/(.*)$!) { $Link = "http://$1/$2$4"; } # map http://host/../foo => http://host/foo while ($Link =~ m!^http://([^\/]+)/\.\.\/(.*)$!) { $Link = "http://$1/$2"; } return $Link; } # ------------------------------------------------------------------ # Reads in the file specified by the argument. Removes all of the # newline characters. Replaces custom non-ASCII characters with the # standard non-ASCII characters - i.e., © becomes © # If the file is non-text as per the -T test, Get_String returns null. sub Get_String { local $_; my $FileName = shift; my $FileText = ''; unless (-T $FileName) { print "\n"; } else { if (open(FILE,"<$FileName")) { binmode(FILE); read(FILE, $FileText, $Rules{'Max Characters: File'}); close(FILE); return $FileText; } else { print "\n"; } } return ''; } # ---------------------------------------------------------------------- # Opens an unbuffered socket with $THEM on $PORT. sub OpenSocket { local $_; my ($THEM, $PORT, $Error) = @_; # new code 04/08/98 taken from the client-server code samples at CPAN. # should solve socket errors experienced on BSDI 3.0. I can't find the # URL to the original anymore (CPAN *really* needs search functionality!) unless (socket(HTTP, PF_INET, SOCK_STREAM, getprotobyname('tcp'))) { $$Error = "the low-level socket() function failed with system error \"$!\""; return 0; } my $HexIP = ''; if ($HashIP{$THEM}) { $HexIP = $HashIP{$THEM}; } else { $HexIP = inet_aton($THEM); $HashIP{$THEM} = $HexIP; } if ((!$HexIP) || ($HexIP eq 'fail')) { $HexIP = 'fail'; # This step saves a lookup if the resolution failed while searching for robots.txt... $$Error = "hostname $THEM does not have a DNS entry (no corresponding IP address could be found for this machine). The address may have been mistyped, the site may no longer be online, it's domain may have expired, or network errors could have prevented resolution."; return 0; } unless (connect(HTTP, sockaddr_in($PORT,$HexIP))) { $$Error = "connect() failed with system error \"$!.\" Typically connect errors involve unreachable or non-functional servers, incorrect port numbers, local DNS problems, or a corrupt TCP environment"; return 0; } select(HTTP); $|=1; select(STDOUT); return 1; } sub GetRobotFile { local $_; my ($THEM, $PORT, $ForbidString) = @_; local $_; if ($ForbidString) { $ForbidString .= '|'; } else { $ForbidString = ''; } $ForbidString .= '('; $ForbidString .= quotemeta("$THEM.robot"); $ForbidString .= ')'; my $SocketError = ''; unless (OpenSocket($THEM, $PORT, \$SocketError)) { # access failed at the socket level. we aren't going to worry about this print "\n"; return $ForbidString; } print HTTP "GET /robots.txt HTTP/1.0\r\n"; print HTTP "Host: $THEM\r\n"; print HTTP "User-Agent: $Rules{'Crawler: User Agent'}\r\n"; if (keys %SessionCookie) { print HTTP "Cookie: "; my $buffer = ''; foreach (keys %SessionCookie) { print HTTP "$buffer$_=$SessionCookie{$_}"; $buffer = '; '; } print HTTP "\r\n"; } print HTTP "\r\n"; my $AgentToggle = 0; my $ForbiddenAgent = ''; print "\n"; while () { if (m!user-agent: ([^\r]+)\r?$!i) { $ForbiddenAgent = $1; $ForbiddenAgent =~ s!\r!!g; $ForbiddenAgent =~ s!\n!!g; $ForbiddenAgent = quotemeta($ForbiddenAgent); if (($Rules{'Crawler: User Agent'} =~ m!$ForbiddenAgent!i) || ($ForbiddenAgent eq '\*')) { $AgentToggle = 1; } else { $AgentToggle = 0; } } next unless (m!disallow:\s+([^\r]+)\r?$!i); next unless ($AgentToggle == 1); my $BadURI = Trim($1); $BadURI =~ s!\n!!; next unless ($BadURI); print "\n"; $ForbidString .= '|('.$THEM; $ForbidString .= quotemeta($BadURI); $ForbidString .= ')'; } close(HTTP); return $ForbidString; } # ---------------------------------------------------------------------- # Usage: ($URL, $HTMLText, $nFileSize, $RobotError) = GetStringByURL($URL); # the $URL is returned because it may be a 302 redirect and only the final # URL is stored in the database. sub GetStringByURL { local $_; my ($URL) = @_; my ($HTMLText, $nFileSize, $RobotError) = ('', 0, ''); unless ($URL =~ m!^http://([\w-\.]+):?(\d*)($|/(.*))!) { $RobotError = "Unrecognized URL format for $URL (protocol not supported?)\n"; return ($URL, $HTMLText, $nFileSize, $RobotError); } my $THEM = $1; my $PORT = $2 ? $2 : 80; my $URI = $3?$3:'/'; $URI =~ s/\#.*//; if ($Rules{'Crawler: Rogue'} != 1) { unless (($Rules{'RobotForbidden'}) && ("$THEM.robot" =~ m!^$Rules{'RobotForbidden'}$!i)) { $Rules{'RobotForbidden'} = GetRobotFile($THEM,$PORT,$Rules{'RobotForbidden'}); } if ($URL =~ m!($Rules{'RobotForbidden'})!i) { $RobotError = "This server's robots.txt file forbids access to the document"; return ($URL, $HTMLText, $nFileSize, $RobotError); } } my $SocketError = ''; unless (OpenSocket($THEM, $PORT, \$SocketError)) { $RobotError = "error connecting to $THEM on port $PORT - $SocketError\n"; return ($URL, $HTMLText, $nFileSize, $RobotError); } my $CRLF = chr(13) . chr(10); print HTTP "GET $URI HTTP/1.0$CRLF"; print HTTP "Host: $THEM$CRLF"; print HTTP "User-Agent: $Rules{'Crawler: User Agent'}$CRLF"; if (keys %SessionCookie) { print HTTP "Cookie: "; my $buffer = ''; foreach (keys %SessionCookie) { print HTTP "$buffer$_=$SessionCookie{$_}"; $buffer = '; '; } print HTTP $CRLF; } print HTTP $CRLF; my $StatusLine = 0; my $Location = ''; # Status OK? $StatusLine = ; if ($StatusLine =~ m!(301|302)!) { while () { if (m!^Set\-Cookie:\s+([^\=]+)\=([^\;]+)!i) { $SessionCookie{$1}=$2; } last if (m!^\r?$!); next unless (m!^Location\:\s+([^\r\n\s]+)!i); $Location = GetAbsoluteAddress($1,$URL); } if ($Location) { return ($Location, 302, $nFileSize, $RobotError); } else { $RobotError = "Received redirect status line - $StatusLine - without a corresponding Location: header."; return ($URL, $HTMLText, $nFileSize, $RobotError); } } elsif ($StatusLine !~ m!200!) { $RobotError = "System did not return a normal HTTP \"200 OK\" status header. The header returned was \"$StatusLine\"."; return ($URL, $HTMLText, $nFileSize, $RobotError); } # Get HTTP headers and return an error if there is a content-type # header which doesn't pattern match to text. No content-type headers # are okay for now (may change if it's a problem): while () { # print "Header:$_
\n"; if ((m!^Content\-Type\:\s+([^\r]*)\r?$!i) && (!(m!text!i))) { print "\n"; $RobotError = "invalid content-type header returned ($1)"; return ($URL, $HTMLText, $nFileSize, $RobotError); } elsif (m!^Content-Length:\s*(\d+)!i) { $nFileSize = $1; } elsif (m!^Set-Cookie:\s+([^\=]+)\=([^\;]+)!i) { $SessionCookie{$1}=$2; } # blank line follows headers: last if (m!^\r?$!); } read(HTTP, $HTMLText, $Rules{'Max Characters: File'}); close(HTTP); # strip newlines: $HTMLText =~ s!\r|\n! !mg; if ($HTMLText =~ m!]+)!i) { if ($2 < 10) { return (GetAbsoluteAddress($3, $URL), 302, $nFileSize, $RobotError); } } # toggle $SetSaveLinks: if ($SetSaveLinks == -1) { $SetSaveLinks = 1; } # Check for ROBOTS meta tags: if (($Rules{'Crawler: Rogue'} != 1) && ($HTMLText =~ m!]+)!i)) { my $RobotsDirectives = $7; my $NoIndex = ($RobotsDirectives =~ m!(NONE|NOINDEX)!i); my $NoFollow = ($RobotsDirectives =~ m!NOFOLLOW!i); if ($NoIndex) { # Early abort: we don't bother with gathering links from a page we # can't index. $RobotError = 'the robots meta tag of this document forbids indexing'; return ($URL, $HTMLText, $nFileSize, $RobotError); } if ($NoFollow) { $SetSaveLinks = -1; # means turn off for this page, turn on with next. print "\n"; } } # No content (arbitrary small byte limit): unless (length($HTMLText) > $Rules{'Minimum Page Size'}) { $RobotError = "Returned ".length($HTMLText)." bytes of text; minimum requirement is $Rules{'Minimum Page Size'} bytes."; return ($URL, $HTMLText, $nFileSize, $RobotError); } unless ($nFileSize) { $nFileSize = length($HTMLText); } my $NumSpaces = ($HTMLText =~ s! ! !g); if (($NumSpaces/length($HTMLText)) < $Rules{'Crawler: Minimum WhiteSpace'}) { $RobotError = "Suspicious content - only $NumSpaces blank spaces in " . length($HTMLText) . " characters. \n"; $RobotError .= "This is forbidden by the 'WhiteSpace Ratio' set up in the \$Rules{} array"; return ($URL, $HTMLText, $nFileSize, $RobotError); } else { return ($URL, $HTMLText, $nFileSize, $RobotError); } } # ---------------------------------------------------------------------- # This function translates high-bit Latin characters, and their HTML # expansions, into their English approximations. For example: # Â => A # Ã => A # Ã => A # This forms the basis for support of Latin languages in this search # engine. Because many end users do not have keyboards allowing them # to type in "Ã" or whatever, they will type in "A" instead. By # translating both the raw text and the user search terms with this # function, I will be able to match their needs. # Drawback: words like "für" in German will get false matches to words # like "fur" in English. This is fairly rare though. # Translation table based on Ian Graham's list at: # http://www.utoronto.ca/webdocs/HTMLdocs/NewHTML/iso_table.html sub RawTranslate { local $_ = shift; # Strip tabs and newline characters; replace with whitespace: tr!\n\r\t! !; s'\cM' 'og; s!&(.)(acute|grave|circ|uml|tilde);!$1!og; s'(÷|&(nbsp|divide);)' 'og; s'(&#(192|193|194|195|196|197|224|225|226|227|228|229|230);|À|Á|Â|Ã|Ä|Å|à|á|â|ã|ä|æ|å|&(.ring|aelig);)'a'og; s'(ß|ß|ß)'b'og; s'(&#(199|231);|Ç|ç|&.cedil;)'c'og; s'(&#(198|200|201|202|203|232|233|234|235);|Æ|È|É|Ê|Ë|è|é|ê|ë|Æ)'e'og; s'(&#(204|205|206|207|236|238|239);|Ì|Í|Î|Ï|ì|í|î|ï)'i'og; s'(&#(209|241);|ñ|Ñ)'n'og; s'(&#(216|210|211|212|213|214|240|242|243|244|245|246|248);|Ø|Ò|Ó|Ô|Õ|Ö|ð|ò|ó|ô|õ|ö|ø|&(.slash|eth);)'o'og; s'(&#(217|218|219|220|249|250|251|252);|Ù|Ú|Û|Ü|ù|ú|û|ü)'u'og; s'(&#(222|254);|Þ|þ|þ)'p'og; s'(×|×|×)'x'og; s'(&#(221|253);|Ý|ý)'y'og; return $_; } sub CompressStrip { local $_ = ' '.$_[0].' '; tr[A-Z][a-z]; $_ = RawTranslate($_); s'(\W|_)' 'og; s'\s+' 'og; eval($StripIgnoreWords); s'\s+' 'og; return $_; } sub CreateRealm { local $_; my ($Name, $File, $BaseDir, $BaseURL) = @_; print "

Attempting to create realm named \"$Name\".

\n"; for ($Name, $BaseDir, $BaseURL) { s!\r|\n|\|!!g; } my $Remains = $Name; $Remains =~ s!\w!!g; $Remains =~ s!\s!!g; if (($Remains) or (!($Name))) { print "

Error: You need to supply a normal name for your realm -- if can only have alpha-numerics and spaces.

"; return 1; } unless ($File) { print "

Error: you need to supply a valid filename, or the word 'RUNTIME', in the File box.

"; } elsif (($BaseDir) and (not ((-e $BaseDir) or (-R $BaseDir)))) { print "

Error: You've tried to create a realm with a base directory of $BaseDir, but that folder does not exist or is not readable.

\n"; } elsif ($File eq 'RUNTIME') { print "

Created new realm named '$Name'. This is a Runtime realm, and will be searched with each user query.

"; } elsif (not (Append($File, '', 1))) { print "

Error: the index file is not writable; returned error: $!

"; } elsif ((DeleteRealm($Name,1)) and (Append($RealmFile, "\r\n$Name|$File|$BaseDir|$BaseURL||\r\n", 1))) { print "

Created new realm named $Name - choose "; if ($BaseDir) { print "Index Files"; } else { print "Crawl Pages"; } print " from the Administrator's Page to populate the index.

\n"; } else { print "

Error: could not append to file $RealmFile. The system returned: $!

"; } # Add entries to pending pages file: Pending: { last Pending if ($File eq 'RUNTIME'); my @NewRecords = (); my $RealmName = webEncode($Name); my $Time = time(); unless (open(FILE,"<$File")) { # cannot open file - fail gracefully print "

Error: cannot read from this realm's index file '$File' - $!.

\n"; last Pending; } binmode(FILE); while () { next unless m!^\d+ u= (.*?) t=!; push(@NewRecords, "$1 $RealmName $Time\n"); } close(FILE); print "

Found ".(scalar @NewRecords)." pages already existing in index file '$File'.

\n"; unless (scalar @NewRecords) { # no records in this file last Pending; } unless (open(FILE,"<$PendingPagesFile")) { print "

Error - could not read from file '$PendingPagesFile' - $!.

\n"; last Pending; } binmode(FILE); my @OldRecords = ; close(FILE); unless (open(FILE,">$PendingPagesFile")) { print "

Error - could not write to file '$PendingPagesFile' - $!.

\n"; last Pending; } binmode(FILE); my ($Current, $Previous) = ('', ''); my ($CurrentNum, $PreviousNum) = (0, 0); foreach (sort (@OldRecords, @NewRecords)) { next unless m!^(.*?) (.*?) (\d+)$!; $Current = "$1 $2"; $CurrentNum = $3; if ($Current ne $Previous) { print FILE "$Current "; # print largest number: # 2 beats 0, so a failed index attempt (2) over rides a waiting-to-index record (0) # 234 beats 0 and 2, so an existing record betas both a failed attempt and a wait-to-index print FILE (($CurrentNum > $PreviousNum) ? $CurrentNum : $PreviousNum); print FILE "\n"; } $Previous = $Current; $PreviousNum = $CurrentNum; } close(FILE); chmod($FileMask, $PendingPagesFile); print "

Updated pending file '$PendingPagesFile' with new records.

\n"; } return 1; } sub DeleteRealm { local $_; my ($Name,$Quiet) = @_; $Quiet = 0 unless $Quiet; Append($RealmFile,'',1); # create 0-byte file if possible, if not present. unless (open(FILE, "<$RealmFile")) { print "

Error: could not read from realm file '$RealmFile' - $!

\n"; return 0; } my $Text = ''; binmode(FILE); foreach () { if (m!^$Name\|!i) { print "

Removed a realm named $Name.

\n" unless $Quiet; } else { $Text .= $_; } } close(FILE); unless (open(FILE, ">$RealmFile")) { print "

Error: could not write to realm file '$RealmFile' - $!

\n"; return 0; } binmode(FILE); print FILE $Text; close(FILE); chmod($FileMask, $RealmFile); my $File = $IndexFile{$Name} || ''; if (($File) && (-e $File)) { print "

The data file for this realm still exists, so you can restore everything by simply creating another realm which uses the same datafile ($File). To recover disk space, you can manually delete this file. It is ".int((-s $IndexFile{$Name})/1000)." kB.

" unless $Quiet; } print "

Finished updating the realm file.

\n" unless $Quiet; # Remove entries from the pending pages file: Pending: { last Pending if $Quiet; unless (open(FILE, "<$PendingPagesFile")) { print "

Error - unable to read from file '$PendingPagesFile' - $!

"; last Pending; } binmode(FILE); my $RealmName = quotemeta(webEncode($Name)); my $nRemoved = 0; my @Records = (); while () { if (m!^http://.*? $RealmName \d+!) { $nRemoved++; next; } push(@Records, $_); } close(FILE); unless (open(FILE, ">$PendingPagesFile")) { print "

Error - unable to write to file '$PendingPagesFile' - $!

"; last Pending; } binmode(FILE); foreach (@Records) { print FILE; } close(FILE); chmod($FileMask, $PendingPagesFile); print "

Successfully removed $nRemoved records from the file '$PendingPagesFile'

"; } return 1; } sub CreateRealmForm { local $_; my ($Name) = @_; $Name = '' unless $Name; my $File = $IndexFile{$Name} || ''; my $BaseDir = $BaseDir{$Name} || ''; my $BaseURL = $BaseURL{$Name} || ''; print <<"EOM";
$HiddenPass

Create/Update Realm Form

Name: Supply normal name; must be alphanumerics and spaces.
Index File: To build the index anew with every search, use the word "RUNTIME". For most cases, though, supply a filename. It will be created if it doesn't exist.
Base directory: * Applies to local realms only
Base URL: * Applies to local indexed realms only


Example - Realm for searching remote sites:

This is the most common realm. Pages in this realm are indexed by the crawler. There is no relationship to a particular base directory on this server, so the lower two inputs are left blank.

Name: Supply normal name; must be alphanumerics and spaces.
Index File: Supply a filename. It will be created if it doesn't exist.
Base directory: * Applies to local realms only (left blank)
Base URL: * Applies to local indexed realms only (left blank)

Example - Realm for indexing the local site:

This is specifically for indexing all the files in a particular directory on your server. The web address corresponding to that directory must also be supplied. These realms cannot contain remote pages. However, they are nice for local pages because administration is easy - the entire index is built with one click.

Name: Supply normal name; must be alphanumerics and spaces.
Index File: Supply a filename. It will be created if it doesn't exist.
Base directory: * Applies to local realms only
Base URL: * Applies to local indexed realms only

Example - Realm for searching, but not indexing, the local site:

This type of realm is very inefficient, since all files are searched each time a user types in a search term. The advantage is that the searches are always up-to-date. No index file is used here.

Name: Supply normal name; must be alphanumerics and spaces.
Index File: To build the index anew with every search, use the word "RUNTIME".
Base directory: * Applies to local realms only
Base URL: * Applies to local indexed realms only
EOM return 1; } sub PrintAddRemoteSiteForm { local $_; my ($Title, $SubTitle, $Realm, $IsPublic) = @_; my $ChooseRealmLine = SelectRealmList($Realm); print "$Title:\n
\n"; print <<"EOM" if ($ChooseRealmLine ne '
');

$SubTitle

EOM my $varName = 'AddSite'; unless ($IsPublic) { $varName = 'NewURL'; print <<"EOM"; $HiddenPass EOM } if ($ChooseRealmLine ne '
') { print <<"EOM"; EOM if ($FORM{'Mode'} eq 'Admin') { print "\n"; } print <<"EOM";
Web Address: 
$ChooseRealmLine

Index entire site (may take a long time)

EOM } else { print "

No Realms have been created yet. Please use the link on the Admin page to Create a New Realm, and then you will be able to add web pages.

\n"; } print '
'; } sub HTML_UI { local $_; my ($Realm) = @_; my $ChooseRealmLine = SelectRealmList($Realm); &PrintAddRemoteSiteForm('Index Remote Web Page','

Enter a web address to start the web crawler. When you click the Add New Page button, this script will retrieve the page over the web, and save its contents in the index. Later you may follow any or all links found in the document.

','',0); print <<"EOM";

Review and Maintain Entries:

EOM &PrintMaintainenceForm; print <<"EOM";
Create a New Realm
Allows you to create a new local or remote realm.
EOM print <<"EOM" if (scalar (keys %IndexFile)); # i.e., only if there are any indexes.

Edit Record:

If you know the exact web address for the record you'd like to edit, enter it and then click Edit Specific Address.

$HiddenPass
Web Address: 
$ChooseRealmLine

EOM sub PrintMaintainenceForm { local $_; my ($Realm) = @_; my $ChooseRealmLine = SelectRealmList($Realm); print <<"EOM";

Each link is a command. The commands are explained below.

EOM # print maintenance options for each realm: my $RealmName = ''; foreach $RealmName (sort keys %IndexFile) { print ""; print ""; if (($IndexFile{$RealmName} eq 'RUNTIME') or (!$IndexFile{$RealmName}) or (!(-e $IndexFile{$RealmName}))) { print ""; } else { print ""; my @Date = localtime((stat($IndexFile{$RealmName}))[9]); my $PM = ($Date[2] > 12) ? 'pm' : 'am'; $Date[2] -= 12 if ($Date[2] > 12); foreach (1..3) { $Date[$_] = sprintf("%02.f", $Date[$_]); } my $Date = "$Date[2]:$Date[1]$PM, $Date[3] $MonthNames[$Date[4]] ".(1900 + $Date[5]); print ""; } my $EncodeRealmName = escape($RealmName); print ""; print ""; print ""; if (($BaseDir{$RealmName}) and ($IndexFile{$RealmName} ne 'RUNTIME')) { print ""; } elsif ($BaseDir{$RealmName}) { print ""; } else { print ""; } print ""; } print <<"EOM";
Name Size
(in kB)
Last Updated Configure
Realm
Delete
Realm
Review
Pages
Rebuild Revisit
Old Pages
Expand
Realm
$RealmNameN/AN/A".Commas(int((-s $IndexFile{$RealmName})/1000))."$DateConfigureDeleteReviewIndex All FilesN/AN/ARuntimeN/AN/AIndex All PagesRevisitExpand
Configure Realm
Change the name or index file. For local realms, can also change the base directory and base URL.

Delete Realm
Removes the realm from the list. The index file is not removed.

Review Pages
Lists all the documents in your index file. You can edit or delete individual web pages.

Index All Files
Completely rebuilds the index file for local realms.

Index All Pages
Completely rebuilds the index file for remote realms. The crawler revisit each page. This function may take a long time.

Revisit Old Pages
Crawler indexes any remote web page that hasn't been visited in the last $Rules{'Crawler: Days Til Refresh'} days. This command should be run periodically, to keep the index file fresh.

Expand Realm
Crawler indexes any remote web page that hasn't been visited in the last $Rules{'Crawler: Days Til Refresh'} days, and then searches new web pages that are linked from existing documents. This command used to be called "Maintain Realm".

EOM } print <<"EOM";

Usage Statistics:

See Usage Statistics on visitor searches.
EOM } # This sub-procedure adds the $Record for a specific $URL to an # an index file ($File). The entire index file is scanned to # ensure that no duplicates are added - if other entries for $URL # exist, they are removed with a warning, and in any case the # $Record is added to the end of $File. # usage: # ($EntryCount, $Duplicates, %UpdateResults) = UpdateDB($IndexFile, %GlobalSpiderResults); sub UpdateDB { local $_; my ($File, %GlobalSpiderResults) = @_; my ($EntryCount, $Duplicates) = (0, 0); my %Results = (); # Check: is $TempFile in use? if (-e "$TempFile.lock") { # yep, it's in use. sleep for 3 seconds and try again... sleep(3); if (-e "$TempFile.lock") { # still in use. wait another 3 seconds... sleep(3); if (-e "$TempFile.lock") { # okay, we need to return an error. # what was the PID and timestamp of the locking process? my $lockTimeStamp = scalar localtime((stat("$TempFile.lock"))[9]); my $lockPID = ''; if (open(LOCK,"<$TempFile.lock")) { $lockPID = 'is '.; close(LOCK); } else { $lockPID = "could not be read from the lock file because the filesystem returned the error \"$!\" when a read was attempted"; } print <<"EOM";


Error: This operation could not be completed because the database system is locked by another process which is trying to edit the same data. The PID of this process $lockPID. This script waited 6 seconds for the lock to be released, but it was not released. The lock has been in place since $lockTimeStamp.

You may want to wait a few minutes, verify that no other users are editing the index files, and then attempt this operation again (by reloading this script in your browser.) If that does not work, then click here to force an unlock of the temp file.

This is a critical error which overrides any other error or success messages written below.


EOM return (0,0); # early exit with no disk I/O } # not locked after 6 seconds, continue... } # not locked after 3 seconds, continue... } # Not locked, okay so now we're going to lock it: if (open(LOCK,">$TempFile.lock")) { binmode(LOCK); print LOCK $$; # $$ is perl's var for PID close(LOCK); chmod($FileMask, "$TempFile.lock"); } else { print <<"EOM";

Error: This operation could not be completed, because this process was unable to place an exclusive lock on the database files. A lock is placed by writing to the file $TempFile.lock - writing to this file failed with the error message $!.

You may want to wait a few minutes, verify that no other users are editing the index files, and then click here to force an unlock of the temp file.

EOM return (0,0); # early exit with no disk I/O } unless (open(NEWFILES,">$TempFile")) { print "Error: could not write to temp file $TempFile in $DataFilesDir - $!\n"; return (0,0); } binmode(NEWFILES); if (!(open(ALLFILES,"<$File"))) { # open failed - this is okay if the file doesn't exist yet, fatal otherwise. if (-e $File) { print "Error: could not read from $File in $DataFilesDir - $!\n"; return (0,0); } # didn't exist, we need to create it so that the rename() operations work elsif (open(TEMP,">>$File")) { # opened okay, write one blank line: binmode(TEMP); print TEMP "\n"; close(TEMP); chmod($FileMask, $File); } else { print "Error: $File does not exist and could not be created - $!\n"; return (0,0); } } else { my $Record = ''; RECORD: while ($Record = ) { # compare whether an existing entry is there: my $URL = ''; foreach $URL (keys %GlobalSpiderResults) { if ($Record =~ m!u= $URL !) { $Duplicates++; $Results{$URL} = "Old record for this URL was removed. "; next RECORD; # this url just got skipped } } # no duplicate entry, go ahead and print it. print NEWFILES $Record; $EntryCount++; } } my ($URL, $fURL) = ('', ''); foreach $URL (keys %GlobalSpiderResults) { foreach $fURL (@ForbidSites) { next unless ($URL =~ m!^$fURL!i); if ($Results{$URL}) { $Results{$URL} .= "This site is forbidden. "; } else { $Results{$URL} = "This site is forbidden. "; } $GlobalSpiderResults{$URL} = '-1'; } if ($GlobalSpiderResults{$URL} ne '-1') { print NEWFILES $GlobalSpiderResults{$URL}; if ($Results{$URL}) { $Results{$URL} .= "New record for this URL added. "; } else { $Results{$URL} = "New record for this URL added. "; } $EntryCount++; } elsif ($Results{$URL}) { $Results{$URL} .= "No new data was entered into the database. "; } else { $Results{$URL} = "No new data was entered into the database. "; } } close(NEWFILES); close(ALLFILES); chmod($FileMask, $TempFile); my $TempSize = (-s $TempFile); if ($TempSize > $Rules{'Max Index File Size'}) { # The temp file is too big - abort everything: print <<"EOM";


Error: Indexing failed, because this database file has exceeded the maximum size limit of $Rules{'Max Index File Size'} bytes, which is set in the \$Rules{'Max Index File Size'} variable. Index file "$File" would have grown to $TempSize bytes had this process completed successfully.

To allow these web pages into your index, edit this script and increase the bytesize limit; or, delete some addresses from your database.

The entire transaction has been aborted - ignore any success or failure messages below, and ignore the address count.


EOM # Done with our disk I/O - clear the lock: unless (unlink("$TempFile.lock")) { print "


Warning: the disk operation completed successfully, but this process was unable to clear the lock on the database system. Future operations will fail until $TempFile.lock has been deleted from the \$DataFilesDir.

You may want to wait a few minutes, verify that no other users are editing the index files, and then click here to force an unlock of the temp file.


"; } return (0,0); } # next routines straight from: # http://www.xav.com/perl/pod/perlfaq5/How_do_I_change_one_line_in_a_fi.html if ((-e $BackFile) && (!(unlink($BackFile)))) { print "Error: could not delete $BackFile - $!\n"; return (0,0); } unless (rename($File, $BackFile)) { print "Error: could not rename $File to $BackFile - $!.\n"; return (0,0); } unless (rename($TempFile, $File)) { print "Error: could not rename $TempFile to $File - $!.\n"; return (0,0); } # Done with our disk I/O - clear the lock: unless (unlink("$TempFile.lock")) { print "Warning: the disk operation completed successfully, but this process was unable to clear the lock on the database system. Future operations will fail until $TempFile.lock has been deleted from the \$DataFilesDir.

You may want to wait a few minutes, verify that no other users are editing the index files, and then click here to force an unlock of the temp file.

"; } return ($EntryCount, $Duplicates, %Results); } # assumes trailing /'s have been added to URL's of form http://www.c2.org sub AddURL { local $_; my ($tag, @AddressesToIndex) = @_; my $Realm = $FORM{'Realm'}; if ($FORM{'Action'} eq 'ReCrawlRealm') { $FORM{'Batch'}++; print "

Rebuilding the \"$Realm\" Database:
Indexing pages that haven't been visited in the last $FORM{'DaysPast'} days.
Running in automatic mode on batch $FORM{'Batch'} of $FORM{'TotalBatches'}. [click here to stop]

\n"; } elsif ($FORM{'Action'} eq 'CrawlEntireSite') { $FORM{'Batch'}++; print "

Indexing \"$FORM{'LimitSite'}\" for the \"$Realm\" Database:
Running in automatic mode (batch $FORM{'Batch'}). Status: $FORM{'LimitIndexed'} documents indexed; $FORM{'LimitFailed'} failed; $FORM{'LimitPending'} waiting to be indexed. [click here to stop]

\n"; } elsif ($tag == 1) { print "

Maintaining the \"$Realm\" Database:

\n"; } else { print "

Adding Web Pages to the \"$Realm\" Database:

\n"; } print 'Crawling remote sites and updating the database may take a long time. Please be patient...'; $|=1; $SetSaveLinks = 1; # This array will hold the URLs that are forbidden by the various # robots.txt files on the servers: # $Rules{'RobotForbidden'}; # The data structure is: # $Rules{'RobotForbidden'} = 'server.name.com.hit|server.name.com/cgi-bin|server.name.com/logs'; # all disallow: /uri entries are linked in a pipe-delimited string and a pattern # match is done against the url: # example: server.name.com/cgi-bin|server.name.com/logs would be the string # if the robots.txt contains: # User-agent: * # Disallow: /cgi-bin # Disallow: /logs # then the url would be pattern matched as if ($URL =~ m!$Forbidden!) {&no_access} # this should be a very small data structure and so a string is used instead of a # more expensive associative array. my %UserResults = (); my $NumRank = 0; my $NumRedirectsFollowed = 0; my $MaxAddresses = scalar @AddressesToIndex; my $AddressIndex = 0; my @IndexedAddresses = (); ADDRESS: for ($AddressIndex = 0; $AddressIndex < $MaxAddresses; $AddressIndex++) { print '.'; if ($Rules{'Crawler: Max Pages Per Batch'} <= $AddressIndex) { push(@IndexedAddresses,'DONE'); $UserResults{'DONE'} = "

Crawler Finished
"; $UserResults{'DONE'} .= "
The crawler has indexed "; $UserResults{'DONE'} .= "$Rules{'Crawler: Max Pages Per Batch'} web pages. "; $UserResults{'DONE'} .= "It will wait for further input from you. The "; $UserResults{'DONE'} .= "maximum number of web pages searched per batch can be adjusted using the variable 'Crawler: "; $UserResults{'DONE'} .= "Max Pages Per Batch' in the \$Rules array.
\n"; last ADDRESS; } my $URL = Trim($AddressesToIndex[$AddressIndex]); if ($URL !~ m!^http://!i) { $NumRank++; $UserResults{$URL} = "

$NumRank. Error: $URL
\n"; $UserResults{$URL} .= "
This web address does not being with the string \"http://\". It will not be indexed. "; $UserResults{$URL} .= "This search engine queries only standard HTTP/1.0 and 1.1 compliant non-encrypted web servers, not secured https:// sites, nor "; $UserResults{$URL} .= "FTP nor gopher servers.
"; $GlobalSpiderResults{$URL} = -1; # delete any matching entry push(@IndexedAddresses, $URL); next ADDRESS; } my $OldURL = $URL; my ($Text, $nFileSize, $RobotError) = ('', 0, ''); ($URL, $Text, $nFileSize, $RobotError) = GetStringByURL($URL); if (($Text eq '302') && ($NumRedirectsFollowed < $Rules{'Crawler: Max Redirects'})) { $NumRank++; if (($FORM{'Action'} eq 'CrawlEntireSite') and ($URL !~ m!^$FORM{'LimitSite'}!i)) { $UserResults{$OldURL} = "

$NumRank. Redirect: $OldURL
\n"; $UserResults{$OldURL} .= "
This page is redirecting all traffic to \"$URL\". Since the destination address falls outside of the $FORM{'LimitSite'} restriction, this document will be ignored."; $GlobalSpiderResults{$URL} = -1; # delete any matching entry next ADDRESS; } $UserResults{$OldURL} = "

$NumRank. Redirect: $OldURL
\n"; $UserResults{$OldURL} .= "
This page is redirecting all traffic to \"$URL\". That page will be indexed instead."; $GlobalSpiderResults{$OldURL} = -1; # delete any matching entry if ($URL =~ m!http://([^\/]+)$!) { $URL .= '/'; } @AddressesToIndex = ('',@AddressesToIndex); # add offset at left side $AddressesToIndex[$AddressIndex+1] = $URL; # overwrite this URL with redir. $MaxAddresses++; $NumRedirectsFollowed++; push(@IndexedAddresses,$OldURL); next ADDRESS; } elsif ($Text eq '302') { $NumRank++; $UserResults{$OldURL} = "

$NumRank. Redirect: $OldURL
\n"; $UserResults{$OldURL} .= "
This page is redirecting all traffic to \"$URL\". However, the maximum of $Rules{'Crawler: Max Redirects'} redirects per session has been exceeded and so this URL is being deleted altogether. You may also set \$Rules{'Crawler: Max Redirects'} to a larger number and try again."; $GlobalSpiderResults{$OldURL} = -1; # delete any matching entry push(@IndexedAddresses,$OldURL); next ADDRESS; } elsif ($RobotError) { $NumRank++; $UserResults{$URL} = "

$NumRank. Error: $URL
\n"; $UserResults{$URL} .= "
The crawler encountered an error: $RobotError."; $GlobalSpiderResults{$URL} = -1; # delete any matching entry push(@IndexedAddresses,$URL); next ADDRESS; } my ($Title, $Description) = ('', ''); my $RecordLine = MakeRecord($URL,'',$Text, $nFileSize, \$Title, \$Description); $GlobalSpiderResults{$URL} = $RecordLine; my $ByteSize = length($Text); my ($DD,$MM,$YYYY) = unpack('A2A2A4',substr($RecordLine,2,8)); $NumRank++; my $Month = $MonthNames[$MM]; $UserResults{$URL} = AdminVersion($NumRank, $URL, $Title, $Description, $ByteSize, $DD, $Month, $YYYY, 0); push(@IndexedAddresses,$URL); next ADDRESS; } print '
'; # All of the disk i/o is in the sub-procedure: my ($EntryCount, $Duplicates, %UpdateResults) = UpdateDB($IndexFile{$Realm}, %GlobalSpiderResults); ADDRESS: foreach (@IndexedAddresses) { if ($UserResults{$_}) { print $UserResults{$_}; } if ($UpdateResults{$_}) { print "
[ $UpdateResults{$_} ]
\n"; } else { print "\n"; } print ''; } my $textDup = ''; if ($Duplicates == 0) { $textDup = 'No entries were removed.'; } elsif ($Duplicates == 1) { $textDup = 'One older entry was removed.'; } else { $textDup = $Duplicates.' older entries have been removed.'; } print <<"EOM";

There are now $EntryCount addresses in the "$Realm" search database. $textDup

EOM if ($FORM{'Action'} eq 'ReCrawlRealm') { if ($MaxAddresses) { my $NextLink = $SCRIPT_NAME . '?Mode=Admin&Action=ReCrawlRealm&TotalBatches='.$FORM{'TotalBatches'}.'&Batch='.$FORM{'Batch'}.'&DaysPast='.$FORM{'DaysPast'}.'&StartTime='.$FORM{'StartTime'}.'&Realm='.webEncode($Realm).$StringPass; print <<"EOM";

Continue with Rebuild:

This page should automatically refresh in 10 seconds in order to continue with the rebuild. If it does not, click here to continue.

EOM } else { print <<"EOM";

Rebuild Complete:

The "$Realm" realm has been rebuilt.

EOM } } elsif ($FORM{'Action'} eq 'CrawlEntireSite') { if ($MaxAddresses) { my $NextLink = $SCRIPT_NAME . '?Mode=Admin&Action=CrawlEntireSite&LimitSite='.$FORM{'LimitSite'}.'&Batch='.$FORM{'Batch'}.'&StartTime='.$FORM{'StartTime'}.'&Realm='.webEncode($Realm).$StringPass; print <<"EOM";

Continue with Index:

This page should automatically refresh in 10 seconds in order to continue with indexing the site "$FORM{'LimitSite'}". If it does not, click here to continue.

EOM } else { print <<"EOM";

Index Complete:

The site "$FORM{'LimitSite'}" has been indexed.

EOM } } elsif ($tag == 1) { print '

Continue Maintence:

'; &PrintMaintainenceForm($Realm); print '
'; } my %SaveLinks = SaveLinksToFile($PendingPagesFile, $Realm, @IndexedAddresses); return 1 if (($FORM{'Action'} eq 'ReCrawlRealm') or ($FORM{'Action'} eq 'CrawlEntireSite')); &PrintAddRemoteSiteForm('Add Another Web Site','',$Realm,0); my $LinkCount = scalar (keys %SaveLinks); if ($LinkCount) { print <<"EOM";

Embedded Links:

$HiddenPass Click on the Index button to search all of the links listed below. Or, click on each link separately to index that page. Clicking a link causes the robot to search it - you won't actually visit that page.
EOM if ($LimitSite) { print ""; } print <<"EOM"; [ clear all ] [ check all ]
EOM my $QueryString = "&Realm=$FORM{'Realm'}&Action=$FORM{'Action'}&Mode=Admin$StringPass"; $QueryString =~ tr! !+!; $LinkCount = 1; my $PastTime = time - (86400 * $Rules{'Crawler: Days Til Refresh'}); my ($UnSearched,$OutDated,$Searched,$Failed,$Checked) = (0,0,0,0,1); foreach (reverse (sort {$SaveLinks{$b} <=> $SaveLinks{$a} || $a cmp $b} keys %SaveLinks)) { # print description of link type: if ($SaveLinks{$_} == 1) { if ($UnSearched == 0) { print '

The following links were found. They have not yet been indexed.

'; $UnSearched = 1; $Checked = 1; } } elsif ($SaveLinks{$_} == 2) { if ($Failed == 0) { print '

The addresses below have been encountered before, and have failed with "Redirect," "Not Found," or "Server Unreachable" errors.

'; $Failed = 1; $Checked = 0; } } elsif ($SaveLinks{$_} <= $PastTime) { if ($OutDated == 0) { print "

The following links were found in these pages. They've already been \n"; print "entered into the search index, but haven't been visited in more than \n"; print "$Rules{'Crawler: Days Til Refresh'} days. They should be indexed again.

\n"; $OutDated = 1; $Checked = 1; } } else { if ($Searched == 0) { print "

The links below have been "; print "visited within the last $Rules{'Crawler: Days Til Refresh'} days. They already exist in the index.

\n"; $Checked = 0; $Searched = 1; } } print " "; print "$_
\n"; $LinkCount++; } } else { print <<"EOM";

Embedded Links:

No embedded links were found during this crawl session. EOM } print <<"EOM";
EOM } sub SaveLinksToFile { local $_; my ($PagesFile, $Realm, @IndexedAddresses) = @_; my $webRealm = webEncode($Realm); my %SaveLinks = (); my @Global = (); # This global index contains just a list of all the URLs that have either # been indexed at one time, forbidden, or are awaiting indexing. The # format of this file is: # # URL webEncode(realm) number # # The $URL is the web address in question. webEncode(realm) is the associated # realm of this file, either the database it was entered in or the database of # the parent document linking to this one if it's awaiting indexing. # $number is either the time # that the file was last spidered, or zero if it's on the waiting list to # be spidered, or one if it has been forbidden # All pages will go in the @Global array. # Take all pages indexed during this round and assign them a value of the # current time if they were successful and a 2 if they failed. my $CurrentTime = time; foreach (@IndexedAddresses) { next if ($_ eq 'DONE'); # skip keyword if (($GlobalSpiderResults{$_}) && ($GlobalSpiderResults{$_} eq '-1')) { push(@Global,"$_ $webRealm 2"); } else { push(@Global,"$_ $webRealm $CurrentTime"); } } # Add all saved links to this array with a 0 numeric index. Also create an # associative array of them for later comparisons: foreach (@GlobalSavedLinks) { $SaveLinks{$_} = 1; push(@Global,"$_ $webRealm 0"); } # Add all links from the database: if (-e $PagesFile) { if (open(FILE,"<$PagesFile")) { # Create an array (alpha-sorted) of the links saved during this session # along with all of the links in this file: while () { s!(\r|\n)!!g; push(@Global,$_); } close(FILE); } else { # could not open it but it exists - permissions problem: print "Error: could not read from $PagesFile - $!\n"; return 0; } } unless (open(FILE,">$PagesFile")) { print "\n"; return 0; } binmode(FILE); my $LastLink = 'http://x'; my $LastTime = 0; my $LastRealm = ''; # A null final entry is needed due to the comparison algorithm: PENDING: foreach ((sort @Global),'http://x 0 0') { next unless (m!^(.+) (.+) (\d+)?$!); my ($URL, $Realm, $VisitTime) = ($1,$2,$3); if ($URL eq $LastLink) { # This URL is in the database at least twice, let the # larger number win, but let a current failure override: if (($GlobalSpiderResults{$URL}) && ($GlobalSpiderResults{$URL} eq '-1')) { $VisitTime = 2; # key number for failed request. } elsif ($LastTime > $VisitTime) { $VisitTime = $LastTime; } } elsif ($LastLink ne 'http://x') { # different pages, put the *previous* entry in the unique bucket: print FILE "$LastLink $LastRealm $LastTime\n"; # Okay, this may be a case of a spidered link that has # already got an entry. Check whether an entry exists # for this URL in $SaveLinks{} if (($SaveLinks{$LastLink}) && ($LastTime)) { # okay, it does exist, and we've found a non-zero # number which means that this page has already # been handled by the robot: $SaveLinks{$LastLink} = $LastTime; } } # set current to last for next round: $LastLink = $URL; $LastRealm = $Realm; $LastTime = $VisitTime; } close(FILE); chmod($FileMask, $PagesFile); return %SaveLinks; } # -------------------------------------------------------------------- # # Database Routines: sub SearchRunTime { local $_; my ($Realm, $DocSearch) = @_; # Make sure a base directory has been declared: unless ($BaseDir{$Realm}) { print "\n"; return 0; } # Make sure a base URL has been declared: unless ($BaseURL{$Realm}) { print "\n"; return 0; } # Remove trailing slashes if present: $BaseDir{$Realm} = $1 if ($BaseDir{$Realm} =~ m!(.*)/$!); $BaseURL{$Realm} = $1 if ($BaseURL{$Realm} =~ m!(.*)/$!); my $DMZ = ForbidFilesFromSites($BaseDir{$Realm},$BaseURL{$Realm},@ForbidSites); $DMZ .= ' '.$Exclude{$Realm}; my $FILE = ''; my ($URL, $TERM, $DEL) = ('', '', 0, 0); my @WordCount = (); my $WordMatches = 0; my ($t, $d, $k, $hdr); Record: foreach $FILE (GetFilesByDir($DMZ,$EXT,$BaseDir{$Realm})) { my $HTML_Text = Get_String($FILE); # Check for ROBOTS meta tags: if ($HTML_Text =~ m!]+)!i) { my $RobotsDirectives = $7; if ($RobotsDirectives =~ m!(NONE|NOINDEX)!i) { # Early abort: we don't bother with gathering links from a page we # can't index. print "\n"; next Record; } } my ($nFileSize, $LastModT) = (stat($FILE))[7,9]; if ($FILE =~ m!^$BaseDir{$Realm}(.*)$!) { $URL = $BaseURL{$Realm}.$1; } else { print "\n"; next Record; } my $Record = MakeRecord($URL, $LastModT, $HTML_Text, $nFileSize); $_ = $Record; eval($DocSearch); } close(ALLFILE); } # End sub SearchRunTime(); sub SearchIndexFile { local $_; my ($IndexFile, $RealmSearch) = @_; unless (open(ALLFILE,"<$IndexFile")) { # Bad fatal error - we don't have permission to open the index, or # the path to the index is wrong. Do not go fatal, just print # error to STDOUT and return null: print "\n" if ($AllowDebug == 1); return 0; } binmode(ALLFILE); eval($RealmSearch); close(ALLFILE); } # End sub SearchIndexFile(); # $FormattedSearchTerm = Format_Term($SearchTerm) # strips non-alpha_numerics, replaces with whitespace # collapses multiple whitespaces to single # collapses consecutive * to single # replaces * with ([^\s+]{0,4}) # expands url:term, title:term into regexp notation sub Format_Term { local $_ = $_[0]; s/(\*+)/$WildCard/g; if (!(/\:/)) { # if not a special case like "title:Foo" then # just strip non-alphanumerics: s!\W! !g; } elsif (/^\W?(url|host|domain)\:(.+)/i) { ($_ = $2) =~ s!\W! !g; $_ = 'uM=.*?( '.$_.' ).*?uT='; } elsif (/^\W?title:(.+)/i) { ($_ = $1) =~ s/\W/ /g; $_ = 'uT=.*?( '.$_.' ).*?uD='; } elsif (/^\W?text:(.+)/i) { ($_ = $1) =~ s/\W/ /g; $_ = 'h=.*?( '.$_.' ).*?l='; } elsif (/^\W?link:(.+)/i) { ($_ = $1) =~ s/\W/ /g; $_ = 'l=.*?( '.$_.' )'; } else { s!\W! !g; } # Add a leading and trailing space $_ = ' '.$_.' '; # Remove all ignored words: s'\s+' 'og; eval($StripIgnoreWords); s'\s+' 'og; # replace wildcard placeholder with the limited wildcard string: s!$WildCard!$WildSearch!g; # return formatted term: return $_; } # - # usage: $RecordLine = MakeRecord($URL, $LastModT, $sText, $nFileSize, \$byrefTitle, \$byrefDescription); sub MakeRecord { local $_; my ($URL, $LastModT, $sText, $nFileSize, $byrefTitle, $byrefDescription) = @_; my $FBYTES = sprintf('%06.f', $nFileSize); my ($Title, $Description, $Links, $Keywords); ($Title, $Description, $sText, $Links, $Keywords) = Extract_Meta($sText, $URL); # records use '= ' as delimiters; this cannot occur in the CompressStrip areas because # non-alphanumerics are forbidden, but they must be manually scrubbed from other fields: foreach ($URL, $Title, $Description) { s'= '=%20'og; } my $AlphaData = ' u= '.$URL; $AlphaData .= ' t= '.$Title; $AlphaData .= ' d= '.$Description; $AlphaData .= ' uM='.CompressStrip($URL); # uM= is the "searchable" URL field. $AlphaData .= 'uT='.CompressStrip($Title); $AlphaData .= 'uD='.CompressStrip($Description); $AlphaData .= 'uK='.CompressStrip($Keywords); $AlphaData .= 'h='.CompressStrip($sText); $AlphaData .= 'l='.CompressStrip($Links); $LastModT = $LastModT ? $LastModT : time; my ($DD,$MM,$YYYY) = (localtime($LastModT))[3..5]; $YYYY += 1900; my $CC = 1; foreach (@PromoteSites) { next unless ($URL =~ m!^$_!i); $CC = $Rules{'Promote Value'}; last; } for ($CC,$DD,$MM) { $_ = sprintf('%02.f',$_); } $$byrefTitle = $Title; $$byrefDescription = $Description; return "$CC$DD$MM$YYYY$FBYTES$AlphaData\n"; } sub BuildIndex { local $_; my ($Realm, $StartFile) = @_; my $StartTime = time; $| = 1; if ($StartFile > 0) { # Append, because we're continue the process: unless (open(ALLFILES,">>$IndexFile{$Realm}")) { print "BuildIndex error for realm \"$Realm\": could not write to index file \"$IndexFile{$Realm}\" in folder \"$DataFilesDir\" - system returned \"$!.\"\n"; return 0; } } else { # Do a write, because we're starting the process: unless (open(ALLFILES,">$IndexFile{$Realm}")) { print "BuildIndex error for realm \"$Realm\": could not write to index file \"$IndexFile{$Realm}\" in folder \"$DataFilesDir\" - system returned \"$!.\"\n"; return 0; } } binmode(ALLFILES); # Remove trailing slashes if present: $BaseDir{$Realm} = $1 if ($BaseDir{$Realm} =~ m!(.*)/$!); $BaseURL{$Realm} = $1 if ($BaseURL{$Realm} =~ m!(.*)/$!); my $FileCount = $StartFile; $SetSaveLinks = 0; my $DMZ = &ForbidFilesFromSites($BaseDir{$Realm},$BaseURL{$Realm},@ForbidSites); $DMZ .= ' '.$Exclude{$Realm}; print "

Building index file for realm '$Realm'.

\n"; my @Files = GetFilesByDir($DMZ,$EXT,$BaseDir{$Realm}); my $FileTotal = scalar @Files; print "

Found $FileTotal files in base directory '$BaseDir{$Realm}'. "; if ($StartFile) { print "Continuing index process, with file $StartFile:"; } else { print "Starting index:"; } print "
Refresh or reload your browser window if this stops for more than a few minutes.

\n"; my $FILE = ''; my $URL = ''; my $Index = 0; for ($Index = $StartFile; $Index < $FileTotal; $Index++) { my $FILE = $Files[$Index]; my $HTML_Text = Get_String($FILE); # Check for ROBOTS meta tags: if ($HTML_Text =~ m!]*)(NONE|NOINDEX)!i) { print "Warning: file '$FILE' excluded due to robots META tag.
\n"; next; } my ($Size, $LastModT) = (stat($FILE))[7,9]; if ($FILE =~ m!^$BaseDir{$Realm}(.*)$!) { $URL = $BaseURL{$Realm}.$1; print "Added $1 -\> $URL [$Size]
\n"; } else { print "Error: $FILE didn't pattern match by beginning with basedir '$BaseDir{$Realm}'
\n"; next; } print ALLFILES MakeRecord($URL, $LastModT, $HTML_Text, $Size); $FileCount++; unless ($FileCount % 10) { print '
About '; print sprintf("%02.1f",(100 * $FileCount / $FileTotal)); $| = 1; print "\% complete ($FileCount of $FileTotal files)

\n"; $| = 0; } last if ((time - $StartTime) > $Timeout); } if ($Index < ($FileTotal - 1)) { $Index++; my $NextLink = $SCRIPT_NAME . '?Mode=Admin&Action=Build&StartFile='.$Index.'&Realm='.webEncode($Realm).$StringPass; print <<"EOM";

Continue with Build:

The build has stopped, because it has taken more than $Timeout seconds. It is now restarting itself with a fresh process to avoid a server timeout. This page should automatically refresh in 10 seconds. If it does not, click here to continue.

EOM } else { print "

Finished: the build process is complete.

\n"; } close(ALLFILES); chmod($FileMask, $IndexFile{$Realm}); my $Index_Size = -s $IndexFile{$Realm}; print "

Captured $FileCount files during this pass. Index file '$IndexFile{$Realm}' is ".Commas($Index_Size)." bytes.

\n"; print "

This process took ".(time - $StartTime)." seconds.

\n"; } sub Extract_Meta { local $_; my ($HTML_Text,$URL) = @_; my ($Title, $Description, $Keywords, $Links) = ('','','',''); if (($SetSaveLinks == 1) or ($Rules{'Index Links'})) { Link: foreach (split(m!<(A|FRAME|IFRAME) !i, $HTML_Text)) { # made \"? quote optional; added \s\> to [^..] list next unless m!^([^\>]*)(HREF|SRC)\s*=\s*\"?([^\"\s\>]+)!i; my $ThisLink = Trim($3); $Links .= ' '.$ThisLink if $Rules{'Index Links'}; next unless ($SetSaveLinks == 1); next if (($Rules{'Crawler: Follow Query Strings'} == 0) && ($ThisLink =~ m!\?!)); $ThisLink = GetAbsoluteAddress($ThisLink, $URL); # skip if GetAbsoluteAddress failed: next unless $ThisLink; # skip if this is a remote site, and we only want local sites: next if (($LimitSite) && ($ThisLink !~ m!^$LimitSite!i)); # skip file types that aren't interesting: next if ($Rules{'Crawler: Ignore Links To'} and ($ThisLink =~ m!\.($Rules{'Crawler: Ignore Links To'})$!i)); if ($Rules{'Crawler: Follow Offsite Links'} ne '1') { # skip remote links: my $hostname = $1 if ($URL =~ m!^http://(.*?)/!); next unless ($ThisLink =~ m!^http://$hostname/!i); } # skip forbidden sites: my $fURL = ''; foreach $fURL (@ForbidSites) { next Link if ($ThisLink =~ m!^$fURL!i); } # skip long addresses: next if (length($ThisLink) > $Rules{'Max Characters: URL'}); push(@GlobalSavedLinks, $ThisLink); } } if ($HTML_Text =~ m!(.*?).*?<\/TITLE>! !i; # replace title with blank space in HTML text } # tag not found, or doesn't contain alphanumerics - use filename: unless ($Title =~ m!\w+!) { if ($URL =~ m!([^\/]+)$!) { $Title = $1; } elsif ($URL) { $Title = $URL; } else { $Title = 'Document'; } } if (($Rules{'Forbid All Cap Titles'}) && ($Title !~ m![a-z]!)) { my $NewTitle = ''; foreach (split(m!\s+!,$Title)) { # List each word in the title: unless (length($_) > 1) { $NewTitle .= $_.' '; next } # Retain first letter... $NewTitle .= ' '.substr($_,0,1); # get rid of first letter: $_ = substr($_,1,(length($_)-1)); # translate to lowercase: tr[A-Z][a-z]; $NewTitle .= $_; } $Title = $NewTitle; } $Title = substr($Title, 0, $Rules{'Max Characters: Title'}); if ($HTML_Text =~ m!.*?<META([^\>]*?)(NAME|HTTP-EQUIV)="keywords"([^\>]*?)(CONTENT|VALUE)="([^\"]+)"!i) { # Init keywords and add whitespace padding: $Keywords = " $5 "; # Here we automatically take a substring - if the original length of the # keywords was less, it will fail gracefully: $Keywords = substr($Keywords, 0, $Rules{'Max Characters: Keywords'}); } if ($HTML_Text =~ m!.*?<META([^\>]*?)(NAME|HTTP-EQUIV)="description"([^\>]*?)(CONTENT|VALUE)="([^\"]+)"!i) { # Init description and add whitespace padding: $Description = " $5 "; } if ($Rules{'Index ALT Text'}) { # The following code removes all HTML tags which contain an ALT="foo" # attribute and replaces them with the literal string "foo": $HTML_Text =~ s/<[^>]*\s+ALT\s*=\s*"(([^>"])*)"[^>]*>/ $1 /ig; } # The following code strips everything inside <SCRIPT..>...</SCRIPT> and # <STYLE> tags out of the HTML text: my $BlockTag; foreach $BlockTag ('SCRIPT', 'STYLE') { my $NewText = ''; foreach (split(m!</$BlockTag>!i, $HTML_Text)) { if (m!^(.*?)<$BlockTag!i) { $NewText .= ' '.$1; } else { $NewText .= ' '.$_; } } $HTML_Text = $NewText; } # The following code strips text in <NOFRAMES> tag if it's less than 2,000 # characters (since these are mainly "get a frames-capable browser" error # messages.) if ($HTML_Text =~ m!(.*)<NOFRAMES>(.*)</NOFRAMES>(.*?)!i) { if (length($2) < 2000) { $HTML_Text = $1.' '.$2; } } # The following code strips all HTML tags and replaces them with blank # spaces: $HTML_Text =~ s!<([^>]*?)>! !g; # Now all multiple spaces become one space: $HTML_Text =~ s!\s+! !g; unless ($Description) { # Okay, they don't have a description META tag and so the description will be parsed # from their document text. # Get first X allowable characters: my $tempDescription = substr($HTML_Text,0,$Rules{'Max Characters: Auto Description'}); # Strip the last word fragment and replace with "..."; if that fails, deny that a description exists. if ($tempDescription =~ m!([^\|]*)\s+!) { $Description = $1.'...'; } else { $Description = 'No description available.'; } } # Here we automatically take a substring - if the original length of the # description was less, it will fail gracefully: $Description = substr($Description, 0, $Rules{'Max Characters: Description'}); # Clean the Description to remove <, >, = $Description =~ tr!\<\>\=! !; # Condense attributes: $Title =~ s!\s+! !g; # strip leading whitespace: $Title =~ s!^\s!!o; $Description =~ s!\s+! !g; $Description =~ s!^\s!!o; if (($Rules{'Forbid All Cap Descriptions'}) && ($Description !~ /[a-z]/)) { my $NewDescription = ''; foreach (split(/\s+/,$Description)) { # Retain first letter... $NewDescription .= ' '.substr($_,0,1); # get rid of first letter: $_ = substr($_,1,(length($_)-1)); # translate to lowercase: tr[A-Z][a-z]; $NewDescription .= $_; } $Description = $NewDescription; } return($Title,$Description,$HTML_Text,$Links, $Keywords); } # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # The following three functions return the HTML text for printing a single # hit. &StandardVersion() returns the normal text, &AdminVersion() returns # the same text as StandardVersion with the addition of "Edit" and "Delete" # buttons as well as re-routing all links through the redirector # usage: $textoutput = StandardVersion(numrank, url, title, desc, size, day, month, year); sub StandardVersion { local $_; my ($Rank,$URL,$Title,$Description,$Size,$Day,$Month,$Year) = @_; $Size = ($Size<1500)?int($Size).' bytes':(int($Size/1000)).'kb'; my $Term; foreach $Term (@SearchTerms) { my $Temp = Trim($Term); # strip leading and trailing spaces $Temp =~ s!\*!$WildCard!g; $Temp =~ s!(\W+|_)!\\W+!g; # replace interior spaces in the term with match for non-word character(s) $Temp =~ s!$WildCard!\\w\+!g; #print "<h1>$Temp: $Description</h1>"; $Description =~ s!(\b)($Temp)(\b)!$1<B>$2</B>$3!ig; } return <<"EOM"; <DL> <DT><B>$Rank. <A HREF="$URL">$Title</A></B></DT> <DD>$Description<BR> <B>URL:</B><FONT COLOR=#444444> $URL - size $Size - $Day $Month $Year</FONT></DD> </DL> EOM } # &AdminVersion() depends on $SCRIPT_NAME, $RAW{'Password'}, $RAW{'Realm'} sub AdminVersion { local $_; my ($Rank, $URL, $Title, $Description, $Size, $Day, $Month, $Year, $bCloseTags) = @_; $Size = ($Size < 1500) ? int($Size).' bytes' : (int($Size/1000)).'kb'; my $wURL = webEncode($URL); my $strClose = $bCloseTags ? '</DD></DL>' : ''; return <<"EOM"; <DL> <DT><B>$Rank. <A HREF="$SCRIPT_NAME?NextLink=$wURL">$Title</A></B> [ <A HREF="$SCRIPT_NAME?Mode=Admin$StringPass&Action=Edit&URL=$wURL&Realm=$RAW{'Realm'}">Edit</A> | <A HREF="$SCRIPT_NAME?Mode=Admin$StringPass&Action=AddURL&NewURL=$wURL&Realm=$RAW{'Realm'}">Crawl</A> | <A HREF="$SCRIPT_NAME?Mode=Admin$StringPass&Action=DeleteRecord&URL=$wURL&Realm=$RAW{'Realm'}" ONCLICK=\"return confirm('Are you sure you want to delete this object?');\">Delete</A> ]</DT> <DD>$Description<BR><B>URL:</B><FONT COLOR=#444444> $URL - size $Size - $Day $Month $Year</FONT> $strClose EOM } sub webEncode { local $_ = shift; return $_ unless m!(\%|\+|\&|\s)!; # short circuit for normal text s!\%!\%25!g; # URL encode literal % signs s!\+!\%2B!g; # URL encode literal plus signs s!\&!\%26!g; # URL encode literal ampersands s! !\+!g; # make blank spaces + signs return $_; } # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub GetFilesByDir { local $_; my ($DMZ, $EXT, @AllDirs) = @_; my (@TextFiles, $FullFileName, $ThisDir); my $MaxDirCount = 1; # Hold names of symbolic links: my %SymLinks; for ($ThisDir = 0; $ThisDir < $MaxDirCount; $ThisDir++) { unless (opendir(DIR, $AllDirs[$ThisDir])) { print "<!-- GetFilesByDir error: could not search the folder $AllDirs[$ThisDir] - the system returned \"$!\" -->\n"; next; } else { print "<!-- successfully opened directory $AllDirs[$ThisDir] -->\n"; } foreach (readdir(DIR)) { # Skip, for the . and .. entries: next, if (m!^\.\.?$!); # Build the absolute file name: $FullFileName = $AllDirs[$ThisDir].'/'.$_; # Skip, if this file or folder is in the DMZ: next, if ($DMZ =~ m| $FullFileName.? |i); # Since so many people use Front Page, save them from having their # _vti_XXX directories indexed: next, if ($FullFileName =~ m!\_vti!i); # If this is a folder, store for a later loop: if (-d $FullFileName) { # Is this a symlink? if (-l $FullFileName) { print "<!-- '$FullFileName' is a symbolic link!! ($_) -->\n"; unless ($AllowSymbolicLinks) { print "<!-- skipping '$FullFileName' because AllowSymbolicLinks is false -->\n"; next; } unless ($TrustSymbolicLinks) { # Record this name for avoiding loops: if ($SymLinks{$_}) { print "<!-- skipping '$FullFileName' because a symbolic link named '$_' has already been encountered during this crawl session. -->\n"; next; } $SymLinks{$_}++; } } $MaxDirCount++; push(@AllDirs, $FullFileName); next; } # Skip, if this is not a text file, and we care about bin/text: next, unless (($AllowBinaryFiles) || (-T $FullFileName)); # Skip, if this doesn't have a file extension, or if it's # file extension isn't allowed in EXT: next, if ($FullFileName !~ /\.([^\.]+)$/) or ($EXT !~ m/ $1 /i); # Valid file, add it to the output: push(@TextFiles, $FullFileName); } closedir(DIR); } return @TextFiles; } sub ReviewIndex { local $_; # similar to search_runtime and search_indexfile... my ($Realm) = @_; my $Start = ($FORM{'Start'}) ? $FORM{'Start'} : 0; my $End = $Start + $Rules{'Crawler: Max Pages Per Batch'}; my ($showNext, $showNextCount); print '<BLOCKQUOTE>'; my $Count = 0; if ($IndexFile{$Realm} eq 'RUNTIME') { # Remove trailing slashes if present: $BaseDir{$Realm} = $1 if ($BaseDir{$Realm} =~ m!(.*)/$!); $BaseURL{$Realm} = $1 if ($BaseURL{$Realm} =~ m!(.*)/$!); my $DMZ = ForbidFilesFromSites($BaseDir{$Realm}, $BaseURL{$Realm}, @ForbidSites); $DMZ .= ' '.$Exclude{$Realm}; my $FILE = ''; my $URL = ''; foreach $FILE ((GetFilesByDir($DMZ,$EXT,$BaseDir{$Realm}))[$Start..$End]) { $Count++; my $HTML_Text = Get_String($FILE); my ($Size, $LastModT) = (stat($FILE))[7,9]; if ($FILE =~ m!^$BaseDir{$Realm}(.*)$!) { $URL = $BaseURL{$Realm}.$1; } else { print "waring, $FILE didn't fix regexp."; } my ($Title, $Description); my $Record = MakeRecord($URL, $LastModT, $HTML_Text, $Size, \$Title, \$Description); my ($DD, $MM, $YYYY, $FBYTES) = (unpack('A2A2A2A4A*', substr($Record, 0, 16)))[1..4]; print AdminVersion($Count, $URL, $Title, $Description, $FBYTES, $DD, $MonthNames[$MM], $YYYY, 1); } } # End "if search runtime" else { my ($IndexFile) = $IndexFile{$Realm}; unless (open(ALLFILE,"<$IndexFile")) { # Bad fatal error - we don't have permission to open the index, or # the path to the index is wrong. Do not go fatal, just print # error to STDOUT and return null: if ($AllowDebug == 1) { print "<!-- Error opening $IndexFile for reading - $! -->\n"; } return 0; } $showNext = 0; RECORD: while (<ALLFILE>) { $Count++; next if ($Count < ($Start + 1)); if ($Count > $End) { $showNext = 1; last RECORD; } m!^(\d+) u= (.+?) t= (.*?) d= (.*?) uM=!; my ($URL,$Title,$Description) = ($2,$3,$4); my ($DD,$MM,$YYYY,$FBYTES) = (unpack('A2A2A2A4A*',$1))[1..4]; print AdminVersion($Count, $URL, $Title, $Description, $FBYTES, $DD, $MonthNames[$MM], $YYYY, 1); } $showNextCount = 1; COUNT: while (<ALLFILE>) { $showNextCount++; if ($showNextCount >= $Rules{'Crawler: Max Pages Per Batch'}) { last COUNT; } } close(ALLFILE); } # End "if Realm has index file" print '</BLOCKQUOTE>'; if ($showNext && $showNextCount) { print '<BR><P ALIGN="center"><B><A HREF="'.$SCRIPT_NAME.'?Realm='.$RAW{'Realm'}.'&Start='.$End."&Mode=Admin&Action=$RAW{'Action'}$StringPass\"> Next ".$showNextCount.' Matches </A></B></P>'; } } sub PrintEditRecordForm { local $_; my ($Realm,$EditURL) = @_; if ($IndexFile{$Realm} eq 'RUNTIME') { print <<"EOM"; <P>This record is generated dynamically from your actual files. To change the title or description, you'll have to change those attributes of the file.</P> EOM } elsif (defined $BaseDir{$Realm}) { print <<"EOM"; <P>Warning: If the crawler encounters this page again and re-indexes it, specific changes you make here will be overwritten. A better way to change the title or description of this page is to edit the actual file's title and description attributes.</P> EOM } else { print <<"EOM"; <P>Warning: If the crawler encounters this page again and re-indexes it, then any specific changes made here will be lost.</P> EOM } my ($IndexFile) = $IndexFile{$Realm}; unless (open(ALLFILE,"<$IndexFile")) { # Bad fatal error - we don't have permission to open the index, or # the path to the index is wrong. Do not go fatal, just print # error to STDOUT and return null: if ($AllowDebug == 1) { print "<!-- Error opening $IndexFile for reading - $! -->\n"; } return 0; } my $found = 0; my ($URL, $Title, $Description, $Preference, $DD, $MM, $YYYY, $FBYTES); RECORD: while (<ALLFILE>) { m!^(\d+) u= (.+?) t= (.*?) d= (.*?) uM=!; ($URL,$Title,$Description) = ($2,$3,$4); next unless ($URL eq $EditURL); $found = 1; ($Preference,$DD,$MM,$YYYY,$FBYTES) = (unpack('A2A2A2A4A*',$1))[0..4]; print '<BLOCKQUOTE>'; print AdminVersion(1, $URL, $Title, $Description, $FBYTES, $DD, $MonthNames[$MM], $YYYY, 1); print '</BLOCKQUOTE>'; last; } close(ALLFILE); if ($found == 0) { print "<P><B>Error:</B> the web address $EditURL was not found in the index file for the \"$Realm\" realm.</P>\n"; } else { $MM++; print <<"EOM"; <FORM METHOD="$REQUEST_METHOD" ACTION="$SCRIPT_NAME"> <INPUT TYPE="hidden" NAME="Mode" VALUE="Admin"> $HiddenPass <INPUT TYPE="hidden" NAME="Action" VALUE="SaveEditedRecord"> <INPUT TYPE="hidden" NAME="Realm" VALUE="$Realm"> <INPUT TYPE="hidden" NAME="EditURL" VALUE="$EditURL"> <HR SIZE="1"><TABLE BORDER="0"> <TR><TD ALIGN="right"><B>Document Title:</B> </TD><TD><INPUT TYPE="text" NAME="Title" VALUE="$Title" SIZE="60"></TD></TR> <TR><TD ALIGN="right"><B>Web Address:</B> </TD><TD><INPUT TYPE="text" NAME="URL" VALUE="$URL" SIZE="60"></TD></TR> <TR><TD ALIGN="right"><B>File Size:</B> </TD><TD><INPUT TYPE="text" NAME="FBYTES" VALUE="$FBYTES" SIZE="6" MAXLENGTH="6"> bytes</TD></TR> <TR><TD ALIGN="right"><B>Last Modified:</B> </TD><TD><INPUT TYPE="text" NAME="MM" VALUE="$MM" SIZE="2" MAXLENGTH="2">-<INPUT TYPE="text" NAME="DD" VALUE="$DD" SIZE="2" MAXLENGTH="2">-<INPUT TYPE="text" NAME="YYYY" VALUE="$YYYY" SIZE="4" MAXLENGTH="4"> (month-day-year)</TD></TR> <TR><TD ALIGN="right"><B>Preference:</B> </TD><TD><INPUT TYPE="text" NAME="Preference" VALUE="$Preference" SIZE="2" MAXLENGTH="2"></TD></TR> <TR><TD ALIGN="right" VALIGN="top"><B>Description:</B> </TD><TD><TEXTAREA NAME="Description" ROWS="3" COLS="60">$Description</TEXTAREA></TD></TR> <TR><TD><BR></TD><TD><INPUT TYPE="submit" CLASS="submit" VALUE=" Update Record "></TD></TR> </TABLE><HR SIZE="1"> </FORM> <BLOCKQUOTE> <P>The "Web Address" field is the primary key of the database. If you change the <B>Web Address</B> field, a new record with the new information will be created, and the old record will be removed.</P> <P>The <B>File Size</B> attribute must be an integer, no more than 6 characters long.</P> <P>The <B>Last Modified</B> date must be in Month-Day-Year format. Use the full four-digit year, as in "1985".</P> <P>The <B>Preference</B> field is a multiplier used in weighing the relevance of search results. By default it is 01; increasing it will cause a particular web page appear earlier in the search results.</P> <P>To save changes, click the <B>Update Record</B> button. If you don't want to make changes, you may follow this link back to the <A HREF="$SCRIPT_NAME?Realm=$RAW{'Realm'}&Mode=Admin&Action=Review$StringPass">list of all web pages</A> for the $FORM{'Realm'} realm.</P> </BLOCKQUOTE> EOM } } sub SaveEditedRecord { local $_; my $EditURL = $FORM{'EditURL'}; my $FBYTES = sprintf('%06.f',$FORM{'FBYTES'}); my ($Title,$Description) = ($FORM{'Title'},$FORM{'Description'}); my $AlphaData = ' '; $AlphaData .= "u= $FORM{'URL'} "; $AlphaData .= "t= $Title "; $AlphaData .= "d= $Description "; $AlphaData .= 'uM='.&CompressStrip($FORM{'URL'}); # h= * l= * attributes are retained from old record. my $DD = sprintf('%02.f',$FORM{'DD'}); my $MM = sprintf('%02.f',($FORM{'MM'}-1)); # correct for human vs computer time my $YYYY = sprintf('%04.f',$FORM{'YYYY'}); $FORM{'Preference'} = 1 unless ($FORM{'Preference'} =~ m!^\d+$!); my $CC = sprintf('%02.f',$FORM{'Preference'}); my $TrailingData = ''; my $LeadingData = "$CC$DD$MM$YYYY$FBYTES$AlphaData"; my $File = $IndexFile{$FORM{'Realm'}}; my $matchURL; # Check: is $TempFile in use? if (-e "$TempFile.lock") { # yep, it's in use. sleep for 3 seconds and try again... sleep(3); if (-e "$TempFile.lock") { # still in use. wait another 3 seconds... sleep(3); if (-e "$TempFile.lock") { # okay, we need to return an error. # what was the PID and timestamp of the locking process? my $lockTimeStamp = scalar localtime((stat("$TempFile.lock"))[9]); my $lockPID = ''; if (open(LOCK,"<$TempFile.lock")) { $lockPID = 'is '.<LOCK>; close(LOCK); } else { $lockPID = "could not be read from the lock file because the filesystem returned the error \"$!\" when a read was attempted"; } print <<"EOM"; <P><HR SIZE="1"><B><FONT COLOR="#ff0000">Error:</FONT></B> This operation could not be completed because the database system is locked by another process which is trying to edit the same data. The PID of this process $lockPID. This operation waited 6 seconds for the lock to be released, but it was not released. The lock has been in place since $lockTimeStamp.</P> <P>You may want to wait a few minutes, verify that no other users are editing the index files, and then click here to <A HREF=\"$SCRIPT_NAME?Mode=Admin&Action=forceUnLock$StringPass\">force an unlock</A> of the temp file.</P> <P>This is a critical error which overrides any other error or success messages written below.<HR SIZE="1"></P> EOM return (0,0); # early exit with no disk I/O } # not locked after 6 seconds, continue... } # not locked after 3 seconds, continue... } # Not locked, okay so now we're going to lock it: if (open(LOCK,">$TempFile.lock")) { binmode(LOCK); print LOCK $$; # $$ is perl's var for PID close(LOCK); chmod($FileMask, "$TempFile.lock"); } else { print <<"EOM"; <P><B>Error:</B> This operation could not be completed, because this process was unable to place an exclusive lock on the database files. A lock is placed by writing to the file $TempFile.lock - writing to this file failed with the error message $!.</P> <P>You may want to wait a few minutes, verify that no other users are editing the index files, and then click here to <A HREF=\"$SCRIPT_NAME?Mode=Admin&Action=forceUnLock$StringPass\">force an unlock</A> of the temp file.</P> EOM return (0,0); # early exit with no disk I/O } unless (open(NEWFILES,">$TempFile")) { print "Error: could not write to temp file $TempFile in $DataFilesDir - $!\n"; return (0,0); } binmode(NEWFILES); if (!(open(ALLFILES,"<$File"))) { # open failed - this is okay if the file doesn't exist yet, fatal otherwise. if (-e $File) { print "Error: could not read from $File in $DataFilesDir - $!\n"; return (0,0); } # didn't exist, we need to create it so that the rename() operations work elsif (open(TEMP,">>$File")) { # opened okay, write one blank line: print TEMP "\n"; close(TEMP); } else { print "Error: $File does not exist and could not be created - $!\n"; return (0,0); } } else { my $Record = ''; RECORD: while ($Record = <ALLFILES>) { # compare whether an existing entry is there: $matchURL = quotemeta($EditURL); if ($Record =~ m!u= $matchURL .*? h=(.*)!) { $TrailingData = 'h= '.$1; $TrailingData =~ s!(\r|\n)!!g; # safe, kill newlines next RECORD; # this url just got skipped; trailing data was captured. } # no duplicate entry, go ahead and print it. print NEWFILES $Record; } } if ($TrailingData) { print NEWFILES "$LeadingData $TrailingData\n"; } close(NEWFILES); close(ALLFILES); chmod($FileMask, $TempFile); if ((-e $BackFile) && (!(unlink($BackFile)))) { print "Error: could not delete $BackFile - $!\n"; return (0,0); } unless (rename($File, $BackFile)) { print "Error: could not rename $File to $BackFile - $!.\n"; return (0,0); } unless (rename($TempFile, $File)) { print "Error: could not rename $TempFile to $File - $!.\n"; return (0,0); } chmod($FileMask, $BackFile); chmod($FileMask, $File); if ($TrailingData) { print "Update on $EditURL okay.\n"; print '<BLOCKQUOTE>'; print AdminVersion(1, $FORM{'URL'}, $Title, $Description, $FBYTES, $DD, $MonthNames[$MM], $YYYY, 1); print '</BLOCKQUOTE>'; } else { print "Updated failed: the address $EditURL was not found in the $FORM{'Realm'} database.\n"; } # Done with our disk I/O - clear the lock: unless (unlink("$TempFile.lock")) { print "<B>Warning:</B> the disk operation completed successfully, but this process was unable to clear the lock on the database system. Future operations will fail until $TempFile.lock has been deleted from the \$DataFilesDir.</P><P>You may want to wait a few minutes, verify that no other users are editing the index files, and then click here to <A HREF=\"$SCRIPT_NAME?Mode=Admin&Action=forceUnLock$StringPass\">force an unlock</A> of the temp file.</P>"; } print <<"EOM"; <P>Return to the starting <A HREF="$SCRIPT_NAME?Realm=$RAW{'Realm'}&Mode=Admin&Action=Review$StringPass">list of all web pages</A> for the $FORM{'Realm'} realm.</P> EOM } sub DeleteRecord { local $_; my $File = $IndexFile{$FORM{'Realm'}}; my ($matchURL, $TrailingData) = ('', ''); # Check: is $TempFile in use? if (-e "$TempFile.lock") { # yep, it's in use. sleep for 3 seconds and try again... sleep(3); if (-e "$TempFile.lock") { # still in use. wait another 3 seconds... sleep(3); if (-e "$TempFile.lock") { # okay, we need to return an error. # what was the PID and timestamp of the locking process? my $lockTimeStamp = scalar localtime((stat("$TempFile.lock"))[9]); my $lockPID = ''; if (open(LOCK,"<$TempFile.lock")) { $lockPID = 'is '.<LOCK>; close(LOCK); } else { $lockPID = "could not be read from the lock file because the filesystem returned the error \"$!\" when a read was attempted"; } print <<"EOM"; <P><HR SIZE="1"><B><FONT COLOR="#ff0000">Error:</FONT></B> This operation could not be completed because the database system is locked by another process which is trying to edit the same data. The PID of this process $lockPID. This operation waited 6 seconds for the lock to be released, but it was not released. The lock has been in place since $lockTimeStamp.</P> <P>You may want to wait a few minutes, verify that no other users are editing the index files, and then click here to <A HREF=\"$SCRIPT_NAME?Mode=Admin&Action=forceUnLock$StringPass\">force an unlock</A> of the temp file.</P> <P>This is a critical error which overrides any other error or success messages written below.<HR SIZE="1"></P> EOM return (0,0); # early exit with no disk I/O } # not locked after 6 seconds, continue... } # not locked after 3 seconds, continue... } # Not locked, okay so now we're going to lock it: if (open(LOCK,">$TempFile.lock")) { binmode(LOCK); print LOCK $$; # $$ is perl's var for PID close(LOCK); chmod($FileMask, "$TempFile.lock"); } else { print <<"EOM"; <P><B>Error:</B> This operation could not be completed, because this process was unable to place an exclusive lock on the database files. A lock is placed by writing to the file $TempFile.lock - writing to this file failed with the error message $!.</P> <P>You may want to wait a few minutes, verify that no other users are editing the index files, and then click here to <A HREF=\"$SCRIPT_NAME?Mode=Admin&Action=forceUnLock$StringPass\">force an unlock</A> of the temp file.</P> EOM return (0,0); # early exit with no disk I/O } unless (open(NEWFILES,">$TempFile")) { print "Error: could not write to temp file $TempFile in $DataFilesDir - $!\n"; return (0,0); } binmode(NEWFILES); if (!(open(ALLFILES,"<$File"))) { # open failed - this is okay if the file doesn't exist yet, fatal otherwise. if (-e $File) { print "Error: could not read from $File in $DataFilesDir - $!\n"; return (0,0); } # didn't exist, we need to create it so that the rename() operations work elsif (open(TEMP,">>$File")) { # opened okay, write one blank line: print TEMP "\n"; close(TEMP); } else { print "Error: $File does not exist and could not be created - $!\n"; return (0,0); } } else { my $Record = ''; RECORD: while ($Record = <ALLFILES>) { # compare whether an existing entry is there: $matchURL = quotemeta($FORM{'URL'}); if ($Record =~ m!u= $matchURL .*? h=(.*)!) { $TrailingData = 'h= '.$1; $TrailingData =~ s!(\r|\n)!!g; # safe, kill newlines next RECORD; # this url just got skipped; trailing data was captured. } # no duplicate entry, go ahead and print it. print NEWFILES $Record; } } close(NEWFILES); chmod($FileMask, $TempFile); close(ALLFILES); if ((-e $BackFile) && (!(unlink($BackFile)))) { print "Error: could not delete $BackFile - $!\n"; return (0,0); } unless (rename($File, $BackFile)) { print "Error: could not rename $File to $BackFile - $!.\n"; return (0,0); } unless (rename($TempFile, $File)) { print "Error: could not rename $TempFile to $File - $!.\n"; return (0,0); } chmod($FileMask, $BackFile); chmod($FileMask, $File); # Done with our disk I/O - clear the lock: unless (unlink("$TempFile.lock")) { print "<P><B>Warning:</B> the disk operation completed successfully, but this process was unable to clear the lock on the database system. Future operations will fail until $TempFile.lock has been deleted from the \$DataFilesDir.</P><P>You may want to wait a few minutes, verify that no other users are editing the index files, and then click here to <A HREF=\"$SCRIPT_NAME?Mode=Admin&Action=forceUnLock$StringPass\">force an unlock</A> of the temp file.</P>"; } if ($TrailingData) { print "<P>Deleted '$FORM{'URL'}' from the '$FORM{'Realm'}' database.</P>"; } else { print "<P>Error: the address '$FORM{'URL'}' was not found in the '$FORM{'Realm'}' database.</P>"; } # delete this entry from the pending pages file: print "<P>Removing URL '$FORM{'URL'}' from the pending pages file.</P>"; if (not (open(FILE, "<$PendingPagesFile"))) { print "<P>Warning: unable to read from the pages file '$PendingPagesFile' - $!. This may prevent complete deletion of the record.</P>"; } elsif (not (open(TEMP, ">$TempFile"))) { print "<P>Warning: unable to write to temp file '$TempFile' - $!. This may prevent complete deletion of the record.</P>"; } else { # all i/o ok: # write all data from FILE to TEMP, unless the web address matches: while (<FILE>) { print TEMP unless m!^$matchURL !; } # skip all matching lines: while (<FILE>) { unless (m!^$matchURL !) { print TEMP; last; } } # write all remaining data: while (<FILE>) { print TEMP; } close(TEMP); close(FILE); if (unlink($PendingPagesFile) and rename($TempFile, $PendingPagesFile)) { print "<P>Updated pending pages file.</P>"; } else { print "<P>Warning: unable to update file '$PendingPagesFile' - $!. This may prevent complete deletion of the record.</P>"; } } print <<"EOM"; <P>Return to the starting <A HREF="$SCRIPT_NAME?Realm=$RAW{'Realm'}&Mode=Admin&Action=Review$StringPass">list of all web pages</A> for the $FORM{'Realm'} realm.</P> EOM } sub forceUnLock { local $_; # This procedure will delete the lock on the temp file. print <<"EOM"; <P><B>Status:</B> Attempting to delete the lock on the temp file and verify filesystem data...</P> EOM if (-e "$TempFile.lock") { print "<P><B>Status:</B> Confirmed that \"<TT>$TempFile.lock</TT>\" exists...</P>\n"; if (unlink("$TempFile.lock")) { print "<P><B><FONT COLOR=\"#0000ff\">Success:</FONT></B> File \"<TT>$TempFile.lock</TT>\" was deleted successfully...</P>\n"; } else { print "<P><B><FONT COLOR=\"#ff0000\">Failed:</FONT></B> The attempt to delete \"<TT>$TempFile.lock</TT>\" failed with \"$!\". You will most likely have to log on to this server and remove the lock file manually.</P>\n"; } } else { print "<P><B><FONT COLOR=\"#0000ff\">Success:</FONT></B> The file \"<TT>$TempFile.lock</TT>\" no longer exists...</P>"; } print "<P><B>Status:</B> Verifying filesystem data...</P>\n"; if (-e $BackFile) { my $size = -s $BackFile; print "<P><B><FONT COLOR=\"#0000ff\">Success:</FONT></B> Backup file \"<TT>$BackFile</TT>\" exists with size $size bytes.</P>\n"; } else { print "<P><B><FONT COLOR=\"#ff0000\">Warning:</FONT></B> Backup file \"<TT>$BackFile</TT>\" does not exist. During normal transactions the backup file will exist on exit.</P>\n"; } if (-e $TempFile) { my $size = -s $TempFile; print "<P><B><FONT COLOR=\"#ff0000\">Warning:</FONT></B> Temp file \"<TT>$TempFile</TT>\" exists with size $size bytes. This temp file will be overwritten as normal during the next database transaction. During successful transactions the temp file is deleted automatically - the existence of a temp file suggests that some transaction in the past failed. If any of the index files below are missing, you may want to log on to the server and move the contents of the temp file to your missing index file.</P>\n"; } else { print "<P><B><FONT COLOR=\"#0000ff\">Success:</FONT></B> Temp file \"<TT>$TempFile</TT>\" does not exist.</P>\n"; } foreach (sort keys %IndexFile) { my $realm = $_; my $file = $IndexFile{$realm}; next if ($file eq 'RUNTIME'); if (-e $file) { my $size = -s $file; print "<P><B><FONT COLOR=\"#0000ff\">Success:</FONT></B> Index file \"<TT>$file</TT>\" for realm \"$realm\" exists with size $size bytes.</P>\n"; } else { print "<P><B><FONT COLOR=\"#ff0000\">Warning:</FONT></B> Index file \"<TT>$file</TT>\" for realm \"$realm\" does not exist. Either this file was lost, or you haven't yet built an index file for this realm.</P>\n"; } } print '<BLOCKQUOTE><B>Please keep backups of all data files!</B><P>The transaction you were running when you encountered the file lock most likely did not complete successfully, and you will have to run it again.</P></BLOCKQUOTE>'; } sub x8t { local $_; return 0 if ($pq > 7); if ($pq % 2) { return substr($O, int($pq/2), 1) % 5; } else { return int(substr($O, ($pq/2), 1) / 5); } } sub passFunc { local $_; $pq = 0; my $Z = ''; my ($W, $cP); ($W, $O, $cP) = @_; my $w; if($cP eq '1') { srand($$*time); foreach (split(m!!,$W)) { $Z.=sprintf("%02.f",((ord($_)-31+$pq+x8t[$pq])%100)); $pq++; if (($pq % 3)==1){ $pq++; $Z.=sprintf("%02.f",int((100*rand())%100)) } } } else { while (length($W)) { !($W!~m!^(..)(.*)$!)||last; $W=$2,$w=$1-$pq-x8t($pq),$pq++; if($w<0){ $w+=100 } (2==($pq%3))||($Z.=chr($w+31)); } } return ($cP eq '1')?$Z:(($cP eq crypt($Z,'$Z'))?1:0); } sub CrawlEntireSite { local $_; my ($Realm) = @_; my @ReIndex = (); my ($Count, $Limit) = (0, $Rules{'Crawler: Max Pages Per Batch'}); unless (open(FILE,"<$PendingPagesFile")) { print "<P>The pending pages file could not be opened, so the list of web addresses in the $Realm realm could not be found.</P>\n"; return 0; } binmode(FILE); my $matchRealm = quotemeta(webEncode($Realm)); my $cutTime = $FORM{'StartTime'}; if ($FORM{'DaysPast'}) { $cutTime -= (86400 * $FORM{'DaysPast'}); } while (<FILE>) { next unless (m!^(.*?) $matchRealm (\d+)!); my ($URL, $time) = ($1, $2); next unless ($URL =~ m!^$FORM{'LimitSite'}!i); if ($time == 2) { $FORM{'LimitFailed'}++; } elsif ($time >= $cutTime) { $FORM{'LimitIndexed'}++; } else { $FORM{'LimitPending'}++; push(@ReIndex,$URL); #print "<P>Planning to index: $URL</P>\n"; $Count++; last if ($FORM{'TotalBatches'} && ($Count > $Limit)); } } close(FILE); unless ($FORM{'TotalBatches'}) { $FORM{'TotalBatches'} = 1 + int((scalar @ReIndex)/$Limit); } &AddURL(1, @ReIndex); } sub MaintainRealm { local $_; my ($Realm) = @_; my (@MasterArray, @Explore, @ReIndex) = (); my ($Count,$Limit) = (0,$Rules{'Crawler: Max Pages Per Batch'}); unless (open(FILE,"<$PendingPagesFile")) { print "<P>The pending pages file could not be opened, so the list of web addresses in the $Realm realm could not be found.</P>\n"; return 0; } my $matchRealm = quotemeta(webEncode($Realm)); my $cutTime = (time - (86400*$Rules{'Crawler: Days Til Refresh'})); while (<FILE>) { next unless (m!^(.*?) $matchRealm (\d+)!); my ($URL,$time) = ($1,$2); if ($time == 0) { push(@Explore, $URL); } elsif (($time != 2) && ($time < $cutTime)) { push(@ReIndex,$URL); $Count++; last if ($Count >= $Limit); } } close(FILE); @MasterArray = (@ReIndex,@Explore); # re-indexing has higher priority my $MasterSize = scalar (@MasterArray); if ($MasterSize > $Limit) { splice(@MasterArray, $Limit, ($MasterSize-$Limit)); } &AddURL(1, @MasterArray); } sub ReCrawlRealm { local $_; my ($Realm) = @_; my (@ReIndex) = (); my ($Count, $Limit) = (0, $Rules{'Crawler: Max Pages Per Batch'}); unless (open(FILE,"<$PendingPagesFile")) { print "<P>The pending pages file could not be opened, so the list of web addresses in the $Realm realm could not be found.</P>\n"; return 0; } binmode(FILE); my $matchRealm = quotemeta(webEncode($Realm)); my $cutTime = $FORM{'StartTime'}; if ($FORM{'DaysPast'}) { $cutTime -= (86400 * $FORM{'DaysPast'}); } while (<FILE>) { next unless (m!^(.*?) $matchRealm (\d+)!); my ($URL,$time) = ($1,$2); if (($time > 2) && ($time < $cutTime)) { push(@ReIndex,$URL); #print "<P>Planning to index: $URL</P>\n"; $Count++; last if ($FORM{'TotalBatches'} && ($Count > $Limit)); } elsif ($time >= $cutTime) { #print "<P>Document $URL is too new! ($time greater or requal to $cutTime)</P>\n"; } } close(FILE); unless ($FORM{'TotalBatches'}) { $FORM{'TotalBatches'} = 1 + int((scalar @ReIndex)/$Limit); } &AddURL(1, @ReIndex); } sub Authenticate { local $_; my $AuthCookie = ''; my $AUTH_EXIT = 0; # Returns 1 if authentication succeeds. If authentication fails, this # function exits the entire script, after handling such details as prompting # for a new password. Just to be safe, the syntax is: # exit unless Authenticate(); # This function must be called before HTTP headers have been written to the # client. This will write the HTTP status and possibly some Set-Cookie # headers. # foreach incoming request, do: # if not admin routine (i.e., if "Mode=Admin" doesn't exist in query string) # 200 OK # allow user through # next request if ((!$FORM{'Mode'}) or ($FORM{'Mode'} ne 'Admin')) { return 1; } # if (request logout page) # 200 OK # clear auth cookie # show login page # exit script if (($FORM{'Action'}) and ($FORM{'Action'} eq 'LogOut')) { &SetCookie(0); print "Content-Type: text/html\r\n"; print "\r\n"; print TextEnterPassword(); return $AUTH_EXIT; } # # now we know it is a non-trivial admin request: # if (cookie exists) and (cookie auth ok) # 200 OK # allow user through # next request if ($ENV{'HTTP_COOKIE'} && ($ENV{'HTTP_COOKIE'} =~ m!AuthCookie=(\d+)!) && (passFunc($1,(substr(time,1,4)),$CryptPassword))) { return 1; } # if (cookie exists) and (cookie auth ok-but-stale) # 200 OK # set fresh auth cookie # allow user through # next request if ($ENV{'HTTP_COOKIE'} && ($ENV{'HTTP_COOKIE'} =~ m!AuthCookie=(\d+)!) && (passFunc($1,(substr((time+10E4),1,4)),$CryptPassword))) { $AuthCookie = passFunc($FORM{'Password'},(substr(time,1,4)),1); &SetCookie($AuthCookie); return 1; } # # cookie auth has failed: # if (form password exists) and (form password ok) # if (require cookie auth) # 302 Redirect # set new auth cookie # redirect user to this page again # set "IsAuthRedir" string # exit (next request) # else # 200 OK # set new auth cookie # allow user through # next request if (($FORM{'Password'}) && ($CryptPassword eq crypt($FORM{'Password'},'$Z'))) { $AuthCookie = passFunc($FORM{'Password'},(substr(time,1,4)),1); &SetCookie($AuthCookie); if ($AllowClearTextAuth) { $bUseClearTextAuth = 1; return 1; } else { print "Content-Type: text/html\r\n"; print "\r\n"; print "<META HTTP-EQUIV=\"refresh\" CONTENT=\"2;url=$ENV{'SCRIPT_NAME'}?Mode=Admin&IsAuthRedir=1\">\n"; print "<P>Password correct. Attempting to load your authentication data (wait 2 seconds.)</P>\r\n"; return $AUTH_EXIT; } } # Cookie and form auth have failed; all cases below will end with exit. These # cases are a little more complex because the include offering the end user # the ability to set his password. # # 200 OK # prompt for password # if (form password exists) # warn that submitted password failed # if ("IsAuthRedir" query string exists) # warn that client is not accepting cookies # offer tips on fixing this # exit (next request) print "Content-Type: text/html\r\n"; print "\r\n"; print $Header; if (($AllowSetPassword) && ($FORM{'SetPassword'} ne '')) { if ($FORM{'Password'} && $FORM{'X2'} && ($FORM{'Password'} eq $FORM{'X2'})) { print TextPasswordSet(); } else { print PromptSetPassword(); } } else { print TextEnterPassword(); } return $AUTH_EXIT; } # End of sub Authenticate(); sub SetCookie { my ($AuthCookie) = @_; print "Set-Cookie: AuthCookie=$AuthCookie; path=/\r\n"; } sub TextEnterPassword { local $_; my $Text = <<"EOM"; <BLOCKQUOTE> <BR> <FORM ACTION="$SCRIPT_NAME" METHOD="post" NAME="AuthForm"> <INPUT TYPE="hidden" NAME="Mode" VALUE="Admin"> <P>Welcome to your search engine's admin page.</P> <P>You'll need your password to access this page. Type your password in, then click the submit button:</P> <BLOCKQUOTE> <TABLE BORDER="0"> <TR> <TD><B>Your Password:</B>  </TD> <TD><INPUT TYPE="password" NAME="Password"></TD> </TR> <TR> <TD><BR></TD> <TD><INPUT TYPE="submit" VALUE="the submit button" STYLE="background-color: #ffffff"></TD> </TR> </TABLE> <SCRIPT LANGUAGE="JavaScript"> <!-- document.AuthForm.Password.focus(); // --> </SCRIPT> </BLOCKQUOTE> EOM if ($FORM{'Password'}) { $Text .= '<P>The password you just typed in was incorrect.'; $FORM{'NumFailures'} = 0 unless $FORM{'NumFailures'}; if ($FORM{'NumFailures'} == 1) { $Text .= ' Is your CAPS lock key on, perhaps?'; } elsif ($FORM{'NumFailures'} == 4) { $Text .= ' New password time? You can reset it if ya want.'; } $Text .= '<INPUT TYPE="hidden" NAME="NumFailures" VALUE="'.(($FORM{'NumFailures'}+1)%5).'"></P>'; } if ($FORM{'IsAuthRedir'}) { $Text .= "<P><HR></P><P><B>Warning:</B> this search engine requires cookies-based authentication; however, it appears that your browser is not accepting the auth cookie. The auth cookie must be accepted! Please make sure your browser is configured to accept cookies, and that the date on your computer matches the date on this server (the current server date is " . scalar localtime(time) .").</P><P>If you are unable to get cookies to work, then you must edit this script and set</P><BLOCKQUOTE><TT>\$AllowClearTextAuth = 1;</TT></BLOCKQUOTE><P>This will permit you to administer your search engine without accepting cookies. However, fixing cookies is a better solution.</P>"; } else { $Text .= <<"EOM" if ($AllowSetPassword == 1); <P><B>Or...</B></P> <P>Click the button below if you want to set a new password. First time users should click here:</P> <BLOCKQUOTE><INPUT TYPE="submit" VALUE=" I Want A New Password " NAME="SetPassword" STYLE="background-color: #ffffff"></BLOCKQUOTE> EOM $Text .= <<"EOM" if ($AllowSetPassword != 1); <P>If you forget your password, you can create a new one. To do so, edit this script and set the <TT>\$AllowSetPassword</TT> variable to the number 1.</P> EOM } $Text .= '</FORM></BLOCKQUOTE></td></tr></table></td></tr></table></BODY></HTML>'; return $Text; } sub PromptSetPassword { local $_; return <<"EOM"; <BLOCKQUOTE> <P>You need to create an <I>administrative password</I>.</P> <P>This search engine is managed from web pages. A strong password prevents others from accessing these administrative pages.</P> <BLOCKQUOTE> <P>You are allowed to make up your password. There aren't any fixed rules about what it can be, but it should be difficult for others to guess. Your password will be Case Sensitive.</P> </BLOCKQUOTE> <P>Enter the same <B>password</B> twice.</P> <P>Then <B>submit the form</B>.</P> <P>You'll receive instructions on what to do next.</P> <BLOCKQUOTE> <BR> <FORM ACTION="$SCRIPT_NAME" METHOD="post" NAME="FirstForm" OnSubmit="return Validate();"> <INPUT TYPE="hidden" NAME="Mode" VALUE="Admin"> <INPUT TYPE="hidden" NAME="SetPassword" VALUE="on"> <SCRIPT LANGUAGE="JavaScript"> <!-- // var NumErrors = 0; function Validate() { if (NumErrors > 1) { NumErrors = 0; if (confirm("It looks like you're having trouble.\\n\\nThat's okay, a lot of people have trouble with these types of forms.\\n\\nWould you like me to transport you to the Fluid Dynamics help page? It has some tips on how to set your password.")) { parent.location.href = "http://www.xav.com/scripts/search/admin_help.html"; return false; } } if (document.FirstForm.Password.value != document.FirstForm.X2.value) { alert("The passwords do not match."); NumErrors++; return false; } else if (document.FirstForm.Password.value.length == 0) { alert("Okay, there is one rule:\\n\\nYour password cannot be blank."); NumErrors++; return false; } else { return true; } } // --> </SCRIPT> <TABLE BORDER="2" CELLSPACING="1" CELLPADDING="0"><TR> <TD><TABLE BORDER="0" CELLSPACING="0" CELLPADDING="4"> <TR><TD><B>Password: </B></TD><TD ALIGN="center" BGCOLOR="#cccccc"><INPUT TYPE="password" NAME="Password"></TD></TR> <TR><TD><B>Password: </B></TD><TD ALIGN="center" BGCOLOR="#cccccc"><INPUT TYPE="password" NAME="X2"></TD></TR> <TR><TD><BR></TD><TD ALIGN="center" BGCOLOR="#cccccc"><INPUT TYPE="submit" VALUE="submit the form" STYLE="background-color: #ffffff"></TD></TR> </TABLE></TD></TR></TABLE> </FORM> </BLOCKQUOTE> </BLOCKQUOTE> EOM } sub TextPasswordSet { local $_; my $TestPassword = crypt($FORM{'Password'}, '$Z'); return <<"EOM"; <BLOCKQUOTE> <P>You have chosen an initial password:</P> <PRE>\t$FORM{'Password'}</PRE> <P>This password has been encrypted for your protection:</P> <PRE>\t$TestPassword</PRE> <P>You must now enter this new encrypted password into the text of this script. If you can access this script on the server (the file name is <TT>$0</TT>), then go ahead and paste the following text on lines 17 and 18. Otherwise, paste this text into the copy of this script that you have on your computer:</P> <PRE>\t# Lines 17 and 18 \t\$CryptPassword = '$TestPassword'; \t\$AllowSetPassword = 1; # 1 -> YES; 0 -> NO</PRE> <P>This sets your new password (on line 17) and prevents anyone else from doing the same (on line 18.)</P> <P>Once you've made these changes, save the script. If you changed the copy on your computer, then re-upload it in ASCII mode.</P> <P>After you've finished, reload this page to get started. This script will do nothing of value until you've properly set the administrative password.</P> </BLOCKQUOTE> EOM } sub AnonAdd { # Accepts one address local $_; my @AddressesToIndex = @_; my $sTitle = 'Add Your Own Website'; my $Realm = $FORM{'Realm'} ? $FORM{'Realm'} : ''; if (($AddressesToIndex[0]) && ($AddressesToIndex[0] ne 'prompt') && ($IndexFile{$Realm}) && (-W $IndexFile{$Realm})) { # else just show form print "<P><B>Adding Web Pages to the \"$Realm\" Database:</B></P>\n"; print 'Crawling remote sites and updating the database may take a long time. Please be patient...'; $|=1; $SetSaveLinks = 1; my %UserResults = (); my $NumRank = 0; my $NumRedirectsFollowed = 0; my $MaxAddresses = scalar @AddressesToIndex; my $AddressIndex = 0; my @IndexedAddresses = (); ADDRESS: for ($AddressIndex = 0; $AddressIndex < $MaxAddresses; $AddressIndex++) { print '.'; if ($Rules{'Crawler: Max Pages Per Batch'} <= $AddressIndex) { push(@IndexedAddresses,'DONE'); $UserResults{'DONE'} = "<P><DT><B>Crawler Finished</B></DT>"; $UserResults{'DONE'} .= "<DD>The crawler has indexed "; $UserResults{'DONE'} .= "$Rules{'Crawler: Max Pages Per Batch'} web pages. "; $UserResults{'DONE'} .= "It will wait for further input from you. The "; $UserResults{'DONE'} .= "maximum number of web pages searched per batch can be adjusted using the variable 'Crawler: "; $UserResults{'DONE'} .= "Max Pages Per Batch' in the \$Rules array.</DD>\n"; last ADDRESS; } my $URL = Trim($AddressesToIndex[$AddressIndex]); if ($URL !~ m!^http://!i) { $NumRank++; $UserResults{$URL} = "<P><DT><B>$NumRank. Error: <FONT COLOR=\"#cc0000\">$URL</FONT></B></DT>\n"; $UserResults{$URL} .= "<DD>This web address does not being with the string \"http://\". It will not be indexed. "; $UserResults{$URL} .= "This search engine queries only standard HTTP/1.0 and 1.1 compliant non-encrypted web servers, not secured https:// sites, nor "; $UserResults{$URL} .= "FTP nor gopher servers.</DD>"; $GlobalSpiderResults{$URL} = -1; # delete any matching entry push(@IndexedAddresses,$URL); next ADDRESS; } my $OldURL = $URL; my ($Text, $nFileSize, $RobotError) = ('', 0, ''); ($URL, $Text, $nFileSize, $RobotError) = GetStringByURL($URL); if (($Text eq '302') && ($NumRedirectsFollowed < $Rules{'Crawler: Max Redirects'})) { $NumRank++; $UserResults{$OldURL} = "<P><DT><B>$NumRank. Redirect: <A HREF=\"$OldURL\"><FONT COLOR=\"#0099bb\">$OldURL</FONT></A></B></DT>\n"; $UserResults{$OldURL} .= "<DD>This page is redirecting all traffic to \"$URL\". That page will be indexed instead.</DD>"; $GlobalSpiderResults{$OldURL} = -1; # delete any matching entry if ($URL =~ m!http://([^\/]+)$!) { $URL .= '/'; } @AddressesToIndex = ('',@AddressesToIndex); # add offset at left side $AddressesToIndex[$AddressIndex+1] = $URL; # overwrite this URL with redir. $MaxAddresses++; $NumRedirectsFollowed++; push(@IndexedAddresses, $OldURL); next ADDRESS; } elsif ($Text eq '302') { $NumRank++; $UserResults{$OldURL} = "<P><DT><B>$NumRank. Redirect: <A HREF=\"$OldURL\"><FONT COLOR=\"#0099bb\">$OldURL</FONT></A></B></DT>\n"; $UserResults{$OldURL} .= "<DD>This page is redirecting all traffic to \"$URL\". However, the maximum of $Rules{'Crawler: Max Redirects'} redirects per session has been exceeded and so this URL is being deleted altogether. You may also set \$Rules{'Crawler: Max Redirects'} to a larger number and try again.</DD>"; $GlobalSpiderResults{$OldURL} = -1; # delete any matching entry push(@IndexedAddresses, $OldURL); next ADDRESS; } elsif ($RobotError) { $NumRank++; $UserResults{$URL} = "<P><DT><B>$NumRank. Error: <A HREF=\"$URL\"><FONT COLOR=\"#cc0000\">$URL</FONT></A></B></DT>\n"; $UserResults{$URL} .= "<DD>The crawler encountered an error: $RobotError.</DD>"; $GlobalSpiderResults{$URL} = -1; # delete any matching entry push(@IndexedAddresses, $URL); next ADDRESS; } my ($Title, $Description) = ('', ''); my $RecordLine = MakeRecord($URL,'',$Text, $nFileSize, \$Title, \$Description); $GlobalSpiderResults{$URL} = $RecordLine; my $ByteSize = length($Text); my ($DD,$MM,$YYYY) = unpack('A2A2A4',substr($RecordLine,2,8)); $NumRank++; my $Month = $MonthNames[$MM]; $UserResults{$URL} = &StandardVersion($NumRank, $URL, $Title, $Description, $nFileSize, $DD, $Month, $YYYY); push(@IndexedAddresses,$URL); next ADDRESS; } print '<BLOCKQUOTE>'; # All of the disk i/o is in the sub-procedure: my ($EntryCount, $Duplicates, %UpdateResults) = UpdateDB($IndexFile{$Realm}, %GlobalSpiderResults); foreach (@IndexedAddresses) { if ($UserResults{$_}) { print $UserResults{$_}; } if ($UpdateResults{$_}) { print "<BR><FONT SIZE=\"-1\">[ $UpdateResults{$_} ]</FONT>\n"; } } my %SaveLinks = SaveLinksToFile($PendingPagesFile, $Realm, @IndexedAddresses); my $textDup = ''; if ($Duplicates == 0) { $textDup = 'No entries were removed.'; } elsif ($Duplicates == 1) { $textDup = 'One older entry was removed.'; } else { $textDup = $Duplicates.' older entries have been removed.'; } print <<"EOM"; <P>There are now $EntryCount addresses in the "$Realm" search database. $textDup</P> </BLOCKQUOTE> EOM $sTitle = 'Add Another Page'; } # End if $AddSite &PrintAddRemoteSiteForm($sTitle,'',$Realm,1); print <<"EOM"; <BLOCKQUOTE> Any embedded links found by the crawler will be indexed during idle time; there is no need to submit them again. </BLOCKQUOTE> <HR SIZE="1" WIDTH="50%" NOSHADE> EOM } sub ForbidFilesFromSites { local $_; my ($BaseDir, $BaseURL, @ForbidSites) = @_; my $ForbidFileString = ''; foreach (@ForbidSites) { if (!(m!^http://!i)) { # Assume this is a full file or directory: $ForbidFileString .= ' '.$_; } elsif ($BaseURL =~ m!^$_(.*)$!i) { # Map the forbidden URL to a local file: $ForbidFileString .= ' '.$BaseDir.$1; } } $ForbidFileString =~ s!\\!/!g; return $ForbidFileString.' '; } sub LoadRealms { local $_; unless (open(FILE, "<$RealmFile")) { return 0; } binmode(FILE); while (<FILE>) { my @Fields = split(m!\|!); next unless ($Fields[0] && $Fields[1]); my $Realm = $Fields[0]; $IndexFile{$Realm} = $Fields[1]; $BaseDir{$Realm} = $Fields[2] || ''; $BaseURL{$Realm} = $Fields[3] || ''; $Exclude{$Realm} = $Fields[4] || ''; } close(FILE); } sub Append { local $_; my ($FileName, $FileText, $Persist) = @_; unless (open(FILE, ">>$FileName")) { return 0; } binmode(FILE); print FILE $FileText; close(FILE); chmod($FileMask, $FileName); return 1; } sub NixPath { local $_ = shift; s!\\!/!mg; return $_; } sub escape { shift() if ref($_[0]); my $toencode = shift; return undef unless defined($toencode); $toencode=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/eg; return $toencode; } sub Commas { local $_ = shift; unless (m!\.!) { $_ .= '.'; } while (m!(.*)(\d)(\d\d\d)(\,|\.)(.*)!) { $_ = "$1$2,$3$4$5"; } s!\.$!!o; return $_; } sub Trim { local $_ = shift; while (s!^(\r|\n|\t|\s+)!!) {} while (s!(\r|\n|\t|\s+)$!!) {} return $_; } # Use in the UI when a user needs to select a remote realm. Takes the default # realm as an argument. 3 possible return values: # <BR>: no remote realms exist # <input type=hidden> non-ui component - includes a reference to the single realm # <select></select> ui component - defaults to realm argument, alpha-sorted otherwise sub SelectRealmList { local $_; my ($Realm) = @_; my $SelectRealmText = ''; # Gather a list of all realms which don't have a BaseDir my (@RemoteRealms) = (); foreach (keys %IndexFile) { next if ($BaseDir{$_}); # local realm; push(@RemoteRealms, $_); } if (0 == (scalar @RemoteRealms)) { $SelectRealmText = '<BR>'; } elsif (1 == (scalar @RemoteRealms)) { # there's only 1 remote realm - make it the default: $SelectRealmText = "<INPUT TYPE=hidden NAME=Realm VALUE=\"$RemoteRealms[0]\">"; } else { # There are multiple realms - create an interactive select list, with the # current one selected: $SelectRealmText = "<B>Realm:</B> </TD><TD><SELECT NAME=Realm>"; foreach (sort @RemoteRealms) { $SelectRealmText .= "<OPTION"; $SelectRealmText .= " SELECTED" if ($_ eq $Realm); $SelectRealmText .= ">$_"; } $SelectRealmText .= "</SELECT>"; } return $SelectRealmText; }