#!/usr/bin/perl
require 5;
use Socket;

# Fluid Dynamics Search Engine, Version 2.x
# Copyright 1997-2000 by Fluid Dynamics. Please adhere to the copyright
# notice and conditions of use, described in the attached help file and
# hosted at the URL below. For the latest version and help files, visit:
#     http://www.xav.com/scripts/search/
# ___________________________________________________________________________

my ($CryptPassword, $AllowSetPassword, $AllowAnonAdd);

# Lines 17 and 18 (NOTE(review): the setup instructions presumably cite these
# two settings by line number, so keep them on lines 17 and 18 - confirm):

$CryptPassword = '$ZR.QKQhGFFB6';
$AllowSetPassword = 0; # 1 -> YES; 0 -> NO

$AllowAnonAdd = 0;

# This search engine is managed from the web, and it comes with a password to
# keep it secure.
#
# You will be given a password when you first visit this script using the
# special "Mode=Admin" query string - for example:
#
#     http://my.host.com/search.pl?Mode=Admin
#
# On your first visit, the script gives instructions on how to set the
# password variable above.
# ___________________________________________________________________________
#
# Security Settings:

my $AllowClearTextAuth = 0;
my $AllowDebug = 1;
my $VERSION = '2.0.0.0001';
my $Timeout = 50; # seconds.
# Script-wide working state. ReadInput() is defined elsewhere in this file;
# its result is kept in %FORM.
my ($pq, $O);
my $SetSaveLinks = 1;
my $bUseClearTextAuth = 0;
my (@HITS, @SearchTerms, %RAW, %SessionCookie, @GlobalSavedLinks, %GlobalSpiderResults) = ();
my %FORM = ReadInput();
my $LimitSite = '';
my $WildCard = 'thewildcardisaveryspecialcharacter';
my $WildSearch = '([^\s+]{0,4})';

# ___________________________________________________________________________
#
# Robot and Search Settings:

# Falls back to 'search.pl' when the SCRIPT_NAME environment variable is
# unset or empty.
my $SCRIPT_NAME = ($ENV{'SCRIPT_NAME'} || 'search.pl');
my $SearchTipsPage = $SCRIPT_NAME;
my $REQUEST_METHOD = 'POST';

my %Rules = ();
$Rules{'Max Index File Size'} = 10000000; # 10mb default
$Rules{'Hits Per Page'} = 10;
$Rules{'Multiplier: Title'} = 10;
$Rules{'Multiplier: Keyword'} = 10;
$Rules{'Multiplier: Description'} = 4;
$Rules{'Minimum Page Size'} = 128; # bytes
$Rules{'Max Characters: URL'} = 128;
$Rules{'Max Characters: Title'} = 96;
$Rules{'Max Characters: Description'} = 384;
$Rules{'Max Characters: Auto Description'} = 150;
$Rules{'Max Characters: Keywords'} = 256;
$Rules{'Max Characters: File'} = 64000;
$Rules{'Forbid All Cap Titles'} = 1; # 1 -> YES; 0 -> NO
$Rules{'Forbid All Cap Descriptions'} = 1;
$Rules{'Crawler: Minimum WhiteSpace'} = 0.01;
$Rules{'Crawler: Max Pages Per Batch'} = 12;
$Rules{'Crawler: Max Redirects'} = 6;
$Rules{'Crawler: Days Til Refresh'} = 30;
$Rules{'Crawler: User Agent'} = 'Mozilla/4.0 (compatible: FDSE robot)';
$Rules{'Crawler: Follow Query Strings'} = 0; # 1 -> YES; 0 -> NO

# This is a pipe-delimited list of lowercase file extensions. Links to files
# with these extensions will not be treated as document-type links by the
# crawler; using this list will make crawl sessions faster, and will keep the
# pending pages file small:

$Rules{'Crawler: Ignore Links To'} = 'gif|jpg|js|css|mp3|wav|zip|exe|doc|xls|pdf';

# When the crawler hits a page, it gathers a list of all the links on the page
# for future searching. This list can be overwhelming.
# To reduce the size of the list of links the crawler remembers, set
# Follow Offsite Links to 0, and then the crawler will only remember
# links to the same host:

$Rules{'Crawler: Follow Offsite Links'} = 1; # 0 -> No, 1 -> Yes
$Rules{'Crawler: Rogue'} = 0; # 1 -> Disregard robots exclusion rules
$Rules{'Index ALT Text'} = 1;
$Rules{'Index Links'} = 0;

# The following words will be ignored when entered as part of a query.
# Each word is listed once (a duplicate 'then' entry was dropped):

my @IgnoredWords = qw(
    your you www with will why who which where when what web we was want w
    used use two to this they these there then them their the that than t
    so site should see s
    re
    quot
    page
    our other org or only one on of
    now not no new net nbsp name n
    my ms mrs mr most more me may
    lt like
    just
    its it is in if i
    http how he have has
    gt get
    from for find
    ed
    do d
    com can
    by but been be b
    at as are any and an amp also all after about a
    5 2 1 0
);

# ___________________________________________________________________________
#
# File Control Section:

# The writable folder where all data files are stored. Use the path relative
# to this script:

my $DataFilesDir = 'searchdata';

# List the URLs or paths that you do not want searched.
# Use all forward slashes in the entries of the forbidden list:

my @ForbidSites = (
    'http://www.umsl.edu/studentlife/current/forums',
    '/webstuff/htdocs/studentlife/current/forums',
    'http://www.umsl.edu/studentlife/current/forums/student/messages',
    '/webstuff/htdocs/studentlife/current/forums/student/messages',
    'http://www.umsl.edu/studentlife/current/forums/issues/messages',
    '/webstuff/htdocs/studentlife/current/forums/issues/messages',
    'http://www.umsl.edu/studentlife/current/wwwboard/messages',
    '/webstuff/htdocs/studentlife/current/wwwboard/messages',
    'http://www.umsl.edu/studentlife/current/css_web_design',
    '/webstuff/htdocs/studentlife/current/css_web_design',
);

# List URLs which should receive higher ranking (cannot use paths here):

my @PromoteSites = (
    'http://www.xav.com/',
    'http://www.microsoft.com/',
);

# Enter the rank multiplier for PromoteSites (values 2 through 99):

$Rules{'Promote Value'} = 20;

# Local files only - enter the file extensions to be searched. Separate list
# of extensions by space:

my $EXT = ' htm html ';

# Local files only - specify whether to allow non-text files. This requires
# both the variable below and the extension listed in $EXT above:

my $AllowBinaryFiles = 1; # 1 -> YES; 0 -> NO

# See http://www.xav.com/scripts/search/admin_help.html#symlink

my $AllowSymbolicLinks = 1; # 1 -> Yes; 0 -> NO
my $TrustSymbolicLinks = 0; # 1 -> Yes; 0 -> NO

# End File Control Section.
#
# See also the $Exclude{'Realm'} options below to set ForbidSites file
# exclusions on a per-Realm basis.
# ___________________________________________________________________________
#
# Realms Section:

# All the Realm information is stored in a |-delimited file whose name is
# given by $RealmFile:

my (%IndexFile, %BaseDir, %BaseURL, %Exclude) = ();
my (%HashIP) = ();
my $RealmFile = 'search.realms.txt';

# ___________________________________________________________________________
#
# This HTML text appears at the top of every page.
It's a good place to # declare custom styles like background color and font type: my $Header = <<"EOM";
|
![]() |
|
Results: No documents were found.EOM # ___________________________________________________________________________ # my $SearchTipsText = <<"EOM";
|
Search Rules This search engine helps you find documents on this website and related sites. Here's how it works: you tell the search service what you're looking for by typing in keywords, phrases, or questions in the search box. The search service responds by giving you a list of all the Web pages in our index relating to those topics. The most relevant content will appear at the top of your results. How To Use:
Here's an example:
Tip: Don't worry if you find a large number of results. In fact, use more than a couple of words when searching. Even though the number of results will be large, the most relevant content will always appear at the top of the result pages. More Basics - An Overview Here's a quick overview of the rest of our Basic Help. Just click on the links to jump to these sections.
What is an 'Index'? |
Data Folder Required
EOM unless ((-e $DataFilesDir) and (-d $DataFilesDir)) { # if don't even exist! print <<"EOM";This script requires a writable folder named "$DataFilesDir".
Please create a folder with this name (and give it write permissions).
Need help? Visit http://www.xav.com/scripts/search/admin_help.html.
EOM } else { print <<"EOM";This script requires a writable folder named "$DataFilesDir".
A folder exists with that name, but it isn't readable and writable.
Give this folder RWX permissions for Everyone. Your ISP can usually assist with this.
Need help? Visit http://www.xav.com/scripts/search/admin_help.html.
EOM } print "