--- makenh.orig	Tue Jul 28 03:21:30 1998
+++ makenh	Wed Nov  4 07:05:47 1998
@@ -68,6 +68,7 @@
 $SITE_RE = '[^:]+:\/\/([^\/]+)\/.*';
 $NumLocalCollected = 0;
 $NumRemoteCollected = 0;
+$max_redir = 6;	# upper bound on redirects followed for a single fetch
 # LOGFILE, ERRFILE -- files for logging
 
 ### *TO CHANGE TRAVERSAL*
@@ -105,6 +106,7 @@
 $LOGFILENAME = ".wg_log";
 # $STARTFILE = ".wgstart";
 $WGADDSEARCH = ".wgfilter-box";
+$SITECACHE = ".wgsitecache";	# file name handed to siteconf::LoadCache
 
 $ROBOTNAME = "HTTPGET";
 
@@ -187,22 +189,22 @@
 
 # Initialize variables to avoid warnings
    ($title, $urlpath, $traverse_type, $explicit_only, $numhops,
-    $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem) = 
-   ('','','','','','','','','','','');
+    $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset) =
+   ('','','','','','','','','','','','','');
 
 ($title, $urlpath, $traverse_type, $explicit_only, $numhops,
- $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, @urllist) = ReadConfig($archivepwd);
+ $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset, @urllist) = ReadConfig($archivepwd);
 
 # open logs
 &open_logs();
 
 print LOGFILE "From Configuration:\n";
 my(@configlist) = qw(title urlpath traverse_type explicit_only numhops
-	nhhops local_limit remote_limit addboxes vhost usemaxmem ) ;
+	nhhops local_limit remote_limit addboxes vhost usemaxmem locale charset) ;
 foreach $item (@configlist) 
 {
 	$value = '';
-	eval "$value = \$$item";
+	eval "\$value = \$$item";	# \$value must stay literal; only $item is interpolated into the code
 	print LOGFILE " $item: $value\n";
 }
 print LOGFILE " urllist: @urllist\n\n";
@@ -231,6 +233,7 @@
 $MAPFILE = "$archivepwd/$MAPFILE";
 $TEMPROBOTFILE = "$archivepwd/$TEMPROBOTFILE";
 $WGADDSEARCH = "$archivepwd/$WGADDSEARCH";
+$SITECACHE = "$archivepwd/$SITECACHE";	# prefix with $archivepwd like the files above
 
 ($archiveprot, $archivehost, $archiveport, $archivepath) = 
    &url::parse_url($archiveurl);
@@ -252,7 +255,7 @@
 
 # read in the site configuration
 &siteconf::ReadConf($vhost);
-&siteconf::LoadCache();
+&siteconf::LoadCache("$SITECACHE");	# NOTE(review): assumes siteconf::LoadCache takes the cache path -- confirm
 
 ###############
 ### PHASE 1 ###
@@ -398,7 +401,7 @@
 &close_logs();
 
 # remove the robots file
-system("rm -rf $TEMPROBOTFILE");
+unlink($TEMPROBOTFILE);	# built-in delete, no shell involved; the result is not checked
 
 #----------------------
 #change the dir back
@@ -751,7 +754,7 @@
    my($prot, $host, $port, $path) = &url::parse_url($url);
    
    # if the protocol isn't http, assume it's good
-   if($prot!~/http/i){
+   if(!defined($prot) || $prot!~/http/i){	# an undefined protocol is handled like a non-http one
       return 1;
    }
    
@@ -800,6 +803,7 @@
    my($output);
    my($olddata, $newdata);
    my($newprot, $newhost, $newport, $newpath, $url);
+   my($redcount)=0;	# redirects seen so far while fetching robots.txt
    
    # make the $url
    $url = "http://$host:$port/robots.txt";
@@ -815,6 +819,7 @@
    while($output ne ""){
       # more for error?
       if($output=~/^error/i){
+	 truncate($TEMPROBOTFILE,0);	# empty the robots file when the fetch reports an error
 	 print ERRFILE "Error with getting $url\n";
 	 #			print LOGFILE "Error with getting $url\n";
 	 last;
@@ -822,7 +827,13 @@
       
       # look at output for redirect -- store redirects in file, too
       if($output=~/^Redirect: (.*)$/){
-	 print LOGFILE "Redirected to: $1...";
+	 if ($redcount >= $max_redir) {	# redirect limit reached: give up on this fetch
+	     truncate($TEMPROBOTFILE,0);	# leave the robots file empty
+	     print ERRFILE "Too many redirections with $url\n";
+	     last;
+	 }
+	 $redcount++;	# count this redirect
+	 print LOGFILE "Redirected to: $1...\n";	# $1 is still the capture from the Redirect match above
 	 
 	 # see if we have the redirected server
 	 ($newprot, $newhost, $newport, $newpath) = &url::parse_url($1);
@@ -843,6 +854,7 @@
 	 }
       }else{
 	 # we've got it, or there's an error...
+	 truncate($TEMPROBOTFILE,0);	# non-empty output that is neither error nor redirect; NOTE(review): confirm a successful fetch never reaches here
 	 last;
       }
    }
@@ -894,6 +906,7 @@
 sub geturl2file{
    my($url) = @_;
    my($output, $link, $file, $oldfile, @aliases);
+   my($redcount)=0;	# redirects seen so far for this call
    
    # check if we have that in stock (we know it's not local)
    if (defined($URL2FILE{$url})) {
@@ -930,6 +943,7 @@
       while($output ne ""){
 	 # more for error?
 	 if($output=~/^error/i){
+	    truncate($file,0);	# empty $file when the fetch reports an error
 	    print ERRFILE "Error with getting $url: $output\n";
 	    #				print LOGFILE "Error with getting $url\n";
 	    last;
@@ -937,6 +951,12 @@
 	 
 	 # look at output for redirect -- store redirects in file, too
 	 if($output=~/^Redirect: (.*)$/){
+	    if ($redcount >= $max_redir) {	# redirect limit reached: stop fetching $url
+		truncate($file,0);	# leave $file empty
+		print ERRFILE "Too many redirections with $url\n";
+		last;
+	    }
+	    $redcount++;	# one more redirect seen for this request
 	    &ungetnewname();	# rewind the name counter		
 				# The next get will overwrite the unnecessary file
 	    
@@ -970,6 +990,7 @@
 	    }
 	 }else{
 	    # we've got it, or there's an error...
+	    truncate($file,0);	# non-empty output that is neither error nor redirect; NOTE(review): confirm a successful fetch never reaches here
 	    last;
 	 }
       }
@@ -1159,6 +1180,15 @@
       ($prot, $host, $port, $path) = &url::parse_url($url);
       #print "URL after parsing: $prot://$host:$port$path\n";
       
+      next if !defined($prot);	# parse_url returned no protocol: skip this entry
+      if (!defined($port) || ($port eq '80' && $prot eq 'http') ||	# missing or scheme-default port
+	  ($port eq '443' && $prot eq 'https') ||	# https defaults to 443, not 80
+	  ($port eq '21' && $prot eq 'ftp')) {
+	$port = '';	# omit it from the rebuilt URL
+      } else {
+	$port = ":$port";	# keep an explicit non-default port, separator included
+      }
+
       # make sure the path has a preceding /
       $path = "/$path" if $path!~/^\//;
       
@@ -1177,7 +1207,7 @@
 #      $host = "$a.$b.$c.$d";
 #      }
       
-      $url = "$prot://$host:$port$path";
+      $url = "$prot://$host$port$path";	# $port is either '' or ":N" at this point
       #print "URL after normalization: $url\n";
       
       # strip off any #text
