#!/usr/bin/perl -w
# Sirobot
#   28.05.99 by Settel <settel@sirlab.de>

# done #fold00
# [recent improvements not listed in CHANGELOG]
#
# -new options: --dump, --nodump, --dumpfile <file>
#

# todo #fold00
# -option --exec: filter STDERR
# -use strict;
# -unescape %20 etc in filenames?
# -suppress warnings generated by LWP which distort our screen (esp. /robots.txt parsing error message)
# -make $overwrite_mode local for every job
# -option --nowait to override waiting for a key when finished
# -option --last that finishes all pending downloads (daemon mode)
# -allow user keystrikes eg to redraw curses screen, issue a --last, ...
# -multiple streams
# -erase empty files (eg. after 404s)
# -<img src="folder1/folder2/\nfile.ext">
# -decrease number of screenupdates for better performance
# -get javascript, CSS links (child_get_links(), child_convert_links())
#  <script language="JavaScript" src="data/javabar.js" type="text/javascript" defer>
#  <link rel=stylesheet type="text/css" href="format.css">
# -improve sigint handler (dumps some uninitialized values :( )
# -quotes in regexps for --exclude and --include when reading from file
# -shift returncode from child
# -Gimmick: Anzeige in ps f ndern

# Includes #fold00
use LWP::RobotUA;
use URI;
use HTTP::Headers;
use HTTP::Request;
use HTTP::Date;
use IO::Pipe;
use IO::Select;          
use IO::File;
use POSIX (":sys_wait_h",":fcntl_h"); # needed for waitpid(,WNOHANG)
use Fcntl;               # to change pipes to O_NONBLOCK

# Curses are included later in section init and only if desired by the user!

# Settings #fold00
# $Revision: 1.50 $  $Date: 2000/10/10 23:54:45 $
$version="0.15.1";
$versiondate="15.10.2000";
$robot="yes";
$botname="sirobot/$version($versiondate)";
#$from='<settel@sirlab.de>';
$win=0;  # INCOMPLETE: adapt Sirobot to Windows

# set default values                                    
$mode=0;        # same dir  (see enum mode)
$retries=0;     # no retries
$depth=1;       # default depth
$verbose=0;     # see enum verbose
$maxjobs=5;     # max. number of jobs to run parallel
$prefix="";     # prefix for all files written
$splitstream=0; # split download into several streams? (0: no) [not implemented yet]
$showstats=1;   # show statistics when done
$convert=0;     # don't convert absolute links
undef %headers; # no user set headers
undef $exclude; # no files excluded by default
$use_curses=1;  # use curses library to display stuff
$wait_for_key=0;# wait for key after the download is done and before exitting
$dump=0;        # dump links to file (1) or nor (0)
undef $dumpfile;# dump to file (defined) or to stdout (undef)
$callback_blocksize=4096;          # chunksize used by the LWP callback
undef $logfile;
$overwrite_mode=0;                 # see enum overwrite_mode (no overwrite)
$removelinksfromfile=0;            # don't remove processed links for --file
$default_index_name="index.html";  # what to use, if URL points to a dir
$daemon=0;                         # no daemon mode
$daemon_pipe="/tmp/sirobot";       # default path of the pipe
undef $proxy;                      # no proxies so far
undef $ftp_proxy;
undef $noproxy;

# init #fold00

# other global variables
$appname=(split(/\//,$0))[-1];  # get prgname without path

# set From: line for HTTP headers
defined($ENV{'USER'}) and $from=$ENV{'USER'} or $from='unknown';         
$from.='@';
defined($ENV{'HOSTNAME'}) and $from.=$ENV{'HOSTNAME'} or $from.='unknown';

undef $is_child;
$childcnt=0;        # no childs forked
@joblist_todo=();   # no jobs so far
@joblist_fetch=();
@joblist_done=();              
%fetched_links=();  # URL->job translation
%child_job_list=(); # pid -> job translation

$stats_success=0;   # reset statistics
$stats_skipped=0;
$stats_updated=0;
$stats_error=0;
$stats_servererror=0;
$stats_servererror404=0;
$stats_servererrorauth=0;
$stats_links_rejected=0;
$stats_links_excluded=0;
$stats_links_unsupp_scheme=0;

undef $daemon_pipe_fd;

# install own signal handler
$SIG{'INT'}=\&sigint_handler;
$SIG{'CHLD'}=\&sigchild_handler;
$SIG{'WINCH'}=\&sigwinch_handler; # changes of window size
#$SIG{'__DIE__'}='cleanup';

$|=1;               # flush output immediatly after prints

# set to 1 later if this process is a child and shouldn't do screen outputs
$curses_child_nooutput=0; 

# read ~/.sirobotrc 
my($home)=$ENV{'HOME'};
$home||=".";
read_options_from_file(0,$home."/.sirobotrc") if(-r $home."/.sirobotrc");

# process command line options
parse_cmdline();

if(defined($logfile)){
   $log_fd=IO::File->new(">>".$logfile);
   unless(defined($log_fd)){
      msg_err("couldn't create logfile: $!\n");
      undef $logfile;
      cleanup();
   }
   $log_fd->autoflush();
}

if($use_curses){
   my($ret)=eval{require Curses;};
   
   if(!defined($ret)){
      $use_curses=0;
      msg_err("Couldn't find Curses library, fallback to old interface\n");
      msg_info("Please put --nocurses into your ~/.sirbotrc to get rid of this message\n");
   }elsif($ret<=0){
      $use_curses=0;
      msg_err("Error while loading Curses library, fallback to old interface\n");
      msg_info("Please put --nocurses into your ~/.sirbotrc to get rid of this message\n");
   }else{
      unless(-t STDOUT){
         msg_err("stdout is no tty, fallback to old interface\n");
         msg_info("Please put --nocurses into your ~/.sirbotrc to get rid of this message\n");
         $use_curses=0;
      }else{
         Curses::initscr();
         Curses::clear();
         Curses::refresh($Curses::curscr); # don't really need this but 
         # without we get Curses::curscr used only once.
         
         $curses_y=0;  # current line for output
         $curses=1;    # tell 'em, we use Curses from now on!
      }
   }
}

if($daemon){
    msg_debug("waiting for FIFO $daemon_pipe to get ready\n");
    unless($daemon_pipe_fd=IO::File->new($daemon_pipe,O_NONBLOCK)){
        msg_err("can't open pipe $daemon_pipe: $!\n");
        cleanup();
        exit(1);
    }
    msg_info("Sirobot is now in daemon mode.\n");
}

######################################################################### #FOLD00
# main #fold00
my($exitflag)=0;
while(1){
   my($job);
   
   while(1){
      # print out some useful informations about what we've done so far
      if(defined($curses)){
         progress_update();
      }else{
         msg_stats(scalar(@joblist_todo)," todo, ",scalar(@joblist_fetch)," fetch, ",scalar(@joblist_done)," done\n");
      }
      
      # Start another job?
      if(scalar(@joblist_todo)>0 && $childcnt<$maxjobs){
         $job=shift(@joblist_todo);
         last;
      }
      
      # exit if we're done
      if($childcnt==0 && scalar(@joblist_todo)==0 && $daemon==0){  
         $exitflag=1;
         last;
      }
      wait_for_child();
   }
   last if($exitflag);
   
   push(@joblist_fetch,$job);

   msg_info("fetching $job->{'URI'}\n") unless(defined($curses));
   start_job($job);
}

msg_info("download finished\n");
Curses::getch() if(defined($curses) && $wait_for_key);
cleanup();

if($showstats){
   msg_info("done.\n");
   print("Statistics: \n");
   printf("%4i successful download".plural_s($stats_success)."\n",$stats_success);
   printf("%4i existing file".plural_s($stats_skipped)." skipped\n",$stats_skipped);
   printf("%4i existing file".plural_s($stats_updated)." updated (--newer)\n",$stats_updated);
   print("\n");
   
   printf("%4i file".plural_s($stats_servererror404)." not found (HTTP error 404)\n",$stats_servererror404);
   printf("%4i authorization failed (HTTP error 401)\n",$stats_servererrorauth);
   printf("%4i total server error".plural_s($stats_servererror)."\n",$stats_servererror);
   printf("%4i other error".plural_s($stats_error)." (aborted download, can't create file or path, ...)\n",$stats_error);
   print("\n");
   
   printf("%4i link".plural_s($stats_links_rejected)." rejected by mode ",$stats_links_rejected);
   print(("--samedir","--sameserver","--samedomain","--anyserver")[$mode],"\n");
   printf("%4i link".plural_s($stats_links_excluded)." excluded.\n",$stats_links_excluded);
   printf("%4i ",$stats_links_unsupp_scheme);
   print("link has") if($stats_links_unsupp_scheme==1);
   print("links have") if($stats_links_unsupp_scheme!=1);
   print(" unsupported scheme (https, mailto, news, ...)\n");
   
   print("\n");
   printf("%4i job".plural_s(scalar(@joblist_done))." processed in total\n",scalar(@joblist_done));
}else{
   msg_info("done, ",scalar(@joblist_done)," job".plural_s(scalar(@joblist_done))." processed.\n");
}

exit 0;

# cleanup #fold00
sub cleanup{
   if(defined($is_child)){  # clean up child's stuff
      if(defined($child_job)){
         close($child_job->{'Pipe'}) if($child_job->{'Pipe'});
         undef $child_job;
      }
   }else{ # clean up parent's stuff
      if(defined($childcnt) && $childcnt>0){   # no childs to wait for
         msg_info("cleanup: waiting for $childcnt child(s) to shut down\n");
         while($childcnt>0){
            wait_for_child(1);
            $childcnt--;
         }
      }
      if(defined($daemon_pipe_fd)){
         close($daemon_pipe_fd);
         undef $daemon_pipe_fd;
      }
      
      if(defined($curses)){
         Curses::endwin();
         undef $curses;
      }
   }
   if(defined($log_fd)){
      close($log_fd);
      undef $log_fd;
   }
}

# start_job #fold00
sub start_job{
   my($job)=shift;
   my($pid);
   
   unless(defined($job->{'Pipe'}=IO::Pipe->new())){
      msg_err("can't create pipe: $!\n");
      msg_err("job ",$job->{'URI'}," not started!\n");
      job_done($job,3,"can't create pipe: $!\n");
      return;
   }
   
   if(!defined($pid=fork())){       # error
      msg_err("couldn't fork: $!\n");
      msg_err("job ",$job->{'URI'}," not started!\n");
      job_done($job,3,"couldn't fork: $!");  # unsuccessful fetch
      return;
   }elsif($pid>0){   # parent
       $childcnt++;
       $child_job_list{$pid}=$job;
      
      $job->{'Pipe'}->reader();
      fcntl($job->{'Pipe'},F_SETFL,O_NONBLOCK) unless($win);
      return;
   }
   
   #########################################################################
   # child 
   $curses_child_nooutput=1 if(defined($curses));
   $is_child=1;
   
   msg_debug("child forked successfully\n");
   @joblist_todo=();  # free (some) unused memory
   @joblist_fetch=();
   @joblist_done=();
   
   child_init(); # set child's global variables
   
   my($fd)=$job->{'Pipe'}; # init pipe
   $fd->writer();
   $fd->autoflush(1);
   
   # measure time and call the actual child subrouts
   $job->{'Time'}=time();
   my($ret,$message)=child_do_job($job,$fd);
   msg_debug("child completed download\n");
   print $fd ("TIME ",time()-($job->{'Time'}),"\n");

   if($ret==5){ # special handling for redirects
      print $fd ("RET 5 $message\n");
      close($fd);
      msg_debug("child done\n");
      exit(0);
   }elsif($ret==2 || $ret==7){ # file not changed (option --newer)
      print $fd ("RET 2\n");
   }elsif($ret==0 || $ret==6){
      print $fd ("RET 0\n");
   }else{   # common errors (file too short, ...)
      print $fd ("RET $ret $message\n");
      close($fd);
      msg_debug("child done\n");
      exit($ret);
   }
   
   if($ret==6 || $ret==7){
      my($ret,$message);
      
      msg_debug("child executes command\n");
      ($ret,$message)=child_exec($job,$fd);
      if($ret>0){
         print $fd ("DONE $ret $message\n") ;
      }
      msg_debug("child extracts links\n");
      ($ret,$message)=child_get_links($job,$fd);
      if($ret>0){
         print $fd ("DONE $ret $message\n") ;
      }elsif($ret==0){
         print $fd ("DONE\n") ;
      }
      print $fd ("STATS ");
      print $fd ($child_stats_excluded," ");
      print $fd ($child_stats_unsupp_scheme," ");
      print $fd ($child_stats_rejected,"\n");
   }
   if(defined($job->{'LastModified'})){
      my($filename)=get_filename_from_job($job);
      msg_debug("setting utime ",$job->{'LastModified'}," for $filename\n");
      unless(utime(time(),$job->{'LastModified'},$filename)){
         msg_err("couldn't set modification time: $!\n");
      }
   }
   
   close($fd);
   
   msg_debug("child done\n");
   exit(0);
}

# wait_for_child #fold00
sub wait_for_child{
   my($abort)=shift;
   my($job);
   
   # first, check if there's a zombie waiting. Order is important!
   $sigchild_caught=0; 
   my($pid,$status)=(get_waiting_pid(),$abort);
   if(defined($pid)){
       finish_child($pid,$status);
       return;
   }
   
   # create list of FDs we'd like to check for
   my($select)=IO::Select->new();
   my(%handles,@ready);
   my($selectcnt)=0; # how many fds we're waiting for
   foreach (@joblist_fetch){
      $select->add($_->{'Pipe'});
      $handles{$_->{'Pipe'}}=$_; # create reverse mapping (file->job) table
      $selectcnt++;
   }
   if($daemon==1){
      $select->add($daemon_pipe_fd);
      $selectcnt++;
   }
   
   msg_debug("waiting...\n");

   # read pipes until at least one child exits
   my($timeout,$flag,$fd);
   my($daemon_disabled)=0;
   while($sigchild_caught==0){      # unless we get a SIGCHLD
      $timeout=1;
      
      if($selectcnt>0){
         @ready=$select->can_read($timeout);
      }else{
         sleep($timeout);
         @ready=();
      }

      if($daemon_disabled){
         $select->add($daemon_pipe_fd); #re-enable daemon
         $selectcnt++;
         $daemon_disabled=0;
      }

      foreach $fd (@ready){
         if(defined($job=$handles{$fd})){ # get the child's messages
            my($input);
            $flag=1;
            while(defined($input=<$fd>)){
               $flag=0;
               # PROGRESS-Messasges are processed directly, the rest is saved
               if($input=~m/^PROGRESS (\d+) (\d+) (-?\d+) (-?\d+)/){
                  $job->{'ProgressPos'}=$1;
                  $job->{'ProgressFile'}=$2;
                  $job->{'ProgressShould'}=$3 if($3>0);
                  $job->{'ProgressBPS'}=$4;
                  progress_update() if(defined($curses));
               }elsif($input=~m/PROGRESSSTATE (\w+)/){
                   my($state)=$1;
                   $job->{'ProgressState'}=1 if($state eq 'images');
                   $job->{'ProgressState'}=2 if($state eq 'links');
                   $job->{'ProgressState'}=5 if($state eq 'exec');
                   progress_update() if(defined($curses));
               }else{
                  msg_debug("[PIPE]: $input");
                  $job->{'Piperead'}.=$input;
               }
            }
            if($flag){
               $select->remove($fd);
               $selectcnt--;
            }
         }elsif($daemon==1 && $fd eq $daemon_pipe_fd){ # read named pipe
            $flag=1;

            # We've got something to read from the named pipe. Get it.
            my(@newargs)=();
            while(defined($_=<$daemon_pipe_fd>)){
               $flag=0;
               chomp($_);
               next if(m/^\s*#/); # ignore comments
               next if(m/^\s*$/); # ignore empty lines
               foreach (split(/\s/,$_)){
                  next unless(defined($_) && $_ ne ''); # skip spaces
                  push(@newargs,$_);
               }
            }

            # workaround because closed FDs always returns state 
            # readable :-(  Disable for 1 sec.
            if($flag){ 
               $select->remove($daemon_pipe_fd);
               $selectcnt--;
               $daemon_disabled=1;
               next;
            }
            msg_debug("read from named pipe: \n",join('\n',@newargs),"\n");
            parse_commands(0,undef,@newargs);
            Curses::refresh($Curses::curscr) if(defined($curses));
            return;
         }else{
            msg_warn("wait_for_child(): FD not defined!\n");
         }
      }
   }
   
   # go, get the child's status, links etc...
   finish_child(get_waiting_pid(),$abort);
}

# get_waiting_pid #fold00
sub get_waiting_pid{
   my($pid);
   
   unless($win){
       $pid=waitpid(-1,&WNOHANG);
       return unless($pid>0);
   }else{
       $pid=waitpid(-1,&WNOHANG);
       unless(defined($child_job_list{$pid})){ 
           $pid=-$pid; # Ugly hack! ActivePerl seems to return negative PID?!
           unless(defined($child_job_list{$pid})){
               return;
           }
       }
   }
   return($pid,$?);
}

# finish_child #fold00
sub finish_child{
   my($pid)=shift;
   my($exitstatus)=shift;
   my($abort)=shift;
   $childcnt--;
   
   msg_debug("child $pid exited with status $exitstatus\n");
   my($job)=$child_job_list{$pid};
   delete $child_job_list{$pid};
   
   # get the rest of what's in the pipe
   $job->{'Piperead'}||='';
   my($fd)=$job->{'Pipe'};
   while(defined($_=<$fd>)){
      chomp($_);
      $job->{'Piperead'}.=$_."\n";
   }
   close($job->{'Pipe'});
   $job->{'Pipe'}=undef;
   
   return if(defined($abort)); # if called by cleanup(), flush pipes only
   
   # tell the user what we got on the pipe except for PROGRESS messages
   foreach (split(/\n/,$job->{'Piperead'})){
      msg_debug("[PIPE] \"$_\"\n") unless(m/^PROGRESS /);
   }
   
   my($statustext);
   my($done_code);
   
   if($exitstatus==0){ # child was successful
      my($retstatus)=0;
      my($retmsg);
      foreach (split(/\n/,$job->{'Piperead'})){
         if(m/^RET (-?\d+)/){
            $retmsg=$_;
            $retstatus=$1;
            last;
         }
      }
      
      if($retstatus==5){ # redirect
         unless($retmsg=~m/^RET 5 (.+)$/){
            msg_err("got redirect from child but no new location\n");
            return;
         }
         my($new_loc)=form_uri($1);
         unless(defined($new_loc)){
             job_done($job,5);
             $stats_links_unsupp_scheme++;
             return;
         }
         $job->{'RedirectCnt'}||=0;
         if(++$job->{'RedirectCnt'}>10){
            msg_err("Redirect loop detected!");
            job_done($job,5);
            return;
         }
         
         if(check_double_uris($new_loc)){
            push(@joblist_todo,
                 new_job_entry(
                               {'URI'        => $new_loc,
                               'Depth'       => $depth,
                               'Mode'        => $job->{'Mode'},
                               'Retries'     => $job->{'MaxRetries'},
                               'MaxRetries'  => $job->{'MaxRetries'},
                               'BaseURI'     => $job->{'BaseURI'},
                               'Prefix'      => $job->{'Prefix'},
                               'Exclude'     => $job->{'Exclude'},
                               'Headers'     => $job->{'Headers'},
                               'Convert'     => $job->{'Convert'},
                               'RedirectCnt' => $job->{'RedirectCnt'},
                               'FromFile'    => $job->{'FromFile'},
                               'OriginalURI' => $job->{'OriginalURI'},
                               'Exec'        => $job->{'Exec'},
                               }));
         }
         job_done($job,7);
         return;
      }elsif($retstatus==2){ # option --newer
         $stats_updated++;
      }else{
         $stats_success++;
      }
      read_links($job,$pid);
      if(defined($job->{'Convert'}) && $job->{'Convert'}>0){
          $job->{'ProgressState'}=3;
          progress_update() if(defined($curses));
          child_convert_links(
                              $job->{'URI'},
                              $job->{'Mode'},
                              $job->{'BaseURI'},
                              $job->{'Prefix'}
                             );
      }
      job_done($job,2);
   }else{
      $done_code=4;  # fetch not ok by default
      undef $statustext;
      
      foreach (split(/\n/,$job->{'Piperead'})){
         if(m/^RET (-?\d+)/){
            my($retstatus)=$1;
            # do some statistics
            
            $stats_success++ if($retstatus==0);
            if($retstatus==1){
               $stats_skipped++;
               $done_code=6;
            }
            if($retstatus==3){
               $stats_servererror++;
               if(m/^RET -?\d+ (\d+)/){
                  if($1==404){
                     $done_code=5;
                     $stats_servererror404++;
                  }
                  $stats_servererrorauth++ if($1==401);
               }
            }
            $stats_error++ if ($retstatus<0 || $retstatus==4);
            
            
            if(m/^RET -?\d+ (.+)$/){
               $statustext=$1;
            }
            if($retstatus==4){ # fetch failed, retry
               if($job->{'Retries'}>0){
                  $job->{'Retries'}--;
                  push(@joblist_todo,
                       new_job_entry(
                                     {'URI'        => $job->{'URI'},
                                     'Depth'       => $job->{'Depth'},
                                     'Mode'        => $job->{'Mode'},
                                     'Retries'     => $job->{'Retries'},
                                     'MaxRetries'  => $job->{'MaxRetries'},
                                     'BaseURI'     => $job->{'BaseURI'},
                                     'Prefix'      => $job->{'Prefix'},
                                     'Exclude'     => $job->{'Exclude'},
                                     'Headers'     => $job->{'Headers'},
                                     'Convert'     => $job->{'Convert'},
                                     'RedirectCnt' => $job->{'RedirectCnt'},
                                     'FromFile'    => $job->{'FromFile'},
                                     'OriginalURI' => $job->{'OriginalURI'},
                                     'Exec'        => $job->{'Exec'},
                                     }));
               }
            }
         }elsif(m/^STATS (\d+) (\d+) (\d+)/){
            $stats_links_rejected+=$3;
            $stats_links_excluded+=$1;
            $stats_links_unsupp_scheme+=$2;
         }elsif(m/^DONE \d+/){
            if(m/^DONE \d+ (.+)$/){
               $statustext=$1;
            }
            job_done($job,$done_code,$statustext);
            close($job->{'Pipe'});
            $job->{'Pipe'}=undef;
            return;
         }
      }
      
      undef $job->{'Piperead'};
      job_done($job,$done_code,$statustext);
      return;
   }
   return;
}

# job_done #fold00
sub job_done{
   my($job)=shift;
   $job->{'Status'}=shift;
   $job->{'Statustext'}=shift;
   
   my($i);  # job aus joblist_fetch entfernen
   for($i=0;$i<@joblist_fetch;$i++){
      if($job==$joblist_fetch[$i]){
         splice(@joblist_fetch,$i,1);
         last;
      }
   }
   
   push(@joblist_done,$job);
   remove_link($job) if(defined($job->{'FromFile'}) && $job->{'Status'}==2);
   
   msg_info("$job->{'URI'}: ",
            ("waiting","fetching","ok","internal error",
             "incomplete","not found","file exists","redirect")[$job->{'Status'}],
             ".\n");
}

# read_links #fold00
sub read_links{
   my($job)=shift;
   my($pid)=shift;
   
   my($link,$depth);
   $job->{'ProgressState'}=4;
   progress_update() if(defined($curses));
   foreach (split(/\n/,$job->{'Piperead'})){
      if(substr($_,0,5) eq 'LINK '){
         $_=substr($_,5);
         ($link,$depth)=split(/ /,$_);
         $depth=0 unless(defined($depth));
         $link=form_uri($link);
         if(!defined($link)){
             $stats_links_unsupp_scheme++;
         }elsif(check_double_uris($link)){
             my($newjob)=new_job_entry({'URI'        => $link,
                                        'Depth'       => $depth,
                                        'Mode'        => $job->{'Mode'},
                                        'Retries'     => $job->{'Retries'},
                                        'MaxRetries'  => $job->{'MaxRetries'},
                                        'BaseURI'     => $job->{'BaseURI'},
                                        'Prefix'      => $job->{'Prefix'},
                                        'Exclude'     => $job->{'Exclude'},
                                        'Headers'     => $job->{'Headers'},
                                        'Convert'     => $job->{'Convert'},
                                        'RedirectCnt' => $job->{'RedirectCnt'},
                                        'FromFile'    => $job->{'FromFile'},
                                        'OriginalURI' => $job->{'OriginalURI'},
                                        'Exec'        => $job->{'Exec'},
                                        });
                 
             if(!$dump){ # dump links to file?
                 push(@joblist_todo,$newjob); # no
                 next;
             }

             my($fd);
             if(defined($dumpfile)){
                 $fd=new IO::File($dumpfile,O_WRONLY|O_APPEND|O_CREAT);
                 unless(defined($fd)){
                     msg_err("can't dump links to file $dumpfile: $!\n");
                     $dump=0;
                     redo;
                 }
             }else{
                 $fd=\*STDOUT;
             }
             print $fd ("-d$depth ");
             print $fd ("$link\n");

             if(defined($dumpfile)){
                 close($fd);
             }
             push(@joblist_done,$newjob);
             next;
         }
      }elsif(m/^STATS (\d+) (\d+) (\d+)/){
          $stats_links_rejected+=$3;
          $stats_links_excluded+=$1;
          $stats_links_unsupp_scheme+=$2;
      }
   }
   undef $job->{'Piperead'};
   $job->{'ProgressState'}=0;
   progress_update() if(defined($curses));
}

# check_double_uris #fold00
sub check_double_uris{
   my($uri)=shift;
   
   # check if URI is already marked for download/downloaded
   return 0 if(defined($fetched_links{$uri}));
   return 1;
}

# sigint_handler #fold00
sub sigint_handler{
   msg_warn("sigint received, exiting\n");
   cleanup();
   exit(1);
}


# sigchild_handler #fold00
sub sigchild_handler{
   msg_debug("sigchild received\n");
   $sigchild_caught=1;
}

# sigwinch_handler #fold00
sub sigwinch_handler{
   msg_debug("sigwinch received\n");
   if($curses){
       # workaround: Curses for Perl doesn't look like it supports
       # resizeterm(); or similar functions so we need to close the screen
       # and it start again :-(
       undef $curses;
       Curses::endwin();
       
       Curses::initscr();
       Curses::erase();
       Curses::clear();
       Curses::refresh();
       $curses=1;
       $curses_y=0;
       
       progress_update();
       msg_debug("refresh\n");
   }
}


# new_job_entry #fold00
sub new_job_entry{
   my(%job)=%{$_[0]};
   
   $job{'Depth'}||=$depth;
   $job{'Retries'}||=$retries;
   $job{'MaxRetries'}||=$retries;
   $job{'BaseURI'}||=$job{'URI'};
   $job{'OriginalURI'}||=$job{'URI'};
   $job{'Mode'}||=$mode;
   $job{'Prefix'}||=$prefix;
   $job{'Status'}=0;
   $job{'Convert'}||=$convert;
   $job{'RedirectCnt'}||=0;
   undef $job{'Statustext'};
   $job{'ProgressState'}=0;
   
   $fetched_links{$job{'URI'}}=\%job; 
   return \%job;
}

# remove_link #fold00
sub remove_link{
   my($job)=shift;
   my($fromfile)=$job->{'FromFile'};
   my($uri)=$job->{'OriginalURI'};
   my(@content);
   
   msg_debug("removing link from file: $uri\n");
   unless(open(FROMFILE,$fromfile)){
      msg_warn("couldn't open $fromfile: $!, \n");
      msg_warn("link $uri not removed.\n");
      return;
   }
   @content=<FROMFILE>;
   close(FROMFILE);
   
   # deactivate it
   my($cnt)=0;
   foreach (@content){
      $cnt++ if(s/^$uri(\s?)\s*$/#[SIROBOT: done] $uri$1/s);
   }
   msg_debug("removed $cnt links\n");
   return if($cnt==0);
   
   unless(open(FROMFILE,">".$fromfile)){
      msg_warn("couldn't write $fromfile: $!, \n");
      msg_warn("link $uri not removed.\n");
      return;
   }
   print FROMFILE (@content);
   close(FROMFILE);
}

# progress_update #fold00
sub progress_update{
   my($progress_bar_len)=30;
   my($job);
   my($progress);
   my($cnt,$ypos,$t,$t2);

   # begin status bar
   $ypos=$Curses::LINES-$maxjobs*2-1;
   $ypos=0 if($ypos<0);
   my($tmpstr)=" -=###  Statistics: ".
      scalar(@joblist_todo)." todo, ".
      scalar(@joblist_fetch)." fetch, ".
      scalar(@joblist_done)." done  ";
   curses_output_line($ypos++,0,$tmpstr.("#"x($Curses::COLS-length($tmpstr)-3))."=- ");
   
   
   
   for($cnt=0;$cnt<$maxjobs;$cnt++){
      unless(defined($job=$joblist_fetch[$cnt])){
         curses_output_line($ypos++,0,"");
         curses_output_line($ypos++,0,"");
         next;
      }
      curses_output_line($ypos++,0,$job->{'URI'});
      if($job->{'ProgressState'}!=0){
          my($state)=(
                      undef,                        # 0
                      'extracting image links',     # 1
                      'extracting links',           # 2
                      'converting links',           # 3
                      'filtering double links',     # 4
                      'executing external program', # 5
                     )[$job->{'ProgressState'}];
          curses_output_line($ypos++,0," "x(10)."($state)");
          next;
      }

      unless(defined($job->{'ProgressPos'})){
         curses_output_line($ypos++,0,"");
         next;
      }
      
      $progress="";
      
      # print progress bar
      if(defined($job->{'ProgressShould'})){
         $t=($job->{'ProgressPos'})/($job->{'ProgressShould'});
         $t2=int($t*$progress_bar_len);
         $progress="[".("#"x$t2);
         if($job->{'ProgressPos'}<$job->{'ProgressFile'}){
            $t=($job->{'ProgressFile'})/($job->{'ProgressShould'});
            $t2=int($t*$progress_bar_len)-$t2;
            $progress.=("."x$t2);
         }
         $progress.=(" "x($progress_bar_len-length($progress)+1))."]";
         $progress.=sprintf(" (%3i%%)",$t*100);
      }
      # print BPS
      $progress.=sprintf(" %6i KB",($job->{'ProgressPos'})/1024);
      if($job->{'ProgressBPS'}<1536){
         $progress.=sprintf("  (%i B/s)",$job->{'ProgressBPS'});
      }else{
         $progress.=sprintf("  (%.1i KB/s)",($job->{'ProgressBPS'})/1024);
      }
      
      if(defined($job->{'ProgressShould'}) &&
         defined($job->{'ProgressFile'}) &&
         defined($job->{'ProgressBPS'})){
          my($eta);
          if($job->{'ProgressBPS'}>0){
              $eta=int(($job->{'ProgressShould'}-$job->{'ProgressFile'})/$job->{'ProgressBPS'}+0.999);
              $progress.=sprintf("  ETA: %i:%02i:%02i",int($eta/3600),int($eta/60)%60,$eta%60);
          }
      }
      curses_output_line($ypos++,$Curses::COLS-$progress_bar_len-46,$progress);
   }
   Curses::refresh();
}

######################################################################### #FOLD00
# read_options_from_file #fold00
sub read_options_from_file{
   my($recursion)=shift;
   my($infilename)=shift;
   
   unless(defined($infilename)){
      msg_err("missing file argument\n");
      exit(1);
   }
   unless(open(INFILE,$infilename)){
      msg_err("can't read file $infilename: $!\n");
      exit(1);
   }
   my(@newargs)=();
   while(defined($_=<INFILE>)){
      chomp($_);
      next if(m/^\s*#/); # ignore comments
      next if(m/^\s*$/); # ignore empty lines
      foreach (split(/\s/,$_)){
         next unless(defined($_) && $_ ne ''); # skip spaces
         push(@newargs,$_);
      }
   }
   close(INFILE);
   parse_commands($recursion+1,$infilename,@newargs);
}

# parse_cmdline #fold00
sub parse_cmdline{
   my($ret)=parse_commands(0,undef,@ARGV);
   usage() if($ret==2);
   exit(1) if($ret>0);
   exit(0) if($ret==-1);
   
   if(scalar(@joblist_todo)==0 && $daemon==0){
      usage();
      exit(1);
   }
}

# parse_commands #fold00
sub parse_commands{
   my($recursion)=shift;
   my($fromfilename)=shift;
   
   if($recursion>10){
      msg_err("Encountered more than 10 recursions while including files! Abort.\n");
      return(1);
   }
   
   #
   # ######## preprocess arguments ######################################
   # (eg. convert short->long options, split --longopt=arg)
   #
   my($i)=0;
   my(@arg)=();
   while(defined($_=$_[$i++])){
      if($_ eq '-'){              # a single "-"?
         push(@arg,$_);           # yes -> pass it on
         next;
      }
      if(substr($_,0,1) ne '-'){  # an option?
         push(@arg,$_);           # no -> pass it on
         next;
      }
      if(substr($_,0,2) eq '--'){ # long option?
         my($addarg);
         $addarg=$2 if(s/^(.+?)=(.+)$/$1/);
         push(@arg,$_);
         push(@arg,$addarg) if(defined($addarg)); 
         next;
      }
      
      my($addarg); 
      # split additional args glued to this arg eg. -d2     
      $addarg=$2 if(s/^(-.)(.+)$/$1/);
      
      s/^-h$/--help/;
      s/^-t$/--tries/;
      s/^-d$/--depth/;
      s/^-V$/--version/;
      s/^-v$/--verbose/;
      s/^-s$/--split/;
      s/^-n$/--newer/;
      s/^-F$/--file/;
      s/^-f$/--force/;
      s/^-c$/--continue/;
      s/^-P$/--proxy/;
      s/^-p$/--prefix/;
      s/^-j$/--jobs/;
      s/^-H$/--header/;
      
      if(substr($_,0,2) ne '--'){  # short option not converted?
         msg_err("unrecognized option \"$_\"\n");
         return(1);
      }
      push(@arg,$_);
      push(@arg,$addarg) if(defined($addarg));
   }
   
   #
   # ###### the real argument processing ###########################
   #
   $i=0;
   while($i<@arg){
      $_=$arg[$i];
      if(substr($arg[$i],0,1) ne '-'){   # not an option, must be an URI
         my($origuri)=$arg[$i++];
         my($uri)=form_uri($origuri);
         unless(defined($uri)){
             $stats_links_unsupp_scheme++;
             next;
         }
         my(%privheaders)=%headers;
         my($fromfilename2);
         $fromfilename2=$fromfilename if(defined($fromfilename) && $removelinksfromfile);

         push(@joblist_todo,
              new_job_entry(
                            {'URI'        => $uri,
                            'Depth'       => $depth,
                            'Mode'        => $mode,
                            'Retries'     => $retries,
                            'MaxRetries'  => $retries,
                            'BaseURI'     => $uri,
                            'Prefix'      => $prefix,
                            'Exclude'     => $exclude,
                            'Headers'     => \%privheaders,
                            'Convert'     => $convert,
                            'FromFile'    => $fromfilename2,
                            'OriginalURI' => $origuri,
                            'Exec'        => $exec,
                            }));
         next;
      }
      
      if(substr($_,0,2) eq '--'){
         $_=substr($_,2);
         if($_ eq 'help'){
            usage_long();
            return(-1);
         }elsif($_ eq 'morehelp'){
            usage_long(1);
            return(-1);
         }elsif($_ eq 'examples'){
            usage_examples();
            return(-1);
         }elsif($_ eq 'log'){
            unless(defined($logfile=$arg[++$i])){
               msg_err("missing or wrong log argument\n");
               return(1);
            }
         }elsif($_ eq 'nolog'){
            undef $logfile;
         }elsif($_ eq 'dump'){
             $dump=1;
         }elsif($_ eq 'nodump'){
             $dump=0;
         }elsif($_ eq 'dumpfile'){
             unless(defined($dumpfile=$arg[++$i])){
                 msg_err("missing or wrong dumpfile argument\n");
               return(1);
             }
             if($dumpfile eq '-'){
                 undef $dumpfile;
             }
         }elsif($_ eq 'blocksize'){
             unless(defined($callback_blocksize=$arg[++$i])){
               msg_err("missing or wrong blocksize argument\n");
               return(1);
             }
             $callback_blocksize=8 if($callback_blocksize<8);
         }elsif($_ eq 'daemon'){
            $daemon=1;
         }elsif($_ eq 'nodaemon'){
            $daemon=0;
         }elsif($_ eq 'exec'){
             unless(defined($exec=$arg[++$i])){
                 msg_err("missing or wrong exec argument\n");
                 return(1);
             }
             undef $exec if($exec eq '');
         }elsif($_ eq 'pipe'){
            unless(defined($daemon_pipe=$arg[++$i])){
               msg_err("missing or wrong pipe argument\n");
               return(1);
            }
         }elsif($_ eq 'curses'){
            $use_curses=1;
         }elsif($_ eq 'nocurses'){
            $use_curses=0;
         }elsif($_ eq 'wait'){
            $wait_for_key=1;
         }elsif($_ eq 'nowait'){
            $wait_for_key=0;
         }elsif($_ eq 'stats'){
            $showstats=1;
         }elsif($_ eq 'nostats'){
            $showstats=0;
         }elsif($_ eq 'convert'){
            $convert=1;
         }elsif($_ eq 'noconvert'){
            $convert=0;
         }elsif($_ eq 'samedir'){
            $mode=0;
         }elsif($_ eq 'sameserver'){
            $mode=1;
         }elsif($_ eq 'samedomain'){
            $mode=2;
         }elsif($_ eq 'anyserver'){
            $mode=3;
         }elsif($_ eq 'tries'){
            unless(defined($retries=$arg[++$i]) && $retries>0){
               msg_err("missing or wrong retries argument\n");
               return(1);
            }
            $retries--;
         }elsif($_ eq 'header'){
            my($h,$v);
            unless(defined($h=$arg[++$i])){
               msg_err("missing header argument\n");
               return(1);
            }
            if($h eq '-'){
               undef %headers;
               next;
            }
            if($h=~s/^(.+)=(.+)$/$1/){
               $v=$2;
            }else{
               unless(defined($v=$arg[++$i])){
                  msg_err("missing header argument\n");
                  return(1);
               }
            }
            $headers{$h}=$v;
         }elsif($_ eq 'exclude'){
            my($tmp);
            unless(defined($arg[++$i])){
               msg_err("missing exclude argument\n");
               return(1);
            }
            foreach $tmp (split(/,/,$arg[$i])){
               if(defined($exclude)){
                  $exclude.=",-$tmp";
               }else{
                  $exclude="-$tmp";
               }
            }
         }elsif($_ eq 'include'){
            my($tmp);
            unless(defined($arg[++$i])){
               msg_err("missing include argument\n");
               return(1);
            }
            undef $exclude if($arg[$i] eq '');
            foreach $tmp (split(/,/,$arg[$i])){
               msg_debug("include: \"$tmp\"\n");
               if($tmp eq '.*' || $tmp eq '*' || $tmp eq ''){
                  undef $exclude;
                  next;
               }
               if(defined($exclude)){
                  $exclude.=",+$tmp";
               }else{
                  $exclude="+$tmp";
               }
            }
         }elsif($_ eq 'split'){
            $splitstream=1;
         }elsif($_ eq 'depth'){
            unless(defined($depth=$arg[++$i]) && $depth>=0){
               msg_err("missing or wrong depth argument\n");
               return(1);
            }
         }elsif($_ eq 'version'){
            version();
            return(-1);
         }elsif($_ eq 'prefix'){
            unless(defined($prefix=$arg[++$i])){
               msg_err("missing prefix argument\n");
               return(1);
            }
            $prefix=~s#([^/])$#$1/#; # append / if not given.
         }elsif($_ eq 'from'){
            unless(defined($from=$arg[++$i])){
               msg_err("missing from argument\n");
               return(1);
            }
         }elsif($_ eq 'jobs'){
            unless(defined($maxjobs=$arg[++$i]) && $maxjobs>0){
               msg_err("missing or wrong jobs argument\n");
               return(1);
            }
         }elsif($_ eq 'proxy'){
            unless(defined($proxy=$arg[++$i])){
               msg_err("missing proxy argument\n");
               return(1);
            }
            if($proxy eq '-'){
               # undef $proxy;
               $proxy='';
            }else{
               unless($proxy=~m#http:#i){
                  msg_err("invalid proxy argument (must begin with http:)\n");
                  return(1);
               }
            }
         }elsif($_ eq 'noproxy'){
            unless(defined($noproxy=$arg[++$i])){
               msg_err("missing noproxy argument\n");
               return(1);
            }
         }elsif($_ eq 'ftpproxy'){
            unless(defined($ftp_proxy=$arg[++$i])){
               msg_err("missing ftpproxy argument\n");
               return(1);
            }
            if($ftp_proxy eq '-'){
               # undef $ftp_proxy;
               $ftp_proxy='';
            }else{
               unless($proxy=~m#http:#i){
                  msg_err("invalid proxy argument (must begin with http:)\n");
                  return(1);
               }
            }
         }elsif($_ eq 'norobots'){
            undef $robot;
         }elsif($_ eq 'file'){
            read_options_from_file($recursion+1,$arg[++$i]);
         }elsif($_ eq 'remove'){
            $removelinksfromfile=1; 
         }elsif($_ eq 'noremove'){
            $removelinksfromfile=0; 
         }elsif($_ eq 'noclobber'){
            $overwrite_mode=0;
         }elsif($_ eq 'continue'){
            $overwrite_mode=1;
         }elsif($_ eq 'force'){
            $overwrite_mode=2;
         }elsif($_ eq 'newer'){
            $overwrite_mode=3;
         }elsif($_ eq 'quiet'){
            $verbose=-2;
         }elsif($_ eq 'silent'){
            $verbose=-1;
         }elsif($_ eq 'noverbose'){
            $verbose=0;
         }elsif($_ eq 'verbose'){
            $verbose=1;
         }elsif($_ eq 'debug'){
            $verbose=2;
         }else{
            msg_err("unknown option $arg[$i]\n");
            usage();
            return(1);
         }
      }else{
         msg_err("unknown option $arg[$i]\n");
         return(2);
      }
      
      $i++;
   }
   return(0);
}

# version #fold00
sub version{
   msg_normal("Sirobot $version ($versiondate)\n");
   msg_normal("  by Settel <settel\@sirlab.de>\n");
   msg_normal("  See http://www.sirlab.de/linux/\n");
}

# usage #fold00
sub usage{
   msg_normal("usage: $appname [options] <URL> [<URL>...] | -F <file>\n");
   msg_normal("  try $appname --help and $appname --morehelp for details\n");
}

# usage_long #fold00
sub usage_long{
   version();
   msg_normal("\n");
   my(@o);
   @o=(" -h, --help           print this helpscreen",
       " -V, --version        print out version and exit",
       " -t, --tries n        set number of attempts (eg. 1: try once; default: ".($retries+1).")",
       " -d, --depth n        set depth (default: $depth)",
       " -f, --force          force overwrite, if file exist",
       " -c, --continue       try to continue download, if file exist",
       "     --noclobber      leave existing files alone (default)",
       " -n, --newer          overwrite existing files only if newer",
       " -p, --prefix <dir>   save downloaded file in <dir>",
       "#-j, --jobs           number of jobs to run in parallel (default: $maxjobs)",
       "#-F, --file <file>    read options and URLs from file",
       "#    --remove         deactivate URLs in files (see --file) when fetched",
       "#    --noremove       opposite of --remove (default)",
       "#-P, --proxy <URL>    use <URL> as proxy for http requests",
       "#    --ftpproxy <URL> use <URL> as ftp proxy (no FTP requests without proxy!)",
       "#    --noproxy <list> a comma separated list of domain names",
       "     --samedir        fetch files only from the same directory",
       "     --sameserver     fetch files only from the same server",
       "     --samedomain     fetch files only from the same domain",
       "     --anyserver      no limits",
       "#    --convert        convert absolute links to relative ones if possible",
       "#    --noconvert      don't convert absolute links",
       "#    --blocksize <val> set size of chunks processed (default: $callback_blocksize)",
       "#    --exclude <patt> comma separated list of regexps(!) to exclude",
       "#    --include <patt> comma separated list of regexps(!) to include",
       "#-H, --header <h> <v> set HTTP header <h> to value <v> during a request",
       "#    --from <email>   set \"From:\" header (default: $from)",
       "#    --daemon         run sirobot in daemon mode (read from pipe, don't exit)",
       "#    --nodaemon       turn off daemon mode",
       "#    --pipe <file>    set name of pipe for daemon mode (default: $daemon_pipe)",
       "#    --stats          show statistics when done (default)",
       "#    --nostats        do not show statistics when done",
       "#    --wait           wait for a key after download is done".($wait_for_key?" (default)":""),
       "#    --nowait         do not wait for a key".($wait_for_key?"":" (default)"),
       "#-v, --verbose        be a bit more verbose, show joblist",
       "#    --noverbose      turn of verbose mode (default)",
       "#    --debug          be very verbose. For debugging purposes",
       "#    --silent         print errors only",
       "#    --quiet          absolutely no output",
       "#    --curses         use curses library for UI if available (default)",
       "#    --nocurses       don't use curses library",
       "#    --log <file>     write logging informations to <file>",
       "#    --nolog          do not create logfile (default)",
       "#    --norobots       ignore /robots.txt. USE WITH CARE!!!",
       "#    --dump           dump links to file/STDOUT ".($dump?" (default)":""),
       "#    --nodump         don't dump links to file ".($dump?"":" (default)"),,
       "#    --dumpfile <file> dump links to <file>, or STDOUT (default: ".($dumpfile||"-").")",
       "#    --exec <prg>     executes <prg> for every HTML-page fetched",
       "     --morehelp       print all available options",
       "     --examples       print usage examples");

   unless(defined($_[0])){
      msg_normal("These are the most important options. Use --morehelp to get a complete list.\n");
      msg_normal("\n");
   }
   
   foreach (@o){
      if(defined($_[0]) || substr($_,0,1) ne '#'){
         msg_normal(" ",substr($_,1),"\n");
      }
   }
}

# usage_examples #fold00
sub usage_examples{
   print(<<EOF);
usage examples:
   There's sometimes more than one way to do things. Possible (but not all)
   alternatives are given where appropriate. Some options can be given 
   in various ways. See manual page for details.
   
   Note: server is short for any kind of webservers eg. www.freshmeat.net

Fetch main page and it's images
   sirobot.pl http://server/
   
Fetch main page and it's images, even if they are located on other servers
(like most ad banners do)
   sirobot.pl --anyserver http://server/

Fetch main page only (that means, download only one file)
   sirobot.pl --depth 0 http://server/

Fetch main page, all links that point to the same server and images
   sirobot.pl --depth 2 http://server/
   
Same as above but don't fetch jpeg files
   sirobot.pl -d2 --exclude \'\.jpg$\' --exclude \'\.jpeg$\' http://server/
   sirobot.pl -d2 --exclude \'\.{jpg,jpeg}$\' http://server/
   sirobot.pl -d2 --exclude \'\.jpe?g$\' http://server/
   
Fetch HTML-Files and GIFs only
   siribot.pl -d2 --exclude \'.\' --include \'\.{html,gif}$\' http://server/
   

EOF
}

######################################################################### #FOLD00
# child: global vars #fold00
sub child_init{
   undef $child_job;
   $child_firstchunk=0;
   $child_stats_excluded=0;
   $child_stats_unsupp_scheme=0;
   $child_stats_rejected=0;

   undef $child_progress_len;  # size of file on server
   $child_progress_filelen=0;  # size of real file on disk
   $child_progress_pos=0;      # current pos in stream (pos<=filelen)
   $child_progress_bytes=0;    # number of bytes received from server (to calc throughput)
   undef $child_mtime;

   $SIG{'WINCH'}='IGNORE';     # don't exit if the user resizes our window
}

# child_do_job #fold00
sub child_do_job{
   my($job)=shift;
   my($fd)=shift;
   my($ua,$rules);
   my($uri)=$job->{'URI'};
   my($filename)=get_filename_from_job($job);
   my(%headers)=%{$job->{'Headers'}};
   my($retry)=0;
   $child_job=$job;  # used for callback
   
   unless(defined($filename)){
      my($text)="couldn't create filename from URI";
      
      if($job->{'RedirectCnt'}){
         $text="URI out of bounds due to redirects";
      }
      msg_warn($text.": ".$uri);
      return(-2,$text);
   }
   
   msg_debug("$uri -> $filename\n");
   
   $SIG{'INT'}='child_sigint_handler';
   $retry=1 unless($job->{'MaxRetries'}==$job->{'Retries'});
   
   if(defined($robot)){
      $ua=new LWP::RobotUA($botname,$from);
   }else{
      $ua=new LWP::UserAgent;
   }
   $ua->env_proxy();
   $ua->proxy('http',$proxy) if(defined($proxy));
   $ua->proxy('ftp',$ftp_proxy) if(defined($ftp_proxy));
   $ua->noproxy(split(/,/,$noproxy)) if(defined($noproxy));
   if(defined($robot)){
      $ua->delay(0);
      $ua->use_sleep(1);
   }

   if(-f $filename){
      msg_debug("retry: $retry; Retries: ",$job->{'Retries'},"; MaxRetries: ",$job->{'MaxRetries'},"\n");
      if($retry==1){     
         $child_progress_filelen=(stat($filename))[7];
      }else{ 
         if($overwrite_mode==0){     # don't overwrite
            msg_info("file $filename exists, skipping\n");
            return(1,"file exists, skipping");
         }elsif($overwrite_mode==1){ # continue
            $child_progress_filelen=(stat($filename))[7];
         }elsif($overwrite_mode==3){ # newer
            $child_mtime=(stat($filename))[9];
         }# else: force overwrite
      }
   }
   
   # set up headers
   my($http_headers)=HTTP::Headers->new();
   if($child_progress_filelen){
      $http_headers->header("Range"=>"bytes=$child_progress_filelen-");
   }
   foreach (keys(%headers)){
      $http_headers->header($_=>$headers{$_});
   }
   
   # ###If-Modified-Since###
   #if(defined($child_mtime) && $overwrite_mode==3){
   #    $http_headers->header("If-Modified-Since"=>time2str($child_mtime));
   #}
   
   my($request)=HTTP::Request->new("GET",$uri,$http_headers);
   
   my($response)=$ua->simple_request($request,\&child_get_callback,$callback_blocksize);
   close(CHILD_GET_FD);
   # $response=$ua->request($request,$filename);
   
   unless($response->is_success){
      my($code)=$response->code();
      if($code==301 || $code==302){
         my($new_loc)=$response->header('Location');
         unless(defined($new_loc)){
            msg_warn("Got HTTP error $code but no new Location:\n");
            msg_warn("$uri: ",$response->status_line,"\n");
            return(3,$response->status_line);
         }
         msg_debug("redirect: ",$job->{'URI'}," -> ",$response->header('Location'),"\n");
         return(5,$new_loc);
      }
      
      # HTTP-Code 304 means the page hasn't changed 
      # (answer to our "If-Modified-Since"-header)
      return 2 if($code==304 && $overwrite_mode==3);
      
      # unknown error/we don't handle this ("server error")
      msg_warn("$uri: ",$response->status_line,"\n");
      return (3,$response->status_line);
   }

   # download complete (==file has rigth size)?
   if(defined($response->header('Content-Length')) && $overwrite_mode!=3){
      my($length)=$response->header('Content-Length');
      $length+=child_get_content_start($response); # correct length with respect to Content-Range
      if($length>$child_progress_filelen){
         msg_warn("file too short! Should: $length; is: $child_progress_filelen\n");
         return(4,"should: $length; is: $child_progress_filelen; retries: ".$job->{'Retries'}."\n");
      }elsif($length<$child_progress_filelen){ # might happen if file on server was updated meanwhile
         msg_warn("file too long! Should: $length; is: $child_progress_filelen\n");
         return(4,"should: $length; is: $child_progress_filelen; retries: ".$job->{'Retries'}."\n");
      }
      print $fd ("LEN $length\n");
   }
   
   my($retval)=0;
   
   if($overwrite_mode==3 && 
                     defined($job->{'LastModified'}) &&
                     defined($child_mtime) &&                     
                     $job->{'LastModified'}==$child_mtime){
      $retval=2;
   }
   
   msg_debug("Content-Type: ".$response->header('Content-Type')."\n");
   $retval=$retval?7:6 if($response->header('Content-Type')=~m#^text/html#);

   return $retval;
}

# child_exec #fold00
sub child_exec{
    my($job)=shift;
    my($fd)=shift;
    
    return 0 unless(defined($job->{'Exec'}));
    print $fd ("PROGRESSSTATE exec\n");
    
    unless(open(CMD,$job->{'Exec'}." ".
                   $job->{'URI'}." ".
                   get_filename_from_job($job)." ".
                   $job->{'Depth'}." ".
                   "|")){
        return(1,"can't exec program: $!");
    }

    my($read);
    while(defined($read=<CMD>)){
        chomp($read);
        if($read=~m/^ECHO\s?(.*)$/){
            msg_normal("$1\n");
        }
    }
    close(CMD);
    return 0;
}

# child_get_links #fold00
sub child_get_links{
   my($job)=shift;
   my($fd)=shift;
   my($uri)=$job->{'URI'};
   my($filename)=get_filename_from_job($job);
   my($depth)=$job->{'Depth'}-1;
   my($baseuri)=$uri;
   my($mode2)=$job->{'Mode'};
   my($exclude)=$job->{'Exclude'};
   
   return 0 if($depth<0);  # nothing to do
   
   unless(open(INFILE,$filename)){
      return (1,"can't open file $filename $!\n");
   }
   my($page)="";
   while(defined($_=<INFILE>)){
      tr/\n\r/  /;
      $page.=$_;
   }
   close(INFILE);
   $page=~s/<!\s*--.*?--\s*>//g;  # remove comments
   
   # fetch images
   print $fd ("PROGRESSSTATE images\n");
   foreach ($page=~m/<\s*img [^>]*?src=([^>\s]+).*?>/ig,
            $page=~m/<[^>]*background=([^>\s]+).*?>/ig,
            $page=~m/<\s*input[^>]*?type=["']?image["'][^>]*?src=([^>\s]+).*?>/ig,
            $page=~m/<\s*input*?src=([^>\s]+)[^>]*?type=["']?image["'][^>].*?>/ig
           ){
      s/^\"(.*)\"$/$1/;
      s/^\'(.*)\'$/$1/;   # some @*!#&$% use single quotes instead of double quotes
      $_=form_uri($_,$baseuri);
      unless(defined($_)){
          $child_stats_unsupp_scheme++;
          next;
      }
      my($ret)=child_is_link_ok($_,$baseuri,$mode2,undef,$exclude);
      print $fd ("LINK $_\n") if($ret==0);
   }
   
   # fetch links
   print $fd ("PROGRESSSTATE links\n");
   if($depth>0){
      foreach ($page=~m/<\s*a [^>]*?href=([^>\s]+).*?>/ig,
              $page=~m/<\s*frame [^>]*?src=([^>\s]+).*?>/ig){
         s/^\"(.*)\"$/$1/;
         s/^\'(.*)\'$/$1/; # some @*!#&$% use single quotes instead of double qoutes
         $_=form_uri($_,$baseuri);
         unless(defined($_)){
             $child_stats_unsupp_scheme++;
             next;
         }
         my($ret)=child_is_link_ok($_,$baseuri,$mode2,$job->{'Prefix'},$exclude);
         print $fd ("LINK $_ $depth\n") if($ret==0);
      }
   }
   return 0;
}


# child_is_link_ok #fold00
sub child_is_link_ok{
   my($uri)=shift;
   my($baseuri)=shift;
   my($mode2)=shift;
   my($myprefix)=shift;
   my($exclude)=shift;
   
   # Return 0 if link is ok
   # Return 1 if link is not ok (wrong server/path/domain etc)
   # Return 2 if link is not ok (wrong scheme)
   # Return 3 if link is not ok (excluded)
   
   
   unless(defined($uri) && ($uri=~m/^https?:/i 
                                || $uri=~m/^ftp:/i)){
      $child_stats_unsupp_scheme++;
      return 2;
   }
   
   if(defined(get_filename_from_uri($_,$baseuri,$mode2,$myprefix))){
      return 0 unless(defined($exclude)); 
                                          
      # do pattern matching to find out if file is excluded
      my($flag)=0; # by default, include all files
      my($tmp,$tmp2);
      foreach $tmp (split(/,/,$exclude)){
         $tmp2=substr($tmp,1);
         if($uri=~m,$tmp2,){
            if(substr($tmp,0,1) eq '-'){
               $flag=3;
            }else{
               $flag=0;
            }
         }
      }
      msg_debug("link $uri rejected due to exclude list\n") if($flag==3);
      $child_stats_excluded++ if($flag==3);
      return $flag;
   }
   $child_stats_rejected++;
   return 1;
}


# child_convert_links #fold00
sub child_convert_links{
   my($uri,$mode,$baseuri,$prefix)=(@_)[0,1,2,3];;
   my($filename)=get_filename_from_uri($uri,$baseuri,$mode,$prefix);
   my($path)=$filename;
   $path=~s#/[^/]+$#/#;
   my($converted_flag)=0; # nothing converted so far
   
   # read file into memory
   unless(open(INFILE,$filename)){
      msg_warn("couldn't open $filename to convert links: $!\n");
      return;
   }
   my($page)="";
   $page.=$_ while(<INFILE>);
   close(INFILE);
   
   my($newpage)="";
   my($link,$link2);
   my($pattern);
   foreach $pattern ('\s*a\W[^>]*href=',
                     '\s*frame\W[^>]*src=',
                     '\s*img\W[^>]*src=',
                     '[^>]*background=',
                     '\s*input[^>]*?type=["\']?image["\'][^>]*?src=',
                    ){
      $newpage="";
      while($page=~s#^(.*?<$pattern)([^>\s]+)##si){
         $newpage.=$1;
         $link=$2;
         $link=~s#^\"(.*)\"$#$1#; # remove quotes from link
         $link=~s#^\'(.*)\'$#$1#; # some @*!#&$% use single quotes instead of double quotes
         unless(defined($link2=form_uri($link,$uri))){ # make absolute
            $newpage.="\"$link\""; # at least put quotes around it
            next;
         }
         unless(defined($link2=get_filename_from_uri(
                                                     $link2,
                                                     $baseuri,
                                                     $mode,
                                                     $prefix))){
            $newpage.="\"$link\""; # link points to external reference
            next;
         }
         my($newlink)=child_convert_links_make_relative($link2,$path);
         $newlink=URI::Escape::uri_escape($newlink);

         $newpage.="\"$newlink\"";
         $converted_flag=1 if($newlink ne $link);
      }
      $page=$newpage.$page;
   }
   
   return unless($converted_flag); # quit if there was nothing to convert
   # write back converted file
   unless(open(OUTFILE,">".$filename)){
      msg_warn("couldn't write $filename while converting links: $!\n");
      return;
   }
   print OUTFILE ($page);
   close(OUTFILE);
   
   msg_debug("child converted links\n");
}

# child_convert_links_make_relative #fold00
sub child_convert_links_make_relative{
   my($linkfilename)=shift;
   my($origpath)=shift;
   my($linkpath)=$linkfilename;
   my($newpath)="";
   $linkpath=~s#/[^/]+$#/#;
   $linkfilename=~s#^.*/([^/]+)#$1#;
   
   my(@origfolders)=(split(/\//,$origpath));
   my(@linkfolders)=(split(/\//,$linkpath));
   while(@origfolders && @linkfolders && $origfolders[0] eq $linkfolders[0]){
      shift(@origfolders);
      shift(@linkfolders);
   }
   $newpath="";
   $newpath.="../" while(defined(shift(@origfolders)));
   $newpath.=join('/',@linkfolders)."/" if(@linkfolders);
   return($newpath.$linkfilename);
}

# child_sigint_handler #fold00
sub child_sigint_handler{
   msg_warn("(child) sigint received, exiting\n");
   cleanup();
   exit(1);
}

# child_get_callback #fold00
sub child_get_callback{
   my($in,$r)=@_;  # Input, Response, [Protocol]
   
   $child_progress_bytes+=length($in);
   
   msg_debug("######## callback\n");
   if($child_firstchunk==0){
      $child_firstchunk=1;
      
      # this if is executed when we receive the first portion of data.
      
      msg_debug("got first chunk\n");
      
      # continue or overwrite?
      my($opentype)=">";
      $opentype=">>" if($child_progress_filelen>0);

      my($mtime);
      # get modification time
      if(defined($r->header("Last-Modified"))){
         $mtime=str2time($r->header("Last-Modified"));
         $child_job->{'LastModified'}=$mtime if(defined($mtime));
      }
      
      # do all the mtime checks to detect if the file has changed (if overwrite_mode==3)
      if($overwrite_mode==3){
         if(defined($child_mtime)){
            if(!defined($mtime)){
               msg_debug("didn't get mtime from server, aborting\n");
               die("didn't get mtime from server, aborting\n");
            }
            if($child_mtime==$mtime){
               msg_debug("file unchanged, aborting\n");
               die("file unchanged, aborting\n");
            }
            msg_debug("file on server is newer, updating\n");
         }
         $opentype=">";
      }
      my($filename)=get_filename_from_job($child_job);
      if(make_path_to_file($filename)){
         die("$filename: $!\n");
      }
      unless(open(CHILD_GET_FD,$opentype.$filename)){
         msg_err("couldn't create $filename: $!\n");
         return(-1);
      }
      
      # now check how big the file is and where we start
      unless(defined($child_progress_len)){
         $child_progress_len=child_get_content_length($r);
      }
      $child_progress_pos=child_get_content_start($r);
      
      if(defined($child_progress_len)){
         msg_debug("proceeding to $child_progress_pos of $child_progress_len\n");
         if($child_progress_len<=$child_progress_filelen){
            msg_debug("file complete, aborting\n");
            die("file complete, aborting\n");
         }
      }
   }
   
   ##################
   # Normal retrieval
   
   my($oldpos)=$child_progress_pos;
   $child_progress_pos+=length($in);
   
   if($child_progress_pos>$child_progress_filelen){
      if($child_progress_filelen>$oldpos){
         # write portions of the file (--continue)
         $in=substr($in,$child_progress_pos-$child_progress_filelen);
         msg_debug("skipping(1) ",$child_progress_pos-$child_progress_filelen," bytes\n");
      }
      # write to file
      $child_progress_filelen+=length($in);
      print CHILD_GET_FD ("$in");
   }else{
      msg_debug("skipping(2) ",$child_progress_pos-$oldpos," bytes\n");
   }
   
   # send progress informations to parent
   my($txt)="PROGRESS $child_progress_pos $child_progress_filelen ";
   if(defined($child_progress_len)){
      $txt.="$child_progress_len ";
   }else{
      $txt.="-1 ";
   }
   my($t)=time()-($child_job->{'Time'});
   $txt.=int($child_progress_bytes/($t+0.5));
   print ${$child_job->{'Pipe'}} ("$txt\n");
}

# child_get_content_start #fold00
sub child_get_content_start{
   my($r)=shift;
   my($tmp);
   return 0 unless(defined($tmp=$r->header("Content-Range")));
   return 0 unless($tmp=~m#bytes (\d+)-\d+/\d+#);
   return $1;
}

# child_get_content_end #fold00
sub child_get_content_end{
   my($r)=shift;
   my($tmp);
   return 0 unless(defined($tmp=$r->header("Content-Range")));
   return 0 unless($tmp=~m#bytes \d+-(\d+)/\d+#);
   return $1;
}


# child_get_content_length #fold00
sub child_get_content_length{
   my($r)=shift;
   my($tmp);
   if(defined($tmp=$r->header("Content-Range"))){
      if($tmp=~m#bytes \d+-\d+/(\d+)#){
         return($1);
      }
   }
   return($r->header("Content-Length"));
}
######################################################################### #FOLD00
# plural_s #fold00
sub plural_s{
   my($a)=shift;
   return "" unless(defined($a));
   return "" if($a==1);
   return "s";
}

# form_uri #fold00
###
# kind of normalize URIs
#
sub form_uri{
    my($u)=shift;
    my($baseuri)=shift;
    my($uri);
    
    if(defined($baseuri)){
        $uri=URI->new_abs($u,$baseuri)->canonical;
    }else{
        $uri=URI->new($u)->canonical;
    }
    $uri=~s/#.*$//;              # remove fragment
    
    if($uri=~m/^(\w+):/){
        my($scheme)=lc($1);
        unless($scheme eq 'http' || $scheme eq 'ftp' || $scheme eq 'https'){
            msg_warn("unsupported scheme \"$1\" in URI $uri\n");
            return;
        }
    }else{
        msg_warn("no scheme given for $u, I asume http:\n");
        $uri.="/" if($uri!~m#/#);                    # append / if missing
        $uri="//".$uri if(substr($uri,0,2) ne '//'); # prepend // if missing
        $uri="http:".$uri;                           # prepend missing scheme
    }
    return URI->new($uri);
}

# get_filename_from_job #fold00
sub get_filename_from_job{
   my($job)=shift;
   return get_filename_from_uri(
                                $job->{'URI'},
                                $job->{'BaseURI'},
                                $job->{'Mode'},
                                $job->{'Prefix'},
                               );
}


# get_filename_from_uri #fold00
#  create filename out of a given URI
#  > uri,baseuri,mode,prefix
#  < filename
sub get_filename_from_uri{
   my($uri,$baseuri)=(@_)[0,1];
   my($mode2)=$_[2];
   my($myprefix)=$_[3];
   $mode2=$mode unless(defined($mode2));
   $myprefix=$prefix unless(defined($myprefix));

   my($basehost)=$baseuri->host();
   my($basepath)=$baseuri->path_query();
   my($host)=$uri->host();
   my($path)=$uri->path_query();

   # append port number to anything else than port 80 to avoid confusions,
   # eg. between http://localhost/ and ftp://localhost/
   $basehost=$baseuri->host_port() if($baseuri->port()!=80);
   $host=$uri->host_port() if($uri->port()!=80);
   
   #$path=URI::Escape::uri_unescape($path);
   #$basepath=URI::Escape::uri_unescape($basepath);
   $path=URI::Escape::uri_escape($path,'~');
   $basepath=URI::Escape::uri_escape($basepath,'~');

   $basepath=~s#/[^/]*$#/#;            # plain path, without file
   $path=~s#/$#/$default_index_name#;  # if no filename given, supply default
   
   my($filename);
   if($mode2==0){
      unless($basehost eq $host){
         # msg_debug("different hostnames; BaseDoc: $basedoc, URL: $uri\n");
         return undef;
      }
      unless($basepath eq substr($path,0,length($basepath))){
         # msg_debug("different paths; BasePath: $basepath, Path: $path\n");
         return undef;
      }
      $filename=substr($path,length($basepath));
   }elsif($mode2==1){
      unless($basehost eq $host){
         # msg_debug("different hostnames; BaseDoc: $basedoc, URL: $uri\n");
         return undef;
      }
      $filename=substr($path,1);  # skip leading /
   }elsif($mode2==3 || $mode2==2){
      $filename=$host.$path;
   }else{
      msg_err("unknown mode $mode2\n");
      return undef;
   }
   return $myprefix.$filename;
}

# make_path_to_file #fold00
sub make_path_to_file{
   my($file)=shift;
   my($path)="";
   
   while($file=~m#^([^/]*/)(.*)$#){
      $path.=$1;
      $file=$2;
      unless(-d $path){
         msg_debug("creating dir $path\n");
         unless(mkdir($path,0777)){
            msg_err("can't create directory $path: $!\n");
            return 1;
         }
      }
   }
   
   return 0;
}

# msg_err #fold00
sub msg_err{
   output("*** ERROR[$$]: ",@_) if($verbose>=-1);
}

# msg_warn #fold00
sub msg_warn{
   output("WARNING[$$]: ",@_) if($verbose>=0 && $curses_child_nooutput==0);
}

# msg_info #fold00
sub msg_info{
   output(@_) if($verbose>=0 && $curses_child_nooutput==0);
}

# msg_stats #fold00
sub msg_stats{
   output("Stats: ",@_) if($verbose>=1 && $curses_child_nooutput==0);
}

# msg_debug #fold00
sub msg_debug{
   output("DEBUG[$$]: ",@_) if($verbose>=2 && $curses_child_nooutput==0);
}

# msg_normal #fold00
sub msg_normal{
   output(@_) if($curses_child_nooutput==0);
}

# output  #fold00
sub output{
   if(defined($curses)){
      my($str);
      foreach $str (split(/\n/,join('',@_))){
         if(++$curses_y>=$Curses::LINES-$maxjobs*2-1){
            $curses_y=$Curses::LINES-$maxjobs*2-2;
            Curses::move(0,0);
            Curses::deleteln();
            Curses::move($curses_y,0);
            Curses::insertln();
         }
         curses_output_line($curses_y,0,join('',$str));
      }
      Curses::refresh();
   }else{
      print(@_);
   }
   print $log_fd (@_) if(defined($log_fd));
}

# curses_output_line #fold00
sub curses_output_line{
   my($y,$x,$str)=@_;
   
   if(length($str)>$Curses::COLS){
      $str=substr($str,0,$Curses::COLS);
   }
   $str.=" "x($Curses::COLS-length($str));
   
   Curses::addstr($y,$x,$str);
}

######################################################################### #FOLD00
# enum child_return #fold00
# -3: can't create path to file
# -2: no filename
# -1: can't create filename
#  0: ok
#  1: file exists, skipping
#  2: file unchanged
#  3: server error
#  4: file size differs from Content-Length
#  5: redirect
#  6: ok, extract links
#  7: file updated, extract links
#

# enum mode #fold00
# 0: same directory
# 1: same server
# 2: same domain
# 3: any server

# enum overwrite_mode #fold00
# 0: don't overwrite
# 1: continue
# 2: force overwrite
# 3: overwrite if newer, continue if file length differs

# enum state #fold00
# 0: not fetched so far
# 1: currently fetching
# 2: successful fetched
# 3: unsuccessful fetched (internal error)
# 4: incomplete fetch (lost connection during download etc)
# 5: not found (404).
# 6: file exists (skipped)
# 7: redirect
#
# enum child_status #fold00
# 0: successfully fetched
# 1: file exists, skipping
# 2: could not create path to file (message contains filename)
# 3: fetch error (message contains response line)

# enum verbose #fold00
# -2: absolutely nothing
# -1: errors only
#  0: normal reporting: errors, warnings and URLs to fetch
#  1: +stats
#  2: +debug

# enum depth #fold00
# 0: this URL only
# 1: this URL + images
# 2: this URL + images + links + images of links
# 3 ...

# enum ProgressState #fold00
# 0: waiting, downloading or done.
# 1: extracting images
# 2: extracting links
# 3: converting links
# 4: filtering double links
# 5: exec external program
#

# struct job #fold00
#
# URI        - URI of the job
# Retries    - # of retries to go (undef for unlimited, 0 for no retries)
# MaxRetries - # of retries for children (-> upon init Retries=MaxRetries)
# Depth      - 0=only this URL, 1=this URL and it's images, 2=this URL+links+images of all, ...
# Status     - enum status
# Statustext - textual representation of status or reason of failure or undef
# BaseURI    - URI of base document (for option --samedir and --sameserver)
# Mode       - enum mode
# Prefix     - Prefix for URL
# Time       - time (as reported by time()) when the job was started (used to calculate throughput)
# Pipe       - Pipe for communication child -> parent 
# Exec       - Execute this program whenever a page was fetched
# Exclude    - String of comma separated regexps to exclude(-)/include(+)
# Headers    - Pointer to an associative array of header/value pairs
# Piperead   - (Parent only) everything read from the pipe
# Convert    - convert absolute links to relative ones if possible
# FromFile   - name of file (option --file) from which this link is coming
# LastModified   - Time of last modification as reported from server (converted to Unix time!) or undef
# RedirectCnt    - Counter, how many redirects this URI got (0 if undef) to detect loops 
# OriginalURI    - contains the URI of the original request as issued in cmdline or file (for --remove)
# ProgressPos    - How much bytes we've already retrieved
# ProgressFile   - How big the file actually is (might be >=ProgressPos)
# ProgressShould - How big the file should be or undef if we don't know
# ProgressBPS    - Bytes Per Second
# ProgressState  - What we're doing right now (see "enum ProgressState")
#

######################################################################### #FOLD00
# ## documentation ## #fold00

__END__

#   NAME #fold01

=head1 NAME

Sirobot - a web fetch tool similar to wget

=cut

#   SYNOPSIS #fold01

=head1 SYNOPSIS

 sirobot.pl [options] <URL> [[options] <URL>...]

=cut

#   DESCRIPTOPN #fold01

=head1 DESCRIPTION

Sirobot is a web fetch tool. It's implemented in Perl 5 and runs from a
command line.

Sirobot takes URLs as arguments and is able to download them as well as 
all given images and links in those HTML files recursively, too. 

The main advantage over other tools like GNU wget is the ability to 
fetch several files concurrently which effectively speeds up your download.

=cut

#   USAGE #fold01

=head1 USAGE

Call Sirobot (the executable is called C<sirobot.pl>) with at least one 
URL (see L<"URL">) as an argument or specify a file to read URLs from 
(option C<--file <file>>, see L<"OPTIONS">). 
If it can't find any URLs, a short usage advice is displayed and 
Sirobot quits. 

There are various possibilities to influence Sirobot's behaviour such 
as how deep it should crawl into a WWW tree. 

Sirobot tries to figure out which proxy to use. Therefor it looks for the
environment variables C<$http_proxy> and C<$ftp_proxy>. You can always set 
the proxy configuration manually (see C<--proxy> and C<--ftpproxy>).

Often used options may be put into F<~/.sirobotrc>. This file
is processed upon startup before any command line option is read.
This is done similar to the C<--file> command (see below) so the 
syntax is the same as describe there. 

See also L<"EXAMPLES"> for a rather useful example.

=cut

#   URL #fold01

=head1 URL

(If you are familiar with the usage of URLs you may skip this section)

A correct URL may looks like this:

   http://server/path/to/index.html    # Standard URL
   http://server/file?query            # Standard URL with query
   http://server/file#frag             # Standard URL with fragment
   
If you need to access a webserver at another port instead of the commonly 
used port 80 (default), try this (example accesses port 1234):

   http://server:1234/

Some pages are protected by passwords. Sirobot can access these pages, too
but it needs a username and password from you. The following example
takes "honestguy" as username and "secret" as password:

   http://honestguy:secret@server/

It works the same for FTP.

B<Note>: If you get a strange message about a missing method while using 
password authentication try updating your libwww-perl and/or URI libraries.
See F<INSTALL> for where to get them.

=cut

#   ## OPTIONS ## #fold01
#     OPTIONS #fold02

=head1 OPTIONS 

(See L<"EXAMPLES"> for how to use them)

Sirobot's behaviour can be influenced in a lot of different ways to better
fit your needs. 

You can see a short summary of available options by simply running

 sirobot.pl --help      (displays summary of frequently used options)
 sirobot.pl --morehelp  (displays summary of ALL available options)  
 sirobot.pl --examples  (displays some examples how to use Sirobot)

Please don't get confused by so many options, you surely
do not need them all :-)) If you don't know where to start, run
C<sirobot.pl --help> and check out the commands displayed there.


Many arguments like C<--depth>, C<--samedomain> or C<--exclude> remain 
active for all remaining URLs unless other commands overwrites them. 
Some arguments take an additional value (eg. C<--depth> takes a number). 

B<Note>: the following notations are all the same and internally converted 
to the first version.

  --depth 1
  --depth=1
  -d 1          (only available for short options)
  -d1           (only available for short options)

=cut 

#     Informative options  #fold02

=head2 Informative options

=over 4

=item -h

=item --help

Print a helpscreen with the most important options along
with a short explanaition and quit. 

See also C<--morehelp>

=for html <br><br>

=item --morehelp

Print a list with all available options along with a short
explanaition and quit. 

See also C<--help>

=for html <br><br>

=item -V

=item --version

Print version + build date and quit. 

=for html <br><br>

=back

=cut

#     Control verbosity #fold02

=head2 Control verbosity

B<Note>: The following options are mutually exclusive which means every
option lasts until overwritten by another.

=over 4

=item --debug 

Be incredibly verbose. Useful for debugging (who guessed that? ;-)). 
If you want to debug the child processes, too, also add C<--nocurses> to 
your commandline.

See also C<--verbose>, C<--silent> and C<--quiet>.

=for html <br><br>

=item --nostats

Don't show statistics when all downloads are done.

See also C<--stats>.

=for html <br><br>

=item --quiet

Absolutely no output (not even on errors).

See also C<--verbose>, C<--debug> and C<--silent>.

=for html <br><br>

=item --silent

Print errors only.

See also C<--quiet>, C<--verbose> and C<--debug>.

=for html <br><br>

=item --stats

Show some statistics when all downloads are done (default).

See also C<--nostats>.

=for html <br><br>

=item -v

=item --verbose

Be a bit more verbose during operation and print statistics if done.

See also C<--quiet>, C<--silent> and C<--debug>.

=for html <br><br>

=back

=cut

#     Use curses library for user interface #fold02

=head2 Use curses library for user interface

B<Note>: The following options are mutually exclusive which means every
option lasts until overwritten by another.

=over 4

=item --curses

Use the curses library for the user interface (UI) if it is available 
(default). It will be used to improve readability of statistics etc.
The drawback is a slightly worse performance if you download a lot of small 
files because of the many screenupdates.

If curses cannot be used (eg. if stdout is not a tty), the "old" interface
will be used.

See also C<--nocurses>.

=for html <br><br>

=item --nocurses

Do not use the curses library. Everything will be printed out as-is.
You may want to use this option to turn off warning messages in case 
you don't have the lib installed.

See also C<--curses>.

=for html <br><br>

=back

=cut

#     Control behaviour if files already exists #fold02

=head2 Control behaviour if files already exist.

B<Note>: The following options are global and mutually exclusive which means
only the last of the given options is active.

=over 4

=item -c

=item --continue 

Continue download if file already exists. This is nearly the same
as C<--tries> (see there for limitations) except the fact that
C<--continue> works even if the (incomplete) file was fetched with
another tool.

See also C<--force> and C<--noclobber>.

=for html <br><br>

=item -f

=item --force

If a file already exists on your harddisc, overwrite it without asking.

See also C<--continue>, C<--newer> and C<--noclobber>.

=for html <br><br>

=item --noclobber

Don't touch any existing files but skip this link (default).

See also C<--force>, C<--newer> and C<--continue>.

=for html <br><br>

=item -n

=item --newer

Overwrite existing files only if newer. This feature utilizes the 
modification time of the file and requires the Last-Modified HTTP header 
set by the server, otherwise it behaves like C<--noclobber>.

See also C<--force>, C<--noclobber> and C<--continue>.

=for html <br><br>

=back

=cut 

#     Limit from where files will be fetched

=head2 Limit from where file will be fetched

B<Note>: The following options are mutually exclusive which means every
option lasts until overwritten by another.

B<Note>: These options also affect in which subdirectory the files are stored.

=over 4

=item --anyserver

Upon recursive download, fetch all links, whereever they're pointing
to. Use with care!

See also C<--samedir>, C<--samedomain>, C<--sameserver> and C<--depth>.

=for html <br><br>

=item --samedir

Upon recursive download, only fetch those links pointing to the
same directory or any subdirectories as the specified URL.
This is the default operation.

See also C<--sameserver>, C<--samedomain>, C<--anyserver> and C<--depth>.

=for html <br><br>

=item --samedomain

Upon recursive download, only fetch those links pointing to the
same domain as the specified URL.

See also C<--samedir>, C<--sameserver>, C<--anyserver> and C<--depth>.

=for html <br><br>

=item --sameserver

Upon recursive download, only fetch those links pointing to the
same server as the specified URL.

See also C<--samedir>, C<--samedomain>, C<--anyserver> and C<--depth>.

=for html <br><br>

=back

=cut 

#     Limit which files will be fetched #fold02

=head2 Limit which files will be fetched

B<Note>: The following options can be mixed and each option may overwrite
the preceeding one partially or completly.

=over 4

=item --exclude <regexp>

Do not download files recursivly that match a comma separated list 
of regular expressions. By default, all files are allowed.
          
Everything Perl provides as regular expressions can be used
for <regexp>, it will be directly converted to the Perl
statement m/<regexp>/; Here are the main facts:

=over 3

=item * 

Letters and digits match as-is (case sensitive matching!).
C<ba> matches C<bad> and C<alban> but not C<bla>.

=item * 

and period (".") matches any single character
C<h.llo> matches C<hallo> and C<hello>.

=item * 

an asterisk ("*") matches any number of repetitions
(including none) of the character in front of it
C<xa*ba> matches C<xaba>, C<xaaba>, C<xaaaaaba> and even C<xba>.

=item * 

a C<^> at the beginning denotes the start of a line
C<^here> matches only if here appears at the beginning
of a line. Therefor it never matches there.

=item *

a C<$> at the end denotes the end.
C<gif$> matches any file that ends on gif.

=item * 

C<$>, C<.>, C<^>, brackets among others must be escaped by a
backslash (C<\>) eg C<\$>

=back

See C<man perlre> for even more stuff and L<"EXAMPLES>.
          
You may enter several C<--exclude> and mix them with C<--include>.
If you want to allow only particular files, try this
combination:

C<--exclude . --include <regexp>>

which will disallow all files (a dot matches any string with at least
one character) and re-allow files matching <regexp>.
 
The default can be restored by inserting C<--include .>.
B<Note>: when entered as a shell command, the regexp should be quoted: 
C<--include '.*'>.

See also C<--include>.

=for html <br><br>

=item --include <regexp>

Allow downloading files recursivly that match a comma separated list of 
regular expressions. You may specify enter several C<--include> and mix 
them with C<--exclude>.
By default, all files are allowed.
See C<--exclude> for more informations.

=for html <br><br>

=back

=cut

#     Manual proxy configuration #fold02

=head2 Manual proxy configuration

B<Note>: Sirobot first reads the environment variables C<$http_proxy>, 
C<$ftp_proxy> and C<$no_proxy> to figure out your system's the default 
settings.

B<Note>: These settings are global for all URLs to fetch. Commandline
options override environment settings.

=over 4

=item --ftpproxy <FTPPROXYURL>

Use <FTPPROXYURL> for all FTP connections. ("-" will unset).
Sirobot can't access FTP sites directly but always needs a proxy
that translates between HTTP and FTP for it (most proxies are
able to do that).

See also C<--proxy> and C<--noproxy>.

=for html <br><br>

=item --noproxy <DOMAIN>,<DOMAIN>,...

A comma separated list of domains which will be accessed without
a proxy.

See also C<--proxy> and C<--ftpproxy>.

=for html <br><br>

=item -P <PROXYURL>

=item --proxy <PROXYURL>

Use <PROXYURL> as a proxy for all HTTP requests. ("-" will unset).

See also C<--ftpproxy> and C<--noproxy>.

=for html <br><br>

=back

=cut

#     Convert pages #fold02

=head2 Convert pages

B<Note>: The following options are mutually exclusive which means every
option lasts until overwritten by another.

=over 4

=item --convert

Sirobot can be asked to convert all links HTML-files from absolute
to relative. Useful for sites that use a lot of absolute links
(eg. Slashdot) which you cannot view directly.
Please note that the options C<--anyserver>, C<--sameserver>,
C<--samedomain> and C<--samedir> affect the decision which links to 
actually convert and which not because they affect in which folder the 
files are actually stored.

See also C<--noconvert>.
          
=for html <br><br>
          
=item --noconvert

Turn conversion feature off (default).

See also C<--convert>.

=for html <br><br>

=back

=cut

#     Read URLs and additional arguments from file #fold02

=head2 Read URLs and additional arguments from file

B<Note>: The following options are mutually exclusive which means every
option lasts until overwritten by another.

=over 4

=item -F <file>

=item --file <file>

Read additional options and URLs from the given file.
<file> may contain multiple lines. Lines starting with # will be ignored.

B<Note>: Althought it is possible to have multiple arguments per line, 
using one line per argument is strongly recommended.

All arguments read from the file are processed as if they have been 
entered in the command line. That means the same syntax applies but 
remember you must not escape special shell characters or use quotes. This 
also implies you can't have spaces as a part of an argument or empty 
arguments at all (really need that? Write me!)

See also L<"EXAMPLES">.

=for html <br><br>

=item --noremove

Turn off the C<--remove> feature (default).

See also C<--remove>.

=for html <br><br>

=item --remove

This option only makes sense in combination with one or more URLs
read from a file (see C<--file>). After the URL has been
downloaded successfully, it is deactivated in the file it came
from. C<--remove> is useful to better keep track of which files
are already fetched and which are not.

Deactivation of a link is done by prepending a C<#[SIROBOT: done]>
to the line that contains the link.

In order to perform the work correctly it is necessary to have only
one link per line (and only the link, no options in the same line, put 
them in a separate line before the link).

This flag is inteded to be used in combination with C<--continue>
(which is not turned on by default) in order to continue large
downloads whenever you are online but it can be used without C<--continue>, 
too.

Note: As mentioned earlier, Sirobot can only detect if a file is 
complete if the server provides information about it's content length.

See also C<--noremove>, C<--file> and L<"EXAMPLES">.

=for html <br><br>

=back

=cut

#     Log to file #fold02

=head2 Log to file

=over 4

=item --log <file>

Write logging information to F<file>. This is very useful because you 
cannot redirect output to file if you use C<--curses>. In that case,
everything printed to the upper part of the curses screen is also 
written to F<file>.

If you have curses turned off (eg. by C<--nocurses>), the output is the
same as on the screen. 

See also C<--nolog>

=for html <br><br>

=item --nolog

Turn logging off (default).

See also C<--log>

=for html <br><br>

=back

=cut

#     Daemon mode #fold02

=head2 Daemon mode

=over 4

=item --daemon

Turn on daemon mode. In this mode, Sirobot opens a named pipe 
(see C<--pipe>) and does not exit if there are no more waiting jobs.
You can write any arguments to the file and Sirobot will process them like 
those given by C<--file>. 

B<Note>: The named pipe must be created before you run Sirobot 
(eg. by the shell command C<mkfifo>).

B<Note>: Unfortunally, Sirobot blocks upon startup unless at least one line
is written to the pipe (eg. by echo >/tmp/sirobot). This is not Sirobot's 
fault.

See also C<--nodaemon>, C<--pipe> and L<"EXAMPLES">.

=for html <br><br>

=item --nodaemon

Turn off daemon mode (default)

See also C<--daemon> and C<--pipe>.

=for html <br><br>

=item --pipe <file>

Set name of pipe used for daemon mode. Default is F</tmp/sirobot>.

See also C<--daemon> and C<--nodaemon>.

=for html <br><br>

=back

=cut


#     Various options #fold02

=head2 Various options

=over 4

=item -d <n>

=item --depth <n>  

Sirobot can download images and links of HTML files as well. 
This option specifies how deep Sirobot should descent into it.
Depth 0 means Sirobot must only download the URLs specified in the
command line.

 Depth 1 tells Sirobot to download all included images but no
         further links.
 Depth 2 does the same as Depth 1 PLUS it fetches all links on this
         page PLUS all images of the links.
 Depth 3-... I think you guess it ;-)

To avoid downloading the whole internet, the use of C<--samedir>,
C<--sameserver> and C<--samedomain> as well as C<--exlucde> and C<--include>
is strongly recommended!

=for html <br><br>

=item --from <email>

Set value for the "From:" header in HTTP requests. By default,
Sirobot guesses your email address using the environment
variables C<$USER> and C<$HOSTNAME>.
Please set your email address with this option in F<~/.sirobotrc>
as shown in L<"EXAMPLES">.

=for html <br><br>

=item -H <header>=<value>

=item --header <header>=<value>

Add user defined header to all HTTP requests. If <header> is a "-",
the list of headers will be discarded.
As an example, C<--header From=myname@home> will be translated
into a "From: myname@home"-line in the HTTP request header.
Useful for sites that need a correct Referrer:-tag before they
allow downloads.

=for html <br><br>

=item -j

=item --jobs

Specifies the number of downloads Sirobot should do concurrently.
Default if 5. This is a global setting.

=for html <br><br>

=item --norobots

Ignore F</robots.txt>. Usually, HTTP-servers supply a file named
http://<servername>/robots.txt to inform automatic tools
like Sirobot which files on this server must not be downloaded
to prevent unwanted behaviour and infinite recursion.

B<This option is NOT RECOMMENDED!> USE WITH CARE!

=for html <br><br>

=item -p <dir>

=item --prefix <dir>

Saves all downloaded files to <dir>. <dir> and it's
subdirectories will be created if neccessary. By default, Sirobot
saves files to the current directory.

=for html <br><br>

=item -t n

=item --tries n

Tells how often Sirobot should try to get each file. Default is 1
which means Sirobot doesn't try again in case of failure.

To be able to determine if the download was incomplete, Sirobot
needs some help from the server so this feature might not work
with all files! This also applies to C<--continue>.

=for html <br><br>

=item --exec <prg>

Execute this external program whenever a HTML page was successfully 
downloaded. The following arguments will be appended: URL, filename and 
current depth.

Any output written to STDOUT will be discarded unless the lines start with
C<ECHO>. Error messages written to STDERR are currently not filtered and
therefor go directly to Sirobot's STDERR and may cause screen corruption
when Curses are turned on.

=for html <br><br>

=item --dump

=item --nodump

=item --dumpfile <file>

If turned on (C<--dump>), this feature will cause Sirobot to NOT follow
links and download them recursively but write all found links to a file
<file> (or STDOUT, if <file> is "-"). Double links are automatically
removed and dumped only once. Also, a C<"-d<number> "> will be put in
front of each link to represent the current depth setting.

This can be used by an external program to filter incoming links or to
run Sirobot in some kind of dry or test mode. In conjunction with
option C<--daemon>, the external program can feed the filtered links
back into Sirobot. Here's a simple and senseless loopback demonstration
(the named pipe /tmp/sirobot must exist):

  sirobot.pl --dump --dumpfile /tmp/sirobot --pipe /tmp/sirobot \
             --daemon -d2 http://www.sirlab.de/linux/

Please note the following drawbacks:

=over 3

=item *

The directory structure will not be correct unless you use C<--anyserver>.

=item *

Dumping links to STDOUT will corrupt the screen when Curses are turned on
(see C<--curses> and C<--nocurses>). You might also want to turn statistics
and other messages off, too: C<--quiet>, C<--nostats>.

=item *

Other settings (prefix, include/exclude list, headers, ...) are not 
forwarded. The last setting will be available.

=item *

Every dumped link is internally marked as successfully downloaded.

=item *

The dumpfile is opened and closed once for every link. On the one 
hand this means a loss of speed, on the other hand it allows you to get a
snapshot while Sirobot runs.

=back

=for html <br><br>

=item --wait

=item --nowait

Wait/don't wait for the user to press a key after all downloads are complete
and the statistics are shown. This option only affects Sirobot's behaviour 
if Curses are turned on.

=for html <br><br>

=back

=cut

#     Tuning options #fold02

=head2 Tuning options

=over 4

=item --blocksize <val>

Set the size of chunks to process. Files are downloaded and saved in chunks.
Bigger values mean less overhead and therefor much better performance but
also less accuracy of the progress bar in curses mode. 

Default value is 4096 Bytes (4 KB).
Use bigger values for fast links and use the default value or less for
slow ones.

=for html <br><br>

=back 

=cut

#   -- OPTIONS -- #fold02
#   ## EXAMPLES ## #fold01
#     EXAMPLES #fold02
=head1 EXAMPLES

Here are some of the many possibilities how Sirobot can be used.
Examples that don't fit in one line are split and a C<\> is appended.

=cut

#     Simple examples #fold02

=head2 Simple examples

=over 4

=item get a single page

 sirobot.pl http://www.sirlab.de/linux/

Get the Sirobot homepage (F<index.html>) and it's images and store them
in the current directory.

=for html <br><br>

=item save files to another directory

 sirobot.pl --prefix /tmp/fetched/ \
     http://www.sirlab.de/linux/

Same as above but save all downloaded files to F</tmp/fetched/>

=for html <br><br>

=item don't fetch recursive

 sirobot.pl --depth 0 http://www.sirlab.de/linux/

Get F<index.html> only (depth 0).

=for html <br><br>

=item fetch recursive

 sirobot.pl --anyserver --depth 2 http://www.tscc/searcher.html

Get all links mentioned on this page, whereever they're pointing to
with a maximum depth of two.

=for html <br><br>

=item exclude files (simple)

 sirobot.pl --exclude '\.gif$' http://www.linux.org/

Get homepage of linux.org but don't download URLs that end with ".gif".

=for html <br><br>

=item exclude files (advanced)

sirobot.pl --sameserver --depth 2 --exclude '.' \
     --include '\.html$' http://www.linux.org/

Get all pages recursively with a maximum depth of 2. Exclude all files
and re-allow those that end with ".html". That effectively means, only
HTML files get fetched but no images and other stuff.

=for html <br><br>

=back

=cut

#     Read links from file  #fold02

=head2 Read links from file

=over 4

=item Read links from file

 sirobot.pl --file getthis.txt

Read F<getthis.txt> and process it's content as command line arguments.
Imagine F<getthis.txt> consists of the following lines:

  ### start of getthis.txt ###
  --depth 0
  http://xy.org/
  --prefix zzz
  http://zzz.net/
  ###  end  of getthis.txt ###

which is the same as if you invoke

 sirobot.pl --depth 0 http://xy.org/ --prefix zzz http://zzz.net/

=for html <br><br>

=item read links from file and remove them if done

 sirobot.pl --remove --continue --file getthis.txt

This is nearly the same as above, with one major difference: After
http://xy.org/ and http://zzz.net/ are successfully downloaded,
F<getthis.txt> reads like this:

  ### start of getthis.txt ###
  --depth 0
  #[SIROBOT: done] http://xy.org/
  --prefix zzz
  #[SIROBOT: done] http://zzz.net/
  ###  end  of getthis.txt ###

What's that good for you ask? Well, imagine your connection becomes
terminated before the files are completly fetched (eg. because you've
hung up your modem, the link broke down etc). Then you can issue exactly 
the same line when you're back online again. You don't need to keep track 
which files are complete and which are not.

=for html <br><br>

=back

=cut

#     Your personal settings #fold02

=head2 Your personal settings

You may create a file F<~/.sirobotrc> which will be processed upon startup. 
It usually contains your preferred settings so you don't need to type them
every time.

Here's what I have put into my F<~/.sirobotrc>:

  ### start of ~/.sirobotrc ###
  # Put your email address here:
  --from yourusername@somedomain
  
  # Exclude all nasty big files that might accidently be fetched 
  # during recursions. They still may be re-enabled if needed.
  --exclude \.(gz|bz2|tar|tgz|zip|lzh|lha)(\?.*)?$
  --exclude \.(mpg|mp3|wav|aif|au)(\?.*)?$
  --exclude \.(ps|pdf)(\?.*)?$
  ###  end  of ~/.sirobotrc ###

=for html <br><br>

=cut

#     Using daemon mode #fold02

=head2 Using daemon mode

 mkfifo /tmp/sirobot
 sirobot.pl --daemon &
 echo >/tmp/sirobot

This creates the named pipe F</tmp/sirobot> (aka fifo) and puts Sirobot 
in daemon mode. Sirobot will block until you write something to the 
named pipe, that's what the last line is good for. 

Now you can send Sirobot additional commands if you write to the pipe:

 echo --depth 0 >/tmp/sirobot
 echo http://slashdot.org >/tmp/sirobot
 echo --prefix fm/ http://freshmeat.net >/tmp/sirobot

End daemon mode by writing C<--nodaemon> to the pipe:

 echo --nodaemon >/tmp/sirobot

=for html <br><br>

=cut

#     Hints #fold02

=head2 Hints

Remember that the following options affect only URLs issued after them:
C<--anyserver>, C<--samedomain>, C<--samedir>, C<--sameserver>, C<--depth>, 
C<--prefix>, C<--exclude>, C<--include> and C<--tries>.

This means, you can get URL1 with depth 2 and URL2 with depth 1 and save
them to different directories with one single call of Sirobot if you try the 
combination "--prefix dir1/ --depth 2 URL1 --prefix dir2/ --depth 1 URL2".

 sirobot.pl --anyserver -d 2 http://slashdot.org/ \
     --samedir http://freshmeat.net/

Get all links from Slashdot (depth 2) and those links from
freshmeat.net that point to the same directory (depth 2, too!).

You still didn't get it? Let me know! See L<"CONTACT"> for how to contact 
the author.

=cut

#   -- EXAMPLES -- #fold02
#   DISCLAIMER #fold01

=head1 DISCLAIMER

This piece of software comes with absolutely no warranty. The author
cannot be made responsible for any failures, defects or other damages 
caused by this program. Use it at your own risk.

=cut 

#   COPYRIGHT #fold01

=head1 COPYRIGHT

Sirobot is GPL.

=cut

#   CONTACT  #fold01

=head1 CONTACT

Problems? Found a bug? Want new features? 

Feel free to contact the author for any kind of reason except SPAM: 

    Email: Settel <settel@sirlab.de>
      WWW: http://www.sirlab.de/linux/contact.html
      IRC: Settel, usually on #unika and #linuxger

See the following page for updates, changelogs etc:

      http://www.sirlab.de/linux/

=cut

# -- documentation -- #fold01
