# Copyright (c) 1994 Regents of the University of California.
# All rights reserved.
# $Id: momvisit.pl,v 1.10 1994/08/10 10:18:29 fielding Exp $
# ---------------------------------------------------------------------------
# momvisit: A package for traversing a subweb of the World-Wide Web,
#           storing the results in momhistory, and calling out traversal
#           events as the spider proceeds through the structure.
#
# This software has been developed by Roy Fielding as
# part of the Arcadia project at the University of California, Irvine.
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine.  The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.  THIS SOFTWARE IS PROVIDED ``AS IS''
# AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to the author Roy Fielding at <fielding@ics.uci.edu>.
# ---------------------------------------------------------------------------

require "www.pl";
require "wwwurl.pl";
require "wwwhtml.pl";
require "momconfig.pl";
require "momevent.pl";
require "momhistory.pl";

package momvisit;

# ==========================================================================
# Get defaults from momconfig.pl unless they are stupid

$pwd = ($momconfig'PWDdir || $ENV{'PWD'} || $ENV{'cwd'} || '');

# The initial base URL to use if a Top URL is relative
$BaseURL = ($momconfig'BaseURL || "file://localhost$pwd/");

# Default maximum traversal depth
$MaxDepth = ($momconfig'MaxDepth || 20);

# The maximum number of seconds to wait for a response.
$Timeout = ($momconfig'Timeout || 30);

# Amount of time required between any two requests to the same site.
if (!defined($momconfig'BetweenTime) || ($momconfig'BetweenTime < 15))
{
    $BetweenTime = 15;
}
else
{
    $BetweenTime = $momconfig'BetweenTime;
}

# Max number of consecutive requests to a site before long pause is required.
if (!defined($momconfig'MaxConsec) || ($momconfig'MaxConsec < 2))
{
    $MaxConsec = 5;
}
else
{
    $MaxConsec = $momconfig'MaxConsec;
}

if (!defined($momconfig'PauseTime) || ($momconfig'PauseTime < 60))
{
    $PauseTime = 60;          # The number of seconds to pause.
}
else
{
    $PauseTime = $momconfig'PauseTime;
}
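
# ==========================================================================
# The defaults above are read from momconfig.pl.  As a minimal sketch
# (the values below are hypothetical -- a real momconfig.pl is
# site-specific), a configuration that slows the spider down beyond the
# built-in floors might contain:
#
#     package momconfig;
#
#     $MaxDepth    = 10;     # give up ten links away from the Top URL
#     $Timeout     = 45;     # wait up to 45 seconds for a response
#     $BetweenTime = 30;     # 30+ seconds between requests to one site
#     $MaxConsec   = 3;      # after 3 consecutive requests to one site,
#     $PauseTime   = 120;    #   pause for two minutes
#
#     $CEfile = '/tmp/mom.ce';              # scratch file for unencode()
#     $CEdecoder{'x-gzip'}   = 'gzip -d';   # decoder command and filename
#     $CEextension{'x-gzip'} = '.gz';       #   extension, per encoding
#
#     1;
#
# Note that the floors enforced above (at least 15 seconds between
# requests to a site, at least a 60 second pause) cannot be lowered here.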
# ==========================================================================
# Set things which should not be changed at installation time.

$CurConsec = 1;    # Current number of consecutive requests to a site
$PrevSite  = '';   # Site of the last network request
$PrevTime  = 0;    # Time of the last network request

@TravNodes  = ();  # Nodes that we have yet to traverse for this task
@TravDepth  = ();  # and their traversal depth
@TravParent = ();  # and their parent's url

@TestLinks = ();   # URLs that we have yet to test for this task
@TestAbs   = ();   # and their original HREF in absolute form
@TestOrig  = ();   # and their original HREF
@TestType  = ();   # and their anchor type (Link, Image, Query)

# ==========================================================================
# ==========================================================================
# setMaxDepth($number) -- Set the maximum traversal depth to first argument.
#
sub setMaxDepth { $MaxDepth = $_[0]; }

# ==========================================================================
# Clear the Status of the traversal process (except the Visited tables)
#
sub clearstatus
{
    undef @TravNodes;
    undef @TravDepth;
    undef @TravParent;
    undef @TestLinks;
    undef @TestAbs;
    undef @TestOrig;
    undef @TestType;
    &momhistory'reset_traversal;
}
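
# ==========================================================================
# A traversal task is driven by calling &infostruct below.  A sketch of a
# hypothetical invocation (the task name and URL are invented for
# illustration), bounding the infostructure to a directory tree:
#
#     &momvisit'setMaxDepth(15);
#     &momvisit'infostruct('ourweb', 'Tree', 'OurWeb',
#                          'http://www.example.com/docs/');
#
# $tasktype must be 'Owner', 'Site', or 'Tree' (see should_traverse);
# a relative $tasktop is resolved against $BaseURL before traversal.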

# ==========================================================================
# infostruct(): Traverse an entire infostructure, starting at the $tasktop
#               url and continuing until there are no more nodes to traverse
#               within the infostructure bounded by $tasktype and $taskname,
#               or MaxDepth is reached.
#
sub infostruct
{
    local($task, $tasktype, $taskname, $tasktop) = @_;
    local($url, $node, $depth, $parent, $child, $abs, $orig, $type,
          $pos, $vidx, $response, $reused);

    if (!&momevent'begin_infostruct($task))
    {
        print STDERR "...skipping task $task named $taskname\n";
        return;
    }
    $url     = &wwwurl'absolute($BaseURL, $tasktop);
    $node    = &momhistory'remember($url, $momhistory'S_seen_not_tested);
    $depth   = 0;
    $parent  = '';
    $tasktop = $url;

    if (&momhistory'must_avoid($node))       # Node must be avoided?
    {
        print "Avoided Top URL $url\n";
        &momevent'traversed($node);
        undef $node;
    }
    while ($node)
    {
        next if (!$url);            # Just in case the record was deleted

        print "Traversing $url ... ";
        $response = &travlink($node, $parent);
        print $response, "\n";

        &momevent'begin_traversed($node, $parent);

        while ($TestLinks[0])
        {
            $child = shift(@TestLinks);  # Note that $url is now the parent
            $abs   = shift(@TestAbs);
            $orig  = shift(@TestOrig);
            $type  = shift(@TestType);
            next if (!$child);

            $vidx = &momhistory'remember($child,
                                         $momhistory'S_seen_not_tested);
            if (&momhistory'was_tested($vidx))
            {
                print "Reusing test of $child\n";
                $reused = 1;
            }
            else
            {
                if (&momhistory'must_avoid($vidx))
                {
                    print "Avoiding $child\n";
                    &momevent'tested($vidx, $url, $abs, $orig, $type, 1);
                    next;
                }
                print "Testing $child ... ";
                $response = &testlink($vidx, $url);
                print $response, "\n";
                $reused = 0;
            }
            if ($depth >= $MaxDepth)
            {                  # Don't traverse beyond maximum depth,
                ;              # Do Nothing
                               # but don't mark as a leaf either.
            }
            elsif (&should_traverse($vidx, $type, $tasktype, $taskname,
                                    $tasktop))
            {
                $pos = $#TravNodes + 1;
                $TravNodes[$pos]  = $vidx;
                $TravDepth[$pos]  = $depth + 1;
                $TravParent[$pos] = $url;
                &momhistory'set_status($vidx, $momhistory'S_will_traverse);
            }
            else
            {
                &momhistory'set_status($vidx, $momhistory'S_leafed);
            }
            &momevent'tested($vidx, $url, $abs, $orig, $type, $reused);
        }
        &momevent'end_traversed($task, $url, $#TravNodes + 1);
    }
    continue
    {
        $node   = shift(@TravNodes);
        $depth  = shift(@TravDepth);
        $parent = shift(@TravParent);
        if ($node) { $url = &momhistory'get_url($node); }
    }
    &momevent'end_infostruct($task);
    &clearstatus;
}

# ==========================================================================
# travlink(): Traverse the given URL and add any links in it to the
#             TestLinks queues.  Returns the server response code.
#
sub travlink
{
    local($node, $parent) = @_;
    local($url, $response, $headers, %headers, $content, $safe);

    $url = &momhistory'get_url($node);

    @TestLinks = ();  # Note that these queues are global to this package
    @TestAbs   = ();
    @TestOrig  = ();
    @TestType  = ();

    $headers = '';    # The response headers will be returned here
    %headers = ();    # The parsed response headers will be returned here
    $content = '';    # The response content will be returned here

    #
    # Set up the HTTP request headers
    #
    $headers{'Accept'} = 'text/html';
    if ($parent) { $headers{'Referer'} = $parent; }

    #
    # Make sure we are not making too many requests, too fast
    #
    &check_speed($url);

    #
    # Make the HTTP request
    #
    $response = &www'request('GET', $url, *headers, *content, $Timeout);

    #
    # Check the response code to see what we can do with it:
    #
    if (($momhistory'WhatToDo{$response} == $momhistory'DO_continue) &&
        (&is_html($url, *headers)))          # Only HTML contains links
    {
        $safe = 1;
        if ($ce = $headers{'content-encoding'})  # Yikes, it's compressed
        {
            $safe = &unencode($ce, *content);
        }
        if ($safe)
        {
            &wwwhtml'extract_links($url, *headers, *content, *TestLinks,
                                   *TestAbs, *TestOrig, *TestType);
        }
    }

    #
    # Store the response and the metainfo we got from it
    #
    &momhistory'store($node, $momhistory'S_traversed, $response, *headers);

    #
    # Handle the redirected links as if they were a single child
    #
    if ($momhistory'WhatToDo{$response} == $momhistory'DO_redirect)
    {
        if ($redir = $headers{'location'})
        {
            $redir =~ s/, .*//;    # Get rid of multiple Location: entries
        }
        elsif ($redir = $headers{'uri'})
        {
            $redir =~ s/\s*;.*//;
            $redir =~ s/, .*//;    # Get rid of any multiple URI: entries
        }
        if ($redir)
        {
            $TestLinks[0] = $redir;
            $TestAbs[0]   = $redir;
            $TestOrig[0]  = $redir;
            $TestType[0]  = 'R';
        }
    }
    return $response;
}
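
# ==========================================================================
# A sketch of how a redirect flows through travlink (the URLs are
# invented, and this assumes momhistory classifies the response code as
# DO_redirect): given
#
#     GET http://www.example.com/old  -->  302
#     Location: http://www.example.com/new, http://alt.example.com/new
#
# only the first Location entry survives the s/, .*// above, so the
# queues are left holding the single child 'http://www.example.com/new'
# with anchor type 'R', and infostruct tests (and possibly traverses)
# the redirect target like any other link.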

# ==========================================================================
# testlink(): Test the given URL using the HEAD request and update its
#             status.  Returns the server response code.
#
sub testlink
{
    local($node, $parent) = @_;
    local($url, $response, $headers, %headers, $content);

    $url = &momhistory'get_url($node);

    $headers = '';    # The response headers will be returned here
    %headers = ();    # The parsed response headers will be returned here
    $content = '';    # The response content will be returned here

    #
    # Set up the HTTP request headers
    #
    $headers{'Accept'} = '*/*';
    if ($parent) { $headers{'Referer'} = $parent; }

    #
    # Make sure we are not making too many requests, too fast
    #
    &check_speed($url);

    #
    # Make the HTTP request
    #
    $response = &www'request('HEAD', $url, *headers, *content, $Timeout);

    #
    # Store the response and the metainfo we got from it
    #
    &momhistory'store($node, $momhistory'S_tested_unknown, $response,
                      *headers);

    return $response;
}

# ==========================================================================
# check_speed(): Make sure that we are not going too fast and/or making
#                too many requests in sequence to a single site.  This
#                prevents a robot from completely dominating the resources
#                of a particular server.  If we are going too fast, go to
#                sleep for a while before returning.
#
sub check_speed
{
    local($url) = @_;
    local($site, $secs);

    return unless ($url =~ m#^http#);   # THIS LINE MUST CHANGE when
                                        # libwww-perl supports other
                                        # network services

#   return if ($url =~ m#^file://localhost/#); # No speed limit on filesystem

    return unless ($site = &wwwurl'get_site($url));

    LOITER:
    {
        if ($site ne $PrevSite)
        {
            $CurConsec = 1;
            last LOITER;
        }
        ++$CurConsec;
        if ($CurConsec > $MaxConsec)    # Too many consecutive requests
        {                               # to the same site?
            sleep($PauseTime);          # Yes - take a breather.
            $CurConsec = 1;
            last LOITER;
        }
        $secs = ($PrevTime + $BetweenTime) - time;
        if ($secs > 0)
        {
            sleep($secs);
        }
    }
    $PrevSite = $site;
    $PrevTime = time;
}
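
# ==========================================================================
# A worked example of check_speed with the default limits (BetweenTime 15,
# MaxConsec 5, PauseTime 60) against a single site:
#
#     request 1      : new site, so CurConsec = 1 and no delay
#     requests 2..5  : sleep until 15 seconds past the previous request
#     request 6      : CurConsec would reach 6 > MaxConsec, so sleep(60)
#                      and the consecutive count restarts at 1
#
# A request to any other site resets CurConsec to 1 and pays no delay;
# non-http URLs bypass the check entirely.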

# ==========================================================================
# should_traverse(): Determine whether or not the given node should be
#                    traversed or leafed for the current infostructure.
#
# Returns  1 -> Traverse this node;
#          0 -> Leaf this node;
#
sub should_traverse
{
    local($node, $linktype, $tasktype, $taskname, $tasktop) = @_;
    local($url, $response, %headers);

    #
    # Don't traverse links from IMG's and Queries
    #
    return 0 if (($linktype eq 'I') || ($linktype eq 'Q'));

    #
    # Don't traverse if we have been told to leaf it
    #
    return 0 if (&momhistory'must_avoid($node));

    #
    # Don't traverse again if we have already traversed it (or will soon)
    #
    return 0 if (&momhistory'checked_trav($node));

    #
    # Recall the result we got from our HEAD test
    #
    %headers = ();
    $response = &momhistory'recall($node, *headers);

    #
    # Don't traverse if result of prior test was not OK
    #
    return 0 unless
        (($momhistory'WhatToDo{$response} == $momhistory'DO_continue) ||
         ($momhistory'WhatToDo{$response} == $momhistory'DO_redirect));

    #
    # Don't traverse if it is not an HTML document
    #
    $url = &momhistory'get_url($node);
    return 0 unless (&is_html($url, *headers));

    #
    # Figure out whether or not this URL is in the current infostructure
    #
    if ($tasktype eq 'Owner')
    {
        if (defined($headers{'owner'}) && ($headers{'owner'} eq $taskname))
        {
            return 1;
        }
        return 0;
    }

    #
    # We are now left with Site or Tree as possible task types
    #
    local($scheme, $addr, $port, $path, $query, $frag)
        = &wwwurl'parse($url);
    local($ttsch, $ttaddr, $ttport, $ttpath, $ttq, $ttf)
        = &wwwurl'parse($tasktop);

    return 0 unless (($scheme eq $ttsch) &&
                     ($addr   eq $ttaddr) &&
                     ($port   eq $ttport));

    return 1 if ($tasktype eq 'Site');

    #
    # This must be a Tree traversal
    #
    $ttpath =~ s#/[^/]*$#/#;               # Trim any filename off the Top URL
    return 0 unless ($path =~ /^$ttpath/); # Check if url is below the Top URL

    return 1;
}

# ==========================================================================
# is_html(): Determine whether or not the given URL and response headers
#            indicate that it points to a text/html document.
#            Note that it also has the side-effect of setting the
#            content-type and content-encoding if they were previously
#            undefined.
#
# Returns  1 -> text/html content-type
#          0 -> not text/html
#
sub is_html
{
    local($url, *headers) = @_;
    local($suffix);

    if (!defined($headers{'content-type'}))
    {
        $_ = $url;
        s#/[^/.]*\.([^/]*)$#/#;   # Grab any filename extension off the URL
        if ($1) { $suffix = $1; }
        else    { $suffix = ''; }
        &wwwmime'set_content($suffix, *headers);
    }
    return 1 if ($headers{'content-type'} =~ m#\btext/html\b#io);
    return 0;
}

# ==========================================================================
# unencode(): Translate encoded content into its unencoded form.
#             The new content is passed-back in place of the old.
#             This is one of those routines that you hope will never
#             be called.
#
# Returns  0 -> failed to decode it
#          1 -> successful, content is now decoded
#
sub unencode
{
    local($enctype, *content) = @_;

    return 1 unless ($enctype);

    local($tfile) = $momconfig'CEfile;
    local($com)   = $momconfig'CEdecoder{$enctype};

    return 0 unless ($tfile && $com);

    local($ext) = ($momconfig'CEextension{$enctype} || '');

    if (!open(UNCOMP, "> $tfile$ext"))
    {
        print STDERR "Can't write to $tfile$ext: $!\n";
        return 0;
    }
    print UNCOMP $content;
    close(UNCOMP);
    undef $content;

    system("$com $tfile$ext");

    if (!open(UNCOMP, $tfile))
    {
        print STDERR "Can't open decompressed $tfile: $!\n";
        return 0;
    }
    local($/);
    undef($/);
    $content = <UNCOMP>;    # slurp the entire decoded file
    close(UNCOMP);
    unlink($tfile);

    return 1;
}

# ==========================================================================
1;