# Copyright (c) 1994 Regents of the University of California.
# All rights reserved.
# $Id: momvisit.pl,v 1.10 1994/08/10 10:18:29 fielding Exp $
# ---------------------------------------------------------------------------
# momvisit: A package for traversing a subweb of the World-Wide Web,
#           storing the results in momhistory, and calling out traversal
#           events as the spider proceeds through the structure.
#
# This software has been developed by Roy Fielding as
# part of the Arcadia project at the University of California, Irvine.
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine.  The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.  THIS SOFTWARE IS PROVIDED ``AS IS''
# AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to the author Roy Fielding at <fielding@ics.uci.edu>.
# ---------------------------------------------------------------------------

require "www.pl";
require "wwwurl.pl";
require "wwwhtml.pl";
require "momconfig.pl";
require "momevent.pl";
require "momhistory.pl";

package momvisit;

# ==========================================================================
# Get defaults from momconfig.pl unless they are stupid

$pwd = ($momconfig'PWDdir || $ENV{'PWD'} || $ENV{'cwd'} || '');

# The initial base URL to use if a Top URL is relative
$BaseURL = ($momconfig'BaseURL || "file://localhost$pwd/");

# Default maximum traversal depth
$MaxDepth = ($momconfig'MaxDepth || 20);

# The maximum number of seconds to wait for a response.
$Timeout = ($momconfig'Timeout || 30);

# Amount of time required between any two requests to the same site.
if (!defined($momconfig'BetweenTime) || ($momconfig'BetweenTime < 15))
{
    $BetweenTime = 15;
}
else
{
    $BetweenTime = $momconfig'BetweenTime;
}

# Max number of consecutive requests to a site before long pause is required.
if (!defined($momconfig'MaxConsec) || ($momconfig'MaxConsec < 2))
{
    $MaxConsec = 5;
}
else
{
    $MaxConsec = $momconfig'MaxConsec;
}

if (!defined($momconfig'PauseTime) || ($momconfig'PauseTime < 60))
{
    $PauseTime = 60;          # The number of seconds to pause.
}
else
{
    $PauseTime = $momconfig'PauseTime;
}
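
# ==========================================================================
# The defaults above are read from momconfig.pl.  As a minimal sketch
# (the values below are hypothetical -- a real momconfig.pl is
# site-specific), a configuration that slows the spider down beyond the
# built-in floors might contain:
#
#     package momconfig;
#
#     $MaxDepth    = 10;     # give up ten links away from the Top URL
#     $Timeout     = 45;     # wait up to 45 seconds for a response
#     $BetweenTime = 30;     # 30+ seconds between requests to one site
#     $MaxConsec   = 3;      # after 3 consecutive requests to one site,
#     $PauseTime   = 120;    #   pause for two minutes
#
#     $CEfile = '/tmp/mom.ce';              # scratch file for unencode()
#     $CEdecoder{'x-gzip'}   = 'gzip -d';   # decoder command and filename
#     $CEextension{'x-gzip'} = '.gz';       #   extension, per encoding
#
#     1;
#
# Note that the floors enforced above (at least 15 seconds between
# requests to a site, at least a 60 second pause) cannot be lowered here.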
# ==========================================================================
# Set things which should not be changed at installation time.

$CurConsec = 1;    # Current number of consecutive requests to a site
$PrevSite  = '';   # Site of the last network request
$PrevTime  = 0;    # Time of the last network request

@TravNodes  = ();  # Nodes that we have yet to traverse for this task
@TravDepth  = ();  # and their traversal depth
@TravParent = ();  # and their parent's url

@TestLinks = ();   # URLs that we have yet to test for this task
@TestAbs   = ();   # and their original HREF in absolute form
@TestOrig  = ();   # and their original HREF
@TestType  = ();   # and their anchor type (Link, Image, Query)

# ==========================================================================
# ==========================================================================
# setMaxDepth($number) -- Set the maximum traversal depth to first argument.
#
sub setMaxDepth { $MaxDepth = $_[0]; }

# ==========================================================================
# Clear the Status of the traversal process (except the Visited tables)
#
sub clearstatus
{
    undef @TravNodes;
    undef @TravDepth;
    undef @TravParent;
    undef @TestLinks;
    undef @TestAbs;
    undef @TestOrig;
    undef @TestType;
    &momhistory'reset_traversal;
}
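
# ==========================================================================
# A traversal task is driven by calling &infostruct below.  A sketch of a
# hypothetical invocation (the task name and URL are invented for
# illustration), bounding the infostructure to a directory tree:
#
#     &momvisit'setMaxDepth(15);
#     &momvisit'infostruct('ourweb', 'Tree', 'OurWeb',
#                          'http://www.example.com/docs/');
#
# $tasktype must be 'Owner', 'Site', or 'Tree' (see should_traverse);
# a relative $tasktop is resolved against $BaseURL before traversal.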

# ==========================================================================
# infostruct(): Traverse an entire infostructure, starting at the $tasktop
#               url and continuing until there are no more nodes to traverse
#               within the infostructure bounded by $tasktype and $taskname,
#               or MaxDepth is reached.
#
sub infostruct
{
    local($task, $tasktype, $taskname, $tasktop) = @_;
    local($url, $node, $depth, $parent, $child, $abs, $orig, $type,
          $pos, $vidx, $response, $reused);

    if (!&momevent'begin_infostruct($task))
    {
        print STDERR "...skipping task $task named $taskname\n";
        return;
    }
    $url     = &wwwurl'absolute($BaseURL, $tasktop);
    $node    = &momhistory'remember($url, $momhistory'S_seen_not_tested);
    $depth   = 0;
    $parent  = '';
    $tasktop = $url;

    if (&momhistory'must_avoid($node))       # Node must be avoided?
    {
        print "Avoided Top URL $url\n";
        &momevent'traversed($node);
        undef $node;
    }
    while ($node)
    {
        next if (!$url);            # Just in case the record was deleted

        print "Traversing $url ... ";
        $response = &travlink($node, $parent);
        print $response, "\n";

        &momevent'begin_traversed($node, $parent);

        while ($TestLinks[0])
        {
            $child = shift(@TestLinks);  # Note that $url is now the parent
            $abs   = shift(@TestAbs);
            $orig  = shift(@TestOrig);
            $type  = shift(@TestType);
            next if (!$child);

            $vidx = &momhistory'remember($child,
                                         $momhistory'S_seen_not_tested);
            if (&momhistory'was_tested($vidx))
            {
                print "Reusing test of $child\n";
                $reused = 1;
            }
            else
            {
                if (&momhistory'must_avoid($vidx))
                {
                    print "Avoiding $child\n";
                    &momevent'tested($vidx, $url, $abs, $orig, $type, 1);
                    next;
                }
                print "Testing $child ... ";
                $response = &testlink($vidx, $url);
                print $response, "\n";
                $reused = 0;
            }
            if ($depth >= $MaxDepth)
            {                  # Don't traverse beyond maximum depth,
                ;              # Do Nothing
                               # but don't mark as a leaf either.
            }
            elsif (&should_traverse($vidx, $type, $tasktype, $taskname,
                                    $tasktop))
            {
                $pos = $#TravNodes + 1;
                $TravNodes[$pos]  = $vidx;
                $TravDepth[$pos]  = $depth + 1;
                $TravParent[$pos] = $url;
                &momhistory'set_status($vidx, $momhistory'S_will_traverse);
            }
            else
            {
                &momhistory'set_status($vidx, $momhistory'S_leafed);
            }
            &momevent'tested($vidx, $url, $abs, $orig, $type, $reused);
        }
        &momevent'end_traversed($task, $url, $#TravNodes + 1);
    }
    continue
    {
        $node   = shift(@TravNodes);
        $depth  = shift(@TravDepth);
        $parent = shift(@TravParent);
        if ($node) { $url = &momhistory'get_url($node); }
    }
    &momevent'end_infostruct($task);
    &clearstatus;
}

# ==========================================================================
# travlink(): Traverse the given URL and add any links in it to the
#             TestLinks queues.  Returns the server response code.
#
sub travlink
{
    local($node, $parent) = @_;
    local($url, $response, $headers, %headers, $content, $safe);

    $url = &momhistory'get_url($node);

    @TestLinks = ();  # Note that these queues are global to this package
    @TestAbs   = ();
    @TestOrig  = ();
    @TestType  = ();

    $headers = '';    # The response headers will be returned here
    %headers = ();    # The parsed response headers will be returned here
    $content = '';    # The response content will be returned here

    #
    # Set up the HTTP request headers
    #
    $headers{'Accept'} = 'text/html';
    if ($parent) { $headers{'Referer'} = $parent; }

    #
    # Make sure we are not making too many requests, too fast
    #
    &check_speed($url);

    #
    # Make the HTTP request
    #
    $response = &www'request('GET', $url, *headers, *content, $Timeout);

    #
    # Check the response code to see what we can do with it:
    #
    if (($momhistory'WhatToDo{$response} == $momhistory'DO_continue) &&
        (&is_html($url, *headers)))          # Only HTML contains links
    {
        $safe = 1;
        if ($ce = $headers{'content-encoding'})  # Yikes, it's compressed
        {
            $safe = &unencode($ce, *content);
        }
        if ($safe)
        {
            &wwwhtml'extract_links($url, *headers, *content, *TestLinks,
                                   *TestAbs, *TestOrig, *TestType);
        }
    }

    #
    # Store the response and the metainfo we got from it
    #
    &momhistory'store($node, $momhistory'S_traversed, $response, *headers);

    #
    # Handle the redirected links as if they were a single child
    #
    if ($momhistory'WhatToDo{$response} == $momhistory'DO_redirect)
    {
        if ($redir = $headers{'location'})
        {
            $redir =~ s/, .*//;    # Get rid of multiple Location: entries
        }
        elsif ($redir = $headers{'uri'})
        {
            $redir =~ s/\s*;.*//;
            $redir =~ s/, .*//;    # Get rid of any multiple URI: entries
        }
        if ($redir)
        {
            $TestLinks[0] = $redir;
            $TestAbs[0]   = $redir;
            $TestOrig[0]  = $redir;
            $TestType[0]  = 'R';
        }
    }
    return $response;
}
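
# ==========================================================================
# A sketch of how a redirect flows through travlink (the URLs are
# invented, and this assumes momhistory classifies the response code as
# DO_redirect): given
#
#     GET http://www.example.com/old  -->  302
#     Location: http://www.example.com/new, http://alt.example.com/new
#
# only the first Location entry survives the s/, .*// above, so the
# queues are left holding the single child 'http://www.example.com/new'
# with anchor type 'R', and infostruct tests (and possibly traverses)
# the redirect target like any other link.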

# ==========================================================================
# testlink(): Test the given URL using the HEAD request and update its
#             status.  Returns the server response code.
#
sub testlink
{
    local($node, $parent) = @_;
    local($url, $response, $headers, %headers, $content);

    $url = &momhistory'get_url($node);

    $headers = '';    # The response headers will be returned here
    %headers = ();    # The parsed response headers will be returned here
    $content = '';    # The response content will be returned here

    #
    # Set up the HTTP request headers
    #
    $headers{'Accept'} = '*/*';
    if ($parent) { $headers{'Referer'} = $parent; }

    #
    # Make sure we are not making too many requests, too fast
    #
    &check_speed($url);

    #
    # Make the HTTP request
    #
    $response = &www'request('HEAD', $url, *headers, *content, $Timeout);

    #
    # Store the response and the metainfo we got from it
    #
    &momhistory'store($node, $momhistory'S_tested_unknown, $response,
                      *headers);

    return $response;
}

# ==========================================================================
# check_speed(): Make sure that we are not going too fast and/or making
#                too many requests in sequence to a single site.  This
#                prevents a robot from completely dominating the resources
#                of a particular server.  If we are going too fast, go to
#                sleep for a while before returning.
#
sub check_speed
{
    local($url) = @_;
    local($site, $secs);

    return unless ($url =~ m#^http#);   # THIS LINE MUST CHANGE when
                                        # libwww-perl supports other
                                        # network services

#   return if ($url =~ m#^file://localhost/#); # No speed limit on filesystem

    return unless ($site = &wwwurl'get_site($url));

    LOITER:
    {
        if ($site ne $PrevSite)
        {
            $CurConsec = 1;
            last LOITER;
        }
        ++$CurConsec;
        if ($CurConsec > $MaxConsec)    # Too many consecutive requests
        {                               # to the same site?
            sleep($PauseTime);          # Yes - take a breather.
            $CurConsec = 1;
            last LOITER;
        }
        $secs = ($PrevTime + $BetweenTime) - time;
        if ($secs > 0)
        {
            sleep($secs);
        }
    }
    $PrevSite = $site;
    $PrevTime = time;
}
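
# ==========================================================================
# A worked example of check_speed with the default limits (BetweenTime 15,
# MaxConsec 5, PauseTime 60) against a single site:
#
#     request 1      : new site, so CurConsec = 1 and no delay
#     requests 2..5  : sleep until 15 seconds past the previous request
#     request 6      : CurConsec would reach 6 > MaxConsec, so sleep(60)
#                      and the consecutive count restarts at 1
#
# A request to any other site resets CurConsec to 1 and pays no delay;
# non-http URLs bypass the check entirely.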

# ==========================================================================
# should_traverse(): Determine whether or not the given node should be
#                    traversed or leafed for the current infostructure.
#
# Returns  1 -> Traverse this node;
#          0 -> Leaf this node;
#
sub should_traverse
{
    local($node, $linktype, $tasktype, $taskname, $tasktop) = @_;
    local($url, $response, %headers);

    #
    # Don't traverse links from IMG's and Queries
    #
    return 0 if (($linktype eq 'I') || ($linktype eq 'Q'));

    #
    # Don't traverse if we have been told to leaf it
    #
    return 0 if (&momhistory'must_avoid($node));

    #
    # Don't traverse again if we have already traversed it (or will soon)
    #
    return 0 if (&momhistory'checked_trav($node));

    #
    # Recall the result we got from our HEAD test
    #
    %headers = ();
    $response = &momhistory'recall($node, *headers);

    #
    # Don't traverse if result of prior test was not OK
    #
    return 0 unless
        (($momhistory'WhatToDo{$response} == $momhistory'DO_continue) ||
         ($momhistory'WhatToDo{$response} == $momhistory'DO_redirect));

    #
    # Don't traverse if it is not an HTML document
    #
    $url = &momhistory'get_url($node);
    return 0 unless (&is_html($url, *headers));

    #
    # Figure out whether or not this URL is in the current infostructure
    #
    if ($tasktype eq 'Owner')
    {
        if (defined($headers{'owner'}) && ($headers{'owner'} eq $taskname))
        {
            return 1;
        }
        return 0;
    }

    #
    # We are now left with Site or Tree as possible task types
    #
    local($scheme, $addr, $port, $path, $query, $frag)
        = &wwwurl'parse($url);
    local($ttsch, $ttaddr, $ttport, $ttpath, $ttq, $ttf)
        = &wwwurl'parse($tasktop);

    return 0 unless (($scheme eq $ttsch) &&
                     ($addr   eq $ttaddr) &&
                     ($port   eq $ttport));

    return 1 if ($tasktype eq 'Site');

    #
    # This must be a Tree traversal
    #
    $ttpath =~ s#/[^/]*$#/#;               # Trim any filename off the Top URL
    return 0 unless ($path =~ /^$ttpath/); # Check if url is below the Top URL

    return 1;
}

# ==========================================================================
# is_html(): Determine whether or not the given URL and response headers
#            indicate that it points to a text/html document.
#            Note that it also has the side-effect of setting the
#            content-type and content-encoding if they were previously
#            undefined.
#
# Returns  1 -> text/html content-type
#          0 -> not text/html
#
sub is_html
{
    local($url, *headers) = @_;
    local($suffix);

    if (!defined($headers{'content-type'}))
    {
        $_ = $url;
        s#/[^/.]*\.([^/]*)$#/#;   # Grab any filename extension off the URL
        if ($1) { $suffix = $1; }
        else    { $suffix = ''; }
        &wwwmime'set_content($suffix, *headers);
    }
    return 1 if ($headers{'content-type'} =~ m#\btext/html\b#io);
    return 0;
}

# ==========================================================================
# unencode(): Translate encoded content into its unencoded form.
#             The new content is passed-back in place of the old.
#             This is one of those routines that you hope will never
#             be called.
#
# Returns  0 -> failed to decode it
#          1 -> successful, content is now decoded
#
sub unencode
{
    local($enctype, *content) = @_;

    return 1 unless ($enctype);

    local($tfile) = $momconfig'CEfile;
    local($com)   = $momconfig'CEdecoder{$enctype};

    return 0 unless ($tfile && $com);

    local($ext) = ($momconfig'CEextension{$enctype} || '');

    if (!open(UNCOMP, "> $tfile$ext"))
    {
        print STDERR "Can't write to $tfile$ext: $!\n";
        return 0;
    }
    print UNCOMP $content;
    close(UNCOMP);
    undef $content;

    system("$com $tfile$ext");

    if (!open(UNCOMP, $tfile))
    {
        print STDERR "Can't open decompressed $tfile: $!\n";
        return 0;
    }
    local($/);
    undef($/);
    $content = <UNCOMP>;    # slurp the entire decoded file
    close(UNCOMP);
    unlink($tfile);

    return 1;
}

# ==========================================================================
1;