#!/usr/bin/perl # << 'HEADER_END'; SimpleGet.pl -- standalone replacement for LWP::Simple For more perl goodness, go to mrob.com/pub/perl This is a fairly minimal implementation of the HTTP GET protocol in Perl. It only does the 'GET' method via http (not https), and handles proxies and redirects but nothing more complicated. It is designed to be a simple replacement for the LWP library for scripts that only use the get(), getprint() and/or getstore() routines. It should be noted that web client Perl scripts fall into two general categories: Simple scripts that just get a page and grab one small bit of information from it, and complex tools like browsers, robots, site-shadowing utilities, custom search engines, etc. For the simple category, the LWP library and the other libraries it depends on (over 1.5 megabytes) is overkill. To use this code, put it in a file called "SimpleGet.pl" somewhere in your Perl @INC searchpath, such as /usr/lib/perl5/site_perl then add this line to your Perl script: require "SimpleGet.pl"; Each of the following is equivalent (except that the last creates a file "temp.html" in the current directory): They load the sample URL and print the HTML to STDOUT, and set $err to the HTTP result (usually 200, unless my ISP's server is down): print get("http://www.mrob.com"); $err = $http_get_result; $_ = get("http://www.mrob.com"); print $_; $err = $http_get_result; $err = getprint("http://www.mrob.com"); $err = getstore("http://www.mrob.com", "temp.html"); system("cat temp.html"); get() reads the data via HTTP and returns it. getprint() sends the data to STDOUT with relatively low memory overhead (useful if the data is large) This library also invites one-liner shell commands such as: perl -e "require 'SimpleGet.pl'; getprint('http://www.mrob.com')" If you want to do anything more than loading simple pages and parsing their contents yourself, you should use LWP and the associated libraries. These libraries include such functions as MIME support, HTML parsing, handling of other transfer protocols like HTTPS and FTP, and much more. To learn more, see http://www.linpro.no/lwp/ Existing web-client scripts that contain 'use LWP::Simple;' and only call the get() and/or getprint() functions can be converted to use SimpleGet by replacing the 'use LWP::Simple;' line with 'require "SimpleGet.pl";' Two extensions have been added to the functionality provided by the LWP get() and getprint() routines: - Set $http_no_cache to 1 to force proxies to reload, or to 0 for a normal GET. - The variable $http_get_result is set to the result code (e.g. 200 or 404). (It is also returned by getprint() and getstore()) Copyright, Usage, Feedback, etc: This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The minimal adaptation of LWP::Simple was created by Robert Munafo (www.mrob.com) from the LWP code. If you have problems, suggestions, or feedback regarding this file, please send them to "mrob at mrob dot com". Please don't bother Gisle Aas (the primary author of LWP) about it because it isn't his creation. REVISION HISTORY 20000114 Initial version, derived from LWP::Simple, with minor additions to handle http_proxy 20000117 Add $http_get_result and get_to_stdout(). 20000118 Add $http_no_cache. Rename get_to_stdout() to getprint, because that's what it's called in LWP. 20000119 Add getstore(), is_success() and "1;" at end; make it work under "use strict;" 20000903 Add allow_post flag and POST handling; add post(), postprint(), and poststore(). Reference: http://www.oreilly.com/catalog/aspnut/chapter/ch06.html 20040531 On redirect, getstore and poststore close and reopen the file, so you get just the redirected data, without the redirecting page prepended (important if you're getting a binary or image file). 20050512 Add (nonstandard) optional Authorization: syntax 20100514 Add &get_pr_hdr and &post_pr_hdr 20120412 Don't try to reopen GET_OUTFILE in getprint mode (oops :) HEADER_END require 5.004; use MIME::Base64; my ($http_stream_out, %http_loop_check, $allow_post, $override_timeout); # Return value is the data (if any) # Result code in $http_get_result sub _trivial_http_get { my($host, $port, $path, $auth401) = @_; my($AGENT, $VERSION, $p); my($ifpost, $postdata); #print "_thg HOST=$host, PORT=$port, PATH=$path, AUTH=$auth401\n"; $AGENT = "get-minimal"; $VERSION = "20000118"; $path =~ s/ /%20/g; if ($allow_post) { if ($path =~ m/^(.+)\?(.+)$/) { $path = $1; $postdata = $2; $ifpost = 1; } else { $ifpost = 0; } } else { $ifpost = 0; } require IO::Socket; local($^W) = 0; my $sock = IO::Socket::INET->new(PeerAddr => $host, PeerPort => $port, Proto => 'tcp', Timeout => 60) || return; $sock->autoflush; my $netloc = $host; $netloc .= ":$port" if $port != 80; my $request = ($ifpost ? "POST" : "GET") . " $path HTTP/1.0\015\012" . "Host: $netloc\015\012" . "User-Agent: $AGENT/$VERSION/u\015\012"; # . "Referer: http://localhost/index.html\015\012" # . "User-Agent: Mozilla/4.0 (compatible; MSIE 4.5; Mac_PowerPC)\015\012"; $request .= "Pragma: no-cache\015\012" if ($main::http_no_cache); if ($ifpost) { $request .= "Content-type: application/x-www-form-urlencoded\015\012"; $request .= "Content-length: " . length($postdata) . "\015\012"; } if ($auth401 ne "") { $request .= "Authorization: BASIC " . encode_base64($auth401) . "\015\012"; } $request .= "\015\012"; if ($ifpost) { $request .= $postdata . "\015\012"; } #print $request; print $sock $request; my $buf = ""; my $n; my $b1 = ""; my $hdr = ""; while ($n = sysread($sock, $buf, 8*1024, length($buf))) { if ($b1 eq "") { # first block? $b1 = $buf; # Save this for errorcode parsing $buf =~ s/.+?\015?\012\015?\012//s; # zap header $hdr = $b1; $hdr =~ s|^(.+?)\015?\012\015?\012.*$|\1|s; # zap all but header if ($http_header_out) { print GET_HDRFILE "$hdr\n"; $hdr = ""; } } if ($http_stream_out) { print GET_OUTFILE $buf; $buf = ""; } } if (!(defined($n))) { return undef; } $main::http_get_result = 200; if ($b1 =~ m,^HTTP/\d+\.\d+\s+(\d+)[^\012]*\012,) { $main::http_get_result = $1; if (($main::http_get_result =~ /^30[1237]/) && ($b1 =~ /\012Location:\s*(\S+)/)) { # redirect my $url = $1; return undef if $http_loop_check{$url}++; # redirect must replace, not concatenate, output to a disk file. # if we're streaming, it will still concatenate. if ($http_stream_out) { if ($g_getstore_filepath eq 'stdout') { # Do not close and reopen } else { close GET_OUTFILE; open(GET_OUTFILE, "> $g_getstore_filepath"); } } return _get($url); } if ($main::http_get_result =~ /^2/) { # Okay } else { return undef; } } return $buf; } # Return value is the data (if any) # Result code in $http_get_result sub _get { my $url = shift; my $proxy = ""; my($auth401); #print "_get $url\n"; # To get web pages that require a name and password, append a string like # " Authorization: myname:mysecret" $auth401 = ""; if ($url =~ m/^(.+) Authorization: (.+)$/) { $url = $1; $url =~ s/ +$//; $auth401 = $2; $auth401 =~ s/ //g; } grep {(lc($_) eq "http_proxy") && ($proxy = $ENV{$_})} keys %ENV; if (($proxy eq "") && $url =~ m,^http://([^/:]+)(?::(\d+))?(/\S*)?$,) { #print "_get branch1 |$url|$auth401|\n"; my $host = $1; my $port = $2 || 80; my $path = $3; $path = "/" unless defined($path); return _trivial_http_get($host, $port, $path, $auth401); } elsif ($proxy =~ m,^http://([^/:]+):(\d+)(/\S*)?$,) { #print "_get branch2\n"; my $host = $1; my $port = $2; my $path = $url; return _trivial_http_get($host, $port, $path, $auth401); } else { #print "_get branch3 |$proxy|$url|\n"; return undef; } } # Return value is the data (if any) # Result code in $http_get_result sub get ($) { $http_stream_out = 0; $http_header_out = 0; $allow_post = 0; %http_loop_check = (); goto \&_get; } sub getprint ($) { my $url = shift; $allow_post = 0; $http_stream_out = 1; $g_getstore_filepath = 'stdout'; open(GET_OUTFILE, ">&STDOUT"); $http_header_out = 0; %http_loop_check = (); _get($url); close GET_OUTFILE; return $main::http_get_result; } sub getstore ($$) { my $url = shift; my $file = shift; $allow_post = 0; $http_stream_out = 1; $http_header_out = 0; $g_getstore_filepath = $file; open(GET_OUTFILE, "> $file"); %http_loop_check = (); _get($url); close GET_OUTFILE; return $main::http_get_result; } # Like getprint but also routes header to stderr sub get_pr_hdr ($) { my $url = shift; $allow_post = 0; $http_stream_out = 1; $g_getstore_filepath = 'stdout'; open(GET_OUTFILE, ">&STDOUT"); $http_header_out = 2; open(GET_HDRFILE, ">&STDERR"); %http_loop_check = (); _get($url); close GET_OUTFILE; return $main::http_get_result; } sub post ($) { $http_stream_out = 0; $http_header_out = 0; $allow_post = 1; %http_loop_check = (); goto \&_get; } sub postprint ($) { my $url = shift; $allow_post = 1; $http_stream_out = 1; open(GET_OUTFILE, ">&STDOUT"); $http_header_out = 0; %http_loop_check = (); _get($url); close GET_OUTFILE; return $main::http_get_result; } # Like postprint but also routes header to stderr sub post_pr_hdr ($) { my $url = shift; $allow_post = 1; $http_stream_out = 1; open(GET_OUTFILE, ">&STDOUT"); $http_header_out = 2; open(GET_HDRFILE, ">&STDERR"); %http_loop_check = (); _get($url); close GET_OUTFILE; return $main::http_get_result; } sub poststore ($$) { my $url = shift; my $file = shift; $allow_post = 1; $http_stream_out = 1; $http_header_out = 0; $g_getstore_filepath = $file; open(GET_OUTFILE, "> $file"); %http_loop_check = (); _get($url); close GET_OUTFILE; return $main::http_get_result; } sub is_success ($) { my $code = shift; return ($code =~ /^2/); } 1; # end of SimpleGet.pl 013