--- toast 2003/08/24 23:48:52 1.179 +++ toast 2003/08/25 05:06:36 1.180 @@ -622,22 +622,41 @@ $_; } -sub openhttpurl(*$) +sub openhttp(*$;$) { local(*HANDLE) = shift; - my($url) = @_; + my($url, $method) = @_; + $method ||= "GET"; $url =~ m!^http://([\w\.]+)(:(\d+))?(/[\!-\~]*)?$! || error("bad url: $url"); explain("fetching $url"); my($host, $port, $path) = ($1, $3 || 80, $4 || '/'); my($ip) = gethostbyname($host) || error("gethostbyname $host: $!"); my($uagent) = "$myname/$myversion ($^O; $myurl)"; - my($request) = "GET $path HTTP/1.0\nHost: $host\nUser-Agent: $uagent\n\n"; - local($_); + my($request) = "$method $path HTTP/1.0\r\nHost: $host\r\n". + "User-Agent: $uagent\r\nAccept: */*\r\n\r\n"; socket(HANDLE, 2, 1, 0) || error("$!"); connect(HANDLE, pack("Sna4x8", 2, $port, $ip)) || error("connect $host:$port: $!"); unbuffer(HANDLE); print HANDLE $request || error("write to $host:$port: $!"); +} + +sub httphead($) +{ + my($url) = @_; + local(*HANDLE); + openhttp(*HANDLE, $url, "HEAD"); + my($result) = join('', <HANDLE>); + close(HANDLE) || error; + return $result; +} + +sub openhttpurl(*$) +{ + local(*HANDLE) = shift; + my($url) = @_; + openhttp(*HANDLE, $url); + local($_); $_ = <HANDLE>; s/\r?\n?$//; m|^HTTP/[\w\.]+ 200 | || error("$url: $_"); @@ -713,6 +732,56 @@ close(SOURCE) || error("close $url for read: $!"); } +sub linksfromstring($$) +{ + my($html, $url) = @_; + $url =~ m!^(((\w+:)//[^/]+)[^\?]*/)([^/\?](\?|$))?! + || error("bad url: $url"); + my($proto, $host, $parent) = ($3, $2, $1); + my(@links); + while($html =~ m!\b(href\s*\=\s*\"?|((http|ftp)://))([^\s\>\"]+)!ig) + { + my($link) = join("", undeftoempty($2, $4)); +warn("url = $url"); +warn("proto = $proto"); +warn("host = $host"); +warn("parent = $parent"); +warn("link = $link"); + next if $link =~ /^mailto:/i; +warn("link = $link"); + $link = $proto . $link if $link =~ m!^//!; +warn("link = $link"); + $link = $host . $link if $link =~ m!^/!; +warn("link = $link"); + $link = $parent . $link unless $link =~ m!^(\w+)://!; +warn("link = $link"); + push(@links, $link); + } + return @links; +} + +sub linksfromstream(*$) +{ + local(*HANDLE, $_) = shift; + my($url) = @_; + my(@links); + while(<HANDLE>) + { + push(@links, linksfromstring($_, $url)); + } + return @links; +} + +sub linksfromurl($) +{ + my($url) = @_; + local(*HANDLE); + openurl(*HANDLE, $url); + my(@links) = linksfromstream(*HANDLE, $url); + close(HANDLE) || error; + return @links; +} + ############################################################################## { @@ -1237,6 +1306,22 @@ ############################################################################## +sub lookslikepkgurl($;$$) +{ + my($url, $name, $version) = @_; + return false unless $url =~ m!^(http|ftp)://!; + return false if $url =~ m/\#/; + my($noquery) = stripquery($url); + return false unless $noquery =~ m!\.\w+$!; + return false if $noquery =~ m!\.html?$!i; + return true unless defined($name); + error unless defined($version); + my($basename) = basename($noquery); + return false unless $basename =~ /\Q$name\E/i; + return false unless $basename =~ /\Q$version\E/i; + return true; +} + sub findnewpkg($$) { my($name, $version) = @_; @@ -1258,31 +1343,20 @@ error("no listing for package $name on $sitename") if $notfound; - my($listurl); - $listurl ||= $fmurl{$_} for qw(bz2 tgz zip rpm); - $listurl || error("no suitable URL for package $name on $sitename"); + my($redirurl); + $redirurl ||= $fmurl{$_} for qw(bz2 tgz zip rpm); + $redirurl || error("no suitable URL for package $name on $sitename"); defined($version) || error("no version for package $name on $sitename"); - local(*LIST); - openurl(*LIST, $listurl); - my(@urls); - while(<LIST>) - { - while(m!\bhref\s*=\s*"([^"]+)"!gi) - { - my($url) = $1; - next unless $url =~ m!^(http|ftp)://!; - next if $url =~ m/\#/; - my($noquery) = stripquery($url); - next unless $noquery =~ m!\.\w+$!; - my($basename) = basename($noquery); - next unless $basename =~ /\Q$name\E/i; - next unless $basename =~ /\Q$version\E/i; - push(@urls, $url); - } - } - close(LIST) || error; + my($redirhead) = httphead($redirurl); + $redirhead =~ /^Location: ([^\r\n]+)/m || + error("unexpected response from $redirurl"); + my($url) = $1; + return ($name, $version, $url) if lookslikepkgurl($url); + + my(@urls) = grep(lookslikepkgurl($_, $name, $version), linksfromurl($url)); + my($ext); for $ext (qw[.tar.bz2 .tar.gz .tgz .zip .rpm]) { @@ -1290,7 +1364,7 @@ return ($name, $version, $matches[0]) if @matches; } - error("can't find URLs for $name version $version at $listurl"); + error("can't find URLs for $name version $version at $url"); } sub pkgurls($$) @@ -1382,9 +1456,6 @@ $ttl = 5 unless defined($ttl); error("too many links: $url") unless $ttl; - $url =~ m!^(\w+:)(//[^/]+)(([^\?]*)/[^/\?](\?|$))?! - || error("bad url: $url"); - my($proto, $host, $parent) = ($1, $1.$2, $1.$2.undeftoempty($4)); my($basename) = basename(stripquery($url)); my($file) = path($dir, $basename); @@ -1398,20 +1469,10 @@ if($header =~ /^\<.*\bHTML\b/i) { seek(FILE, 0, 0) || error("rewind $file: $!"); - local($_); - while(<FILE>) + my($link); + for $link (linksfromstream(*FILE, $url)) { - while(m!\b(href\s*\=\s*\"?|((http|ftp)://))([^\s\>\"]+)!ig) - { - my($link) = join("", undeftoempty($2, $4)); - next if $link =~ /^mailto:/i; - - $link = $proto . $link if $link =~ m!^//!; - $link = $host . $link if $link =~ m!^/!; - $link = $parent . $link unless $link =~ m!^(\w+)://!; - - $redir = $link if !$redir && basename(stripquery($link)) eq $basename; - } + $redir = $link if !$redir && basename(stripquery($link)) eq $basename; } } close(FILE) || error("close $file: $!");