Welcome to HBH! If you have tried to register and didn't get a verification email, please use the following link to resend the verification email.
Link Seeker - Perl Code Bank
Link Seeker
Seeks and prints all URLs from a website.
#!/usr/bin/perl
# Coded by Trizen
# http://trizen.go.ro
use LWP::UserAgent;
# Shared HTTP client, stored in the global $lwp and used by Parse for every
# request: browser-like User-Agent string, 10-second timeout, and proxy
# settings loaded from the environment.
$lwp = LWP::UserAgent->new(
    agent   => 'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20',
    timeout => 10,
);
$lwp->env_proxy;
# Scan the command line for option switches. An argument that starts with one
# or more dashes followed by "m", "h" or "i" turns on the main-URL header
# ($mainurl), shows the usage text (help dies), or enables the internal-link
# search ($intern), respectively.
for my $arg (@ARGV) {
    $mainurl = 1 if $arg =~ /^[-]+m/;
    help()       if $arg =~ /^[-]+h/;
    $intern  = 1 if $arg =~ /^[-]+i/;
}
# help - abort the program with the usage text.
# The message ends in a newline, so die does not append a file/line suffix.
sub help {
    my @usage_lines = (
        '',
        "Usage: $0 <URL_list.txt> | <URLS>",
        'Options:',
        "\t -m, --main : prints main URL",
        "\t -i, --intern : seek for intern URLs",
        "\t -h, --help : prints this message",
        "\t",
        "Example: $0 google.com -i > google_urls.txt",
    );
    die join("\n", @usage_lines) . "\n";
}
# Main driver. Every non-option argument is either a site to scan (it contains
# a dot and does not end in .txt/.lst/.list) or the name of a file that lists
# URLs, one candidate per line. The current target is handed to Parse through
# the global $url.
foreach $url (@ARGV) {
    next if $url =~ /^-/;

    if ($url =~ /\./ and not $url =~ /\.(txt|lst|list)$/) {
        $url = 'http://' . $url unless $url =~ /^http/;
        Parse();
    }
    else {
        # Open the named file explicitly. Reading through the magic <ARGV>
        # handle would treat every command-line argument (URLs and switches
        # included) as a file to open and would empty @ARGV while this loop
        # is still iterating over it.
        my $list_fh;
        unless (open $list_fh, '<', $url) {
            warn "Can't open $url: $!\n";
            next;
        }
        my @entries = <$list_fh>;
        close $list_fh;

        # The inner loop reuses the global $url so Parse sees each entry;
        # foreach restores the outer value when the loop ends.
        foreach $url (@entries) {
            next if $url =~ /^-/;
            next unless $url =~ /((?:www|http)[^"'\n<>]+)/;

            # Keep only the URL-looking part of the line.
            $url = $1;
            $url = 'http://' . $url unless $url =~ /^http/;
            Parse();
        }
    }
}
# Parse - fetch the page named by the global $url and print the URLs it
# contains, sorted and without duplicates, one per line on STDOUT.
#
# Reads these package globals:
#   $url     - address to fetch
#   $lwp     - user agent object used for the request
#   $mainurl - when true, print a "Mainurl" header before the results
#   $intern  - when true, also collect href targets that do not start with
#              "http", prefixed with the scheme and host of $url
#
# Each candidate is passed to Push_URL through the global $found_url;
# Push_URL appends to the global @urls, which is emptied before returning.
# A failed request is reported on STDERR; whatever body came back is still
# scanned.
sub Parse {
    print "\n\n=>> Mainurl: $url\n" if $mainurl;

    my $response = $lwp->get($url);
    warn "Could not fetch $url: " . $response->status_line . "\n"
        unless $response->is_success;

    # Prefix for relative links. Built on a copy so the caller's $url (an
    # alias of its loop element) is not rewritten.
    (my $base = $url) =~ s[http://([^/]+).*][http://$1/];

    foreach my $word (split(' ', $response->content, 0)) {
        if ($word =~ /(http:([^"'\n<>]+))/) {
            my ($candidate, $tail) = ($1, $2);

            # Skip truncated matches such as "http://www" or a trailing dot.
            unless ($tail =~ m[\.$|//www$|http://$]) {
                $found_url = $candidate;
                Push_URL();
            }
        }

        next unless $intern;

        if ($word =~ /href=['"\s]*([^'"\s<>]+)/) {
            my $target = $1;
            unless ($target =~ /^http/) {
                $found_url = "$base$target";
                Push_URL();
            }
        }
    }

    # De-duplicate per call. A marker kept in a global between calls would
    # wrongly suppress a URL that was also the last one printed for the
    # previous page.
    my %seen;
    foreach my $unique (sort @urls) {
        next if $seen{$unique}++;
        print "$unique\n";
    }

    @urls = ();
}
# Push_URL - normalise the global $found_url and append it to the global @urls.
#
# Normalisation steps: drop a trailing ")", turn escaped slashes ("\/") into
# plain ones, drop a trailing backslash, collapse every "//" in the host/path
# part into "/", and make sure the result carries a scheme. An "https://"
# prefix is preserved; anything else is given "http://".
sub Push_URL {
    $found_url =~ s/\)$//;
    $found_url =~ s[\\/][/]g;
    $found_url =~ s/\\$//;

    # Take the scheme off before collapsing slashes so its "//" survives.
    my $scheme = 'http';
    if ($found_url =~ s[^(https?)://][]) {
        $scheme = $1;
    }

    $found_url =~ s[//][/]g;
    unless ($found_url =~ m[^http://]) {
        $found_url = "$scheme://$found_url";
    }
    push @urls, $found_url;
}
Comments
Sorry but there are no comments to display