Welcome to HBH! If you have tried to register and didn't get a verification email, please use the following link to resend the verification email.

Link Seeker - Perl Code Bank


Link Seeker
Seeks and prints all URLs from a website.
                #!/usr/bin/perl

# Coded by Trizen
# http://trizen.go.ro

use LWP::UserAgent;

# Shared HTTP client for the whole script: browser-like User-Agent string,
# 10-second timeout, and proxy settings loaded through env_proxy.
$lwp = LWP::UserAgent->new(
    agent   => 'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20',
    timeout => 10,
);
$lwp->env_proxy();
# Scan the command line for option switches.  A switch is one or more dashes
# followed by its letter; every argument is tested against all three.
for my $arg (@ARGV) {
    $mainurl = 1 if $arg =~ /^[-]+m/;    # -m / --main
    help()       if $arg =~ /^[-]+h/;    # -h / --help (help dies with the usage text)
    $intern  = 1 if $arg =~ /^[-]+i/;    # -i / --intern
}
# help - abort the program with the usage text.
#
# The text is passed to die, so it goes to STDERR; because it ends in a
# newline, Perl does not append "at FILE line N".  $0 is interpolated as the
# script name.  The trailing blanks on some lines are part of the message.
sub help {
    my @usage_lines = (
        "",
        " Usage: $0 <URL_list.txt> | <URLS>",
        "",
        " Options:",
        "\t  -m, --main   : prints main URL",
        "\t  -i, --intern : seek for intern URLs ",
        "\t  -h, --help   : prints this message",
        "",
        "",
        "\t  ",
        " Example: $0 google.com -i > google_urls.txt",
        "",
        "",
    );
    die join("\n", @usage_lines);
}
# Main driver: walk the command-line arguments that are not option switches.
#
#  * An argument containing a dot and not ending in .txt/.lst/.list is taken
#    as a host or URL; "http://" is prepended when missing, then Parse runs.
#  * Any other argument is taken as the name of a text file with one URL per
#    line; every line containing something that starts with "www" or "http"
#    is processed the same way.
#
# The page to process is handed to Parse through the package global $url,
# which is why both loops use it as their loop variable.
foreach $url (@ARGV) {
    next if $url =~ /^-/;    # option switches are not URLs or files

    if ($url =~ /\./ and not $url =~ /\.(txt|lst|list)$/) {
        $url = 'http://' . $url unless $url =~ /^http/;
        Parse();
        next;
    }

    # Open exactly this list file.  Reading <ARGV> here would consume every
    # remaining command-line argument (emptying @ARGV while it is being
    # iterated) and would try to open URLs and switches as files.
    my $list_file = $url;
    my $list_fh;
    unless (open $list_fh, '<', $list_file) {
        warn "Cannot open URL list '$list_file': $!\n";
        next;
    }
    my @entries = <$list_fh>;    # read the whole list into memory
    close $list_fh;

    foreach $url (@entries) {
        next if $url =~ /^-/;

        # Stop at whitespace as well as quotes and angle brackets, so a
        # trailing carriage return or blank does not end up in the URL.
        next unless $url =~ /((?:www|http)[^"'\s<>]+)/;
        $url = $1;
        $url = 'http://' . $url unless $url =~ /^http/;
        Parse();
    }
}
# Parse - download the page named by the global $url and print the URLs
# found in it, sorted and without duplicates, one per line.
#
# Package globals read:
#   $url      page to download
#   $lwp      user agent object used for the request
#   $mainurl  when true, a "Mainurl" banner is printed first
#   $intern   when true, href targets that do not start with "http" are
#             collected too, prefixed with the root of the page's host
#
# Each candidate is passed to Push_URL through the global $found_url;
# Push_URL appends its cleaned-up form to the global @urls, which is emptied
# again before returning.  Takes no arguments and returns nothing.
sub Parse {
    print "\n\n=>> Mainurl: $url\n" if $mainurl;

    # Do not scrape the body of an error response as if it were the page.
    my $response = $lwp->get($url);
    unless ($response->is_success) {
        warn "Skipping $url: ", $response->status_line, "\n";
        return;
    }

    # Host root used as prefix for non-http href targets.  Computed on a
    # copy so the caller's $url is left untouched.
    (my $site_root = $url) =~ s[http://([^/]+).*][http://$1/];

    foreach my $token (split ' ', $response->content) {
        if ($token =~ /(http:([^"'\n<>]+))/) {
            my ($candidate, $after_scheme) = ($1, $2);

            # Ignore fragments that end right after a dot, "//www" or a
            # nested "http://".
            unless ($after_scheme =~ m[\.$|//www$|http://$]) {
                $found_url = $candidate;
                Push_URL();
            }
        }

        next unless $intern;
        if ($token =~ /href=['"\s]*([^'"\s<>]+)/) {
            my $target = $1;
            unless ($target =~ /^http/) {
                $found_url = "$site_root$target";
                Push_URL();
            }
        }
    }

    # The duplicate filter is local to this call, so a URL that was printed
    # last for the previous page is not suppressed for this one.
    my %seen;
    print "$_\n" for grep { !$seen{$_}++ } sort @urls;

    @urls = ();
    return;
}
# Push_URL - clean up the candidate held in the global $found_url (in place)
# and append the result to the global @urls.
#
# The clean-up steps run in a fixed order because the scheme has to be
# removed before doubled slashes are squeezed and is put back afterwards.
sub Push_URL {
    for ($found_url) {
        s/\)$//;          # one closing parenthesis at the end
        s[\\/][/]g;       # escaped slashes "\/" become "/"
        s/\\$//;          # one trailing backslash
        s[^http://][];    # take the scheme off ...
        s[//][/]g;        # ... so only slash pairs in the rest are squeezed
        $_ = 'http://' . $_ unless m[^http://];
    }
    push @urls, $found_url;
}

            
Comments
Sorry but there are no comments to display