方式:透過Perl包裹wget,將每日繁瑣的數據儲存作業儲存在特定主機,並使用cron job來定期驅動本程式。
#!/usr/bin/perl
#
# webcatcher.pl
#
# $Id: webcatcher.pl,v 1.1 2008/08/12 08:24:28 kent Exp $
#
# This script is used to crawl web pages. During execution it wraps
# wget as the grabber. It can help you grab web pages regularly and
# send a notification email to the appropriate person. You may modify
# it under the same terms as Perl itself.
#
#
# Copyright(C)2008 Kent Hsu
#
#
use strict;
use Getopt::Std;
use XML::Simple;
use Mail::Sender;
#############################
# Environment configuration #
#############################
my $module_name   = 'webcatcher.pl';
my $config_name   = 'webcatcher.xml';    # default config file; -f replaces it
my $wget_command  = `which wget`;        # still newline-terminated; exec_wget chomps it
my $file_home_dir = `pwd`;
my $base_url;                            # filled in by parse_from_xml
my $date_now;                            # filled in by get_date

##########################################
# Message strings (set by init_string)   #
##########################################
my (
    $help,
    $wget_not_exist,
    $wget_failed,
    $config_file_not_exist,
    $vars_name_null,
    $vars_undef,
    $xml_parse_failed,
    $xml_null,
    $base_url_empty,
    $no_server_exist,
    $message_subject,
    $message_body,
    $done,
);

####################
# Shared state     #
####################
my $full_xml;         # parsed configuration as returned by XMLin
my @servers;          # sorted keys of the <server> section
my $notify_ref;       # the <notification> section
my $server_ref;       # the <server> section
my %server_detail;    # copy of the server currently handled by main_proc

# The message strings must exist before any sub can die with one of them.
init_string();

# Program flow: options -> configuration -> downloads -> report mail.
MAIN: {
    parse_args();
    parse_from_xml();
    main_proc();
    send_notification();
    print $done;
}    #End of MAIN:
# parse_args
#
# Reads the command-line switches with getopts and acts on the first one
# that is set, checked in this order:
#   -e       run with the default configuration file (nothing to change)
#   -f FILE  use FILE as the configuration file
#   -v       dump the parsed configuration, then terminate via die $done
#   -h, or no recognised switch at all: terminate via die $help
# An option string getopts rejects also ends in die $help.
sub parse_args {
    use vars qw/ %opt /;

    getopts( "hevf:", \%opt ) or die $help;

    # -e wins over everything else: keep the defaults and carry on.
    return if $opt{e};

    if ( $opt{f} ) {
        $config_name = $opt{f};
        return;
    }

    if ( $opt{v} ) {
        show_parsed_xml();
        die $done;
    }

    # -h and "no usable option" both end with the usage text.
    die $help;
}
# main_proc
#
# Runs wget (through exec_wget) for every URL of every server listed in
# @servers and appends one entry per server to $message_body.
#
# The wget option string of a server is assembled from its settings:
#   save_at      download directory; today's date (get_date) is appended
#   deeplyclaw   "true" => recursive download (-r -l 0), otherwise -p
#   usepassword  "true" => --http-user / --http-passwd from userid/password
#   usecookie    "true" => the first URL saves its cookies to cookie.txt,
#                every later URL loads them and sends the first URL as
#                referer
#   url/item     a list of entries, a single entry hash, or a plain URL;
#                an entry hash carries "value" (the URL) and optionally
#                "postdata"
#
# Takes no arguments. Reads @servers and $server_ref, overwrites
# %server_detail for every server and appends to $message_body.
sub main_proc {
    foreach my $server_name (@servers) {
        %server_detail = %{ $server_ref->{$server_name} };

        my $today  = get_date();
        my $params = ' -nH -nd --no-check-certificate ';
        $params .= "-P" . $server_detail{save_at} . "/$today";
        $params .= $server_detail{deeplyclaw} eq "true" ? " -r -l 0 " : " -p ";
        if ( $server_detail{usepassword} eq "true" ) {
            $params .= ' --http-user=' . $server_detail{userid}
                . ' --http-passwd=' . $server_detail{password};
        }

        my $url_item  = $server_detail{url}{item};
        my $item_type = ref($url_item);

        if ( $item_type eq 'ARRAY' or $item_type eq 'HASH' ) {

            # A lone entry hash is handled like a one-element list, so its
            # "value" is fetched instead of the stringified hash reference.
            my @entries = $item_type eq 'ARRAY' ? @$url_item : ($url_item);
            my $first_url =
                ref( $entries[0] ) eq 'HASH'
                ? $entries[0]{value}
                : $entries[0];

            my $cookie_saved = 0;
            foreach my $entry (@entries) {
                my ( $url, $postdata ) =
                    ref($entry) eq 'HASH'
                    ? ( $entry->{value}, $entry->{postdata} )
                    : ( $entry, undef );

                my $cookie_string = '';
                if ( $server_detail{usecookie} eq "true" ) {
                    if ($cookie_saved) {
                        $cookie_string = " --referer=" . $first_url
                            . " --cookies=on --keep-session-cookies --load-cookies=cookie.txt ";
                    }
                    else {
                        $cookie_string = " --cookies=on --keep-session-cookies --save-cookies=cookie.txt ";
                        $cookie_saved  = 1;
                    }
                }

                my $item_params = $params . $cookie_string;
                if ($postdata) {

                    # Leading blank: without it the option fuses with the
                    # --http-passwd value when no cookie options sit between.
                    $item_params .= " --post-data '" . $postdata . "' ";
                }
                exec_wget( $item_params, $url );
            }
        }
        else {
            # Plain URL string: fetched as is, without cookie or POST handling.
            exec_wget( $params, $url_item );
        }

        $message_body .= "\n$server_name\nClick me to view MRTG Graphy.\n";
    }    #End of foreach (@servers)
}
# exec_wget
#
# Runs wget once and prints whatever it wrote to standard output.
#
#   $args  ready-made wget option string
#   $url   URL to fetch
#
# Dies with $wget_not_exist when the path found by `which wget` is not a
# regular file. A non-zero wget exit status is reported with warn (using
# $wget_failed) and does not stop the run, so the remaining URLs are
# still fetched.
#
# NOTE(review): $args and $url are interpolated into a shell command line
# unquoted; a URL containing shell metacharacters (e.g. "&") has to be
# quoted by whoever supplies it.
sub exec_wget {
    my ( $args, $url ) = @_;

    chomp $wget_command;    # `which` output ends with a newline
    die $wget_not_exist unless -f $wget_command;

    my $cmd_line = " " . $args . " " . $url;
    my $output   = `$wget_command $cmd_line`;
    my $status   = $?;      # read right away, before anything can reset it

    print $output . "\n";

    if ( $status != 0 ) {
        warn "wget returned status " . ( $status >> 8 ) . " for $url\n"
            . $wget_failed;
    }
}
# parse_from_xml
#
# Loads the configuration file named by $config_name with XMLin and fills
# the file-level state from it:
#   $full_xml    the complete parsed structure
#   $base_url    its "base_url" entry
#   $notify_ref  its "notification" entry
#   $server_ref  its "server" entry
#   @servers     the sorted keys of $server_ref
#
# Dies when the file does not exist, when the parsed structure or base_url
# is empty, when no server is configured, or (through validate_field) when
# a server lacks one of the required fields.
sub parse_from_xml {
    unless ( -f $config_name ) {
        die $config_name . $config_file_not_exist;
    }

    $full_xml = XMLin($config_name);
    die $xml_null unless $full_xml;

    # Plain arrow dereference; the former "%$ref->{key}" spelling is a
    # fatal error on Perl 5.22 and later.
    $base_url = $full_xml->{base_url};
    die $base_url_empty unless $base_url;

    $notify_ref = $full_xml->{notification};
    $server_ref = $full_xml->{server};

    @servers = sort keys(%$server_ref);
    die $no_server_exist unless @servers;

    # Fields every server entry has to provide.
    my @all_models = qw(save_at requirehttps usepassword userid password usecookie url);
    foreach my $server_name (@servers) {
        validate_field( \@all_models, \%{ $server_ref->{$server_name} } );
    }
}
# send_notification
#
# Sends $message_body as a text/html part inside a multipart/alternative
# section of a multipart/mixed mail, using Mail::Sender. SMTP server,
# sender and recipient come from $notify_ref. The subject is
# $message_subject and $date_now joined by a literal " . " (the dot sits
# inside the double-quoted string).
#
# Nothing is thrown to the caller: when the eval block dies or yields a
# false value, "Error sending mail: ..." is printed instead.
sub send_notification {
    my $sent = eval {
        my %envelope = (
            smtp    => "$notify_ref->{smtp_server}",
            from    => "$notify_ref->{mail_from}",
            to      => "$notify_ref->{rcpt_to}",
            subject => "$message_subject . $date_now",
            #debug => 'c:\temp\zkMailFlow.log',
            multipart => 'mixed',
        );
        my %html_part = (
            ctype       => 'text/html',
            disposition => 'NONE',
            msg         => $message_body,
        );

        my $mailer = Mail::Sender->new;
        $mailer->OpenMultipart( \%envelope )
            ->Part( { ctype => 'multipart/alternative' } )
            ->Part( \%html_part )
            ->EndPart("multipart/alternative")
            ->Close();
    };
    $sent or print "Error sending mail: $Mail::Sender::Error\n";
}
# get_date
#
# Formats the current local date as YYYY-MM-DD, stores it in $date_now
# and returns it.
sub get_date {
    my ( $mday, $mon, $year ) = (localtime)[ 3, 4, 5 ];

    # localtime counts years from 1900 and months from 0.
    $date_now = sprintf "%4d-%02d-%02d", $year + 1900, $mon + 1, $mday;
    return $date_now;
}
# show_parsed_xml
#
# Parses the configuration file named by $config_name into $full_xml and
# prints the resulting structure with Data::Dumper. Dies when the file
# does not exist.
sub show_parsed_xml {
    use Data::Dumper;

    -f $config_name
        or die $config_name . $config_file_not_exist;

    $full_xml = XMLin($config_name);
    print Dumper($full_xml);
}
# validate_field
#
# Checks that every required key is present in a hash and has a non-empty
# value.
#
#   $vars_keys  array reference: the required key names
#   $vars_hash  hash reference: the record to check
#
# Dies on the first key that is missing (message built from
# $vars_name_null) or whose value is undefined or the empty string
# (message built from $vars_undef). Returns normally otherwise.
#
# The sub deliberately has no prototype: callers pass references
# explicitly, which a (\@\%) prototype would reject at compile time as
# soon as the definition became visible before the call.
sub validate_field {
    my ( $vars_keys, $vars_hash ) = @_;

    foreach my $field (@$vars_keys) {
        unless ( exists $vars_hash->{$field} ) {
            die "'" . $field . "' " . $vars_name_null . $xml_parse_failed . "\n";
        }

        my $value = $vars_hash->{$field};
        if ( !defined $value || $value eq "" ) {
            die "'" . $field . "' " . $vars_undef . $xml_parse_failed . "\n";
        }
    }
}
###################
#String definition#
###################
# init_string
#
# Fills the file-level message variables (error texts, mail subject and
# opening lines of the mail body, the "Done..." line and the usage text).
#
# The heredocs interpolate: $config_name and $module_name are expanded at
# the moment this sub runs. It is called before parse_args, so messages
# that mention the configuration file show the default name even when -f
# selects a different file.
#
# Heredoc bodies and their EOM terminators have to stay at column 0;
# indenting them would change the message text or break the terminator.
sub init_string {
    $vars_name_null = <<EOM;
must be specifyed please check your configuration file.
EOM
    $vars_undef = <<EOM;
can not be null, please correct your $config_name files.
EOM
    $config_file_not_exist = <<EOM;
is not exist please create it first.
EOM
    $xml_null = <<EOM;
XML file can not be null, parsing falied!
EOM
    $base_url_empty = <<EOM;
base_url can not be null, please check your configuration file - $config_name.
EOM
    $no_server_exist = <<EOM;
No servers need to be parsed.!
EOM
    $xml_parse_failed = <<EOM;
XML parsing falied!
EOM
    $wget_not_exist = <<EOM;
wget is not exist please install it first.
EOM
    $wget_failed = <<EOM;
wget failed to execute please follow the default XML configuration setting.
EOM
    # Single quotes on purpose: "@trip" must not be read as an array.
    $message_subject = '@trip service MRTG Report';
    $message_body = <<EOM;
Dear all,
Below is the RTG Report for a-trip Service:
EOM
    $done = <<EOM;
Done...
EOM
    $help = <<EOM;
usage: $module_name / $module_name [options] filename
Standard $module_name options:
-e execute : executing with default configuration file.
(with this other arguments will be ignored.)
-f filename : specifying configuration file to be parsed
(The default configuration is named to webcatcher.xml, with
this paramater to alternatively specify the configuration file.)
-v view XML : viewing default XML content
-h help : dumpping help message
EOM
}