2008年8月12日 星期二

Programming for Administration

概述:原本考慮使用現有的商業套件(Teleport)來處理工作上每日要進行的查詢及登載作業,但要存取的頁面需要登入並使用 Cookies,且須透過 session ID 才能持續存取各頁面;無奈對該商業軟體不熟悉,不知如何設定,只好消滅自己的腦細胞,自行撰寫相關的程式碼。


方式:透過 Perl 包裝 wget,自動執行每日繁瑣的數據擷取作業,並將結果儲存在特定主機;再使用 cron job 定期執行本程式。


#!/usr/bin/perl
#
# webcatcher.pl
#
# $Id: webcatcher.pl,v 1.1 2008/08/12 08:24:28 kent Exp $
#
# This script is used to crawl web pages. During execution it wraps
# wget as the grabber. It can help you to grab web pages regularly
# and send a notification email to the appropriate person. You are
# able to modify it under the same terms as Perl.


# Copyright(C)2008 Kent Hsu
#
#


use strict;
use Getopt::Std;
use XML::Simple;
use Mail::Sender;


###################
# ENV definition  #
###################
my $module_name = 'webcatcher.pl';     # script name shown in the usage text
my $config_name = 'webcatcher.xml';    # default configuration file (see -f)

# Both values come straight from backticks, so they still carry the trailing
# newline of the command output.
my $wget_command  = `which wget`;
my $file_home_dir = `pwd`;

# Filled in later: $base_url by parse_from_xml, $date_now by get_date.
my ( $base_url, $date_now );


###################
#String definition#
###################
# Message texts; every one of them is assigned by init_string.
my ( $help, $wget_not_exist, $wget_failed, $config_file_not_exist );
my (
    $vars_name_null,  $vars_undef,      $xml_parse_failed,
    $xml_null,        $base_url_empty,  $no_server_exist,
    $message_subject, $message_body,    $done,
);


####################
# member variables #
####################
my $full_xml;                       # parsed configuration (XMLin result)
my @servers;                        # sorted server names from the config
my ( $notify_ref, $server_ref );    # <notification> / <server> sections
my %server_detail;                  # settings of the server being processed


# The message texts must exist before any later step can die with one of them.
init_string();


# Program flow: read the switches, load the configuration, fetch the pages,
# mail the report, then print the closing message.
MAIN: {
    parse_args();
    parse_from_xml();
    main_proc();
    send_notification();

    print $done;
}    # End of MAIN:


# Parse the command-line switches and act on them.
#
#   -e        run with the default configuration file (checked first, so any
#             other switch given with it is ignored)
#   -f FILE   use FILE as the configuration file
#   -v        dump the parsed default configuration, then exit
#   -h        print the usage text, then exit
#
# An unknown switch, or no switch at all, dies with the usage text.
# Explicit -h / -v requests are not errors, so they print to STDOUT and
# exit with status 0 instead of going through die.
sub parse_args {
    # Package variable, exactly what the obsolete `use vars qw/%opt/` declared.
    our %opt;

    getopts( "hevf:", \%opt ) or die $help;

    if ( $opt{e} ) {
        # Nothing to prepare: the default configuration file is used as is.
        return;
    }

    # Test for a non-empty value rather than truth so that a file that is
    # literally named "0" is accepted.
    if ( defined $opt{f} && length $opt{f} ) {
        $config_name = $opt{f};
        return;
    }

    if ( $opt{v} ) {
        show_parsed_xml();
        print $done;
        exit 0;
    }

    if ( $opt{h} ) {
        print $help;
        exit 0;
    }

    die $help;
}


# Return true when a configuration flag holds the literal string "true".
sub _flag_is_true {
    my ($flag) = @_;
    return defined $flag && $flag eq 'true';
}

# Read one field ("value" or "postdata") of a URL item. An item is either a
# hash reference carrying those fields or a plain string that is the URL.
sub _url_item_field {
    my ( $item, $field ) = @_;
    return $item->{$field} if ref($item) eq 'HASH';
    return $field eq 'value' ? $item : undef;
}

# Fetch every configured URL of every server with wget and add one entry per
# server to the notification mail body.
#
# Reads the file-level @servers and $server_ref, keeps the file-level
# %server_detail set to the server currently being processed, and appends
# to $message_body. Takes no arguments.
sub main_proc {
    foreach my $server_name (@servers) {
        %server_detail = %{ $server_ref->{$server_name} };

        my $today  = get_date();
        my $params = ' -nH -nd --no-check-certificate ';

        # Downloads go to <save_at>/<YYYY-MM-DD>.
        $params .= "-P" . $server_detail{save_at} . "/$today";
        $params .= _flag_is_true( $server_detail{deeplyclaw} )
            ? " -r -l 0 "
            : " -p ";

        if ( _flag_is_true( $server_detail{usepassword} ) ) {
            $params .= ' --http-user=' . $server_detail{userid}
                . ' --http-passwd=' . $server_detail{password};
        }

        # {url}{item} is either a list of items or one single item.
        my $items     = $server_detail{url}{item};
        my $is_list   = ref($items) eq 'ARRAY';
        my @url_items = $is_list ? @$items : ($items);

        # Cookie handling only ever applied to the multi-item form.
        my $use_cookie   = $is_list && _flag_is_true( $server_detail{usecookie} );
        my $cookie_saved = 0;

        foreach my $item (@url_items) {
            my $cookie_string = '';
            if ($use_cookie) {
                if ( !$cookie_saved ) {
                    # First request of the server creates the session cookie.
                    $cookie_string = " --cookies=on --keep-session-cookies --save-cookies=cookie.txt ";
                    $cookie_saved  = 1;
                }
                else {
                    # Later requests reuse it and name the first URL as referer.
                    $cookie_string = " --referer="
                        . _url_item_field( $url_items[0], 'value' )
                        . " --cookies=on --keep-session-cookies --load-cookies=cookie.txt ";
                }
            }

            my $options   = $params . $cookie_string;
            my $post_data = _url_item_field( $item, 'postdata' );
            if ($post_data) {
                # Leading blank: $options may end in the password value.
                $options .= " --post-data '" . $post_data . "' ";
            }

            exec_wget( $options, _url_item_field( $item, 'value' ) );
        }

        $message_body .= "\n\n$server_name\n\nClick me to view MRTG Graphy.\n\n";
    }    # End of foreach (@servers)

    return;
}






# Run wget once and echo whatever it wrote to STDOUT.
#
# Parameters:
#   $args - option string placed between the wget binary and the URL
#   $url  - URL to fetch
#
# Returns 1 when wget exited with status 0, otherwise 0.
# Dies with $wget_not_exist when the wget binary cannot be found.
# A failed download only warns (using $wget_failed), so the remaining
# URLs and servers are still processed.
#
# NOTE(review): $args and $url are interpolated into a shell command line
# unquoted, so a URL containing shell metacharacters (for example "&") must
# already be quoted in the configuration file - confirm against real configs.
sub exec_wget {
    my ( $args, $url ) = @_;

    # The path comes from `which wget` and may still end in a newline.
    chomp $wget_command;

    if ( !-f $wget_command ) {
        die $wget_not_exist;
    }

    my $cmd_line = " " . $args . " " . $url;

    my $output = `$wget_command $cmd_line`;
    my $status = $?;    # save before print can disturb it
    print $output . "\n";

    if ( $status != 0 ) {
        warn "wget returned status $status for $url\n" . $wget_failed;
        return 0;
    }

    return 1;
}


# Load the XML configuration file and publish its parts in the file-level
# variables $full_xml, $base_url, $notify_ref, $server_ref and @servers.
#
# Dies when the file is missing, when the parsed document is empty, when
# base_url is not set, when no server is configured, or when a server lacks
# one of the mandatory fields (checked by validate_field).
sub parse_from_xml {
    unless ( -f $config_name ) {
        die $config_name . $config_file_not_exist;
    }

    $full_xml = XMLin($config_name);
    unless ($full_xml) {
        die $xml_null;
    }

    # Plain arrow dereference: the old `%$ref->{key}` form (a hash used as a
    # reference) is a fatal error since Perl 5.22.
    $base_url = $full_xml->{base_url};
    unless ($base_url) {
        die $base_url_empty;
    }

    $notify_ref = $full_xml->{notification};
    $server_ref = $full_xml->{server};

    # Guard against a missing or non-hash <server> section before taking keys.
    @servers = ref($server_ref) eq 'HASH' ? sort keys %$server_ref : ();
    unless (@servers) {
        die $no_server_exist;
    }

    # Fields every server entry has to provide.
    my @all_models = qw(save_at requirehttps usepassword userid password usecookie url);
    foreach my $server_name (@servers) {
        validate_field( \@all_models, \%{ $server_ref->{$server_name} } );
    }

    return;
}


# Mail the collected report ($message_body) as an HTML part to the recipient
# configured in the <notification> section ($notify_ref).
#
# Sending is best effort: a failure is reported on STDOUT and never dies.
sub send_notification {
    # Without the on_errors => 'die' option Mail::Sender methods hand back a
    # negative error code on failure. Calling the next chained method on such
    # a number dies, which the eval catches; a negative code from the final
    # Close() is caught by the explicit check below.
    my $result = eval {
        Mail::Sender->new()
            ->OpenMultipart({
                smtp      => "$notify_ref->{smtp_server}",
                from      => "$notify_ref->{mail_from}",
                to        => "$notify_ref->{rcpt_to}",
                # The old subject string contained a literal " . " because
                # the concatenation operator sat inside the quotes.
                subject   => "$message_subject $date_now",
                multipart => 'mixed',
            })
            ->Part({ ctype => 'multipart/alternative' })
            ->Part({ ctype => 'text/html', disposition => 'NONE', msg => $message_body })
            ->EndPart("multipart/alternative")
            ->Close();
    };
    my $eval_error = $@;    # copy at once, $@ is easily clobbered

    my $failed = !defined $result
        || !$result
        || ( !ref $result && $result =~ /\A-\d+\z/ );

    if ($failed) {
        my $reason = $Mail::Sender::Error || $eval_error || 'unknown error';
        print "Error sending mail: $reason\n";
    }

    return;
}


# Build today's local date as "YYYY-MM-DD", remember it in the file-level
# $date_now and return it.
sub get_date {
    my ( $mday, $mon, $year ) = ( localtime() )[ 3, 4, 5 ];

    # localtime counts years from 1900 and months from 0.
    $date_now = sprintf( "%4d-%02d-%02d", $year + 1900, $mon + 1, $mday );

    return $date_now;
}


# Debug aid behind the -v switch: parse the configuration file into the
# file-level $full_xml and print the resulting structure with Data::Dumper.
# Dies when the configuration file does not exist.
sub show_parsed_xml {
    use Data::Dumper;

    die $config_name . $config_file_not_exist unless -f $config_name;

    $full_xml = XMLin($config_name);
    print Dumper($full_xml);
}


# Check that every mandatory field is present and non-empty.
#
# Parameters:
#   $vars_keys - array reference holding the mandatory field names
#   $vars_hash - hash reference holding the values to check
#
# Dies with a message that starts with the quoted field name when a field is
# missing ($vars_name_null) or is undefined / an empty string ($vars_undef).
#
# The former (\@\%) prototype was dropped: callers pass references, which is
# exactly what that prototype rejects once it is visible at a call site.
sub validate_field {
    my ( $vars_keys, $vars_hash ) = @_;

    foreach my $field (@$vars_keys) {
        unless ( exists $vars_hash->{$field} ) {
            die "'" . $field . "' " . $vars_name_null . $xml_parse_failed . "\n";
        }

        my $value = $vars_hash->{$field};
        if ( !defined $value || $value eq "" ) {
            die "'" . $field . "' " . $vars_undef . $xml_parse_failed . "\n";
        }
    }

    return;
}


###################
#String definition#
###################


# Assign every user-visible message text to its file-level variable.
#
# The texts are interpolating heredocs, so $config_name and $module_name are
# expanded with the values they hold when this sub runs. Each heredoc opener
# must be the full `<<"EOM";` form: a bare `<` starts a <> operator and the
# file no longer compiles. Takes no arguments and returns nothing.
sub init_string {
    $vars_name_null = <<"EOM";
must be specifyed please check your configuration file.
EOM

    $vars_undef = <<"EOM";
can not be null, please correct your $config_name files.
EOM

    # Appended directly to the file name, hence the leading blank.
    $config_file_not_exist = <<"EOM";
 is not exist please create it first.
EOM

    $xml_null = <<"EOM";
XML file can not be null, parsing falied!
EOM

    $base_url_empty = <<"EOM";
base_url can not be null, please check your configuration file - $config_name.
EOM

    $no_server_exist = <<"EOM";
No servers need to be parsed.!
EOM

    $xml_parse_failed = <<"EOM";
XML parsing falied!
EOM

    $wget_not_exist = <<"EOM";
wget is not exist please install it first.
EOM

    $wget_failed = <<"EOM";
wget failed to execute please follow the default XML configuration setting.
EOM

    # Single quotes on purpose: '@trip' must not be treated as an array.
    $message_subject = '@trip service MRTG Report';

    # Opening of the mail body; one entry per server is appended later.
    $message_body = <<"EOM";
Dear all,


Below is the RTG Report for a-trip Service:


EOM

    $done = <<"EOM";


Done...
EOM

    $help = <<"EOM";
usage: $module_name / $module_name [options] filename


Standard $module_name options:
-e execute  : executing with default configuration file.
             (with this other arguments will be ignored.)
-f filename : specifying configuration file to be parsed
             (The default configuration is named to webcatcher.xml, with 
             this paramater to alternatively specify the configuration file.)
-v view XML : viewing default XML content
-h help     : dumpping help message
EOM

    return;
}