Hmm, sorry, I have not made myself clear.
Here is the site I am using; it has all of the pages to be scraped:
http://www.ozeform.com/Default.aspx?pageid=e2632b85-fe1d-40fb-96b1-59d9800646e5&seldate=20080705
This is the script:
#!/usr/bin/perl
# Web8.pl
# last updated 24/06/2007
# Purpose
# To extract horse result details for each race at a nominated race meet
# The ozeform site has an archive of race meetings going back several years
# by date by meeting location. By submitting the date and the meet name
# the results for each race and each horse can be retrieved in html format
# The retrieved data is then processed to csv format for use by Excel or
# MS Access
use diagnostics;
use sigtrap;
use strict;
use warnings;
use LWP::Simple;
# ---------------------------------------------------------------------------
# Main line.
# Command line: <race date CCyymmdd> <meet name> [meet type R|T|G]
# Flow: validate the arguments, download the results page into web5.txt,
# then parse that file and append one csv record per race to web5.csv.
# ---------------------------------------------------------------------------
print "\t$0 Starting ", scalar localtime, " \n";
print "\tOperating System\t$^O\n";
print "\tPerl version \t$] \n";

# Settings and state shared (as file-level lexicals) with the subs below
my $meet_code    = "R";          # Meet type written to each record: R, T or G
my $file_name_1  = 'web5.txt';   # Output file name
my $file_name_2  = 'web5.csv';   # Output excel file
my $item_num     = 4;            # the element number within the result array
my $race_num     = 1;
my $result_count = 0;
my @result;
my $suffix;
my $test         = 0;            # 0 is the default and provides minimum output
                                 # 1 will provide extended displays for
                                 #   debugging any problems
                                 # 2 makes store_data echo each stored value

# Win32::FsType() only exists on Windows builds of Perl, so load the module
# and call it only there; elsewhere report the drive type as unknown instead
# of dying with "Undefined subroutine".
my $fstype = 'unknown';
if ($^O eq 'MSWin32')
{
    require Win32;
    $fstype = Win32::FsType();
}
print "\tCurrent Active Drive Type is $fstype\n";

# First argument: the meet date
my $race_date = shift;           # format CCyymmdd 20070117
unless (defined $race_date)
{
    print "\n\tPROBLEM: Race Meet Date is missing \n";
    usage();
}
# Keep the CCyymmdd form for the URL; validate_meet_date() rewrites
# $race_date as dd/mm/yyyy for the csv output.
my $meet_date = $race_date;
validate_meet_date();

# Second argument: the meet name
my $meet_name = shift;
unless (defined $meet_name)
{
    print "\n\tPROBLEM: Race Meet Name is missing \n";
    usage();
}
$meet_name = uc($meet_name);

# Third (optional) argument: the meet type.
# Upper-cased like the meet name so that "g" and "t" are accepted as well.
my $meet_type = shift;
$meet_type = "R" unless (defined($meet_type));
$meet_type = uc($meet_type);

# Thoroughbred Harness(Trotter) or Greyhound
if ($meet_type eq "T")
{   # Trotter
    $suffix    = '&grt=t';
    $meet_code = $meet_type;
}
elsif ($meet_type eq "G")
{   # Greyhound
    $suffix    = '&grt=g';
    $meet_code = $meet_type;
}
else
{   # Thoroughbred (also the fallback for any unrecognised type)
    $suffix    = '&grt=r';
    $meet_code = "R";
}

# Save the Meet Name and Date into the result array
$result[0] = $meet_name;
$result[1] = $race_date;

# Scrape the web page and store it in web5.txt file for further processing
get_url();
print "\tGetting $meet_name results for the meet held on $race_date ... \n";
open_file_1();   # Open the web5.txt file which holds the retrieved scraped web page
open_file_2();   # web5.csv
filter_results();
close_file_1();
close_file_2();
sub filter_results
{
    # Read the saved page line by line from the INF handle and collect the
    # interesting values into @result:
    #   $result[2]   race number
    #   $result[3]   race name
    #   $result[4..] placings, horse numbers, horse names and payouts, in
    #                the order they appear (appended by store_data)
    # A csv record is written by write_record() each time the race number
    # changes, and once more at end of file.
    # Takes no parameters; uses $_ for the current line, which
    # display_meet_name() also reads.
    while (<INF>)
    {
        $_ =~ s/^\s+//;
        chomp;
        # If record starts with
        #   Name
        #   Race
        #   <span
        #   <td class
        # then we check further for selection
        if (/^name:/i)
        {   # Race name
            print "$_\n" if ($test == 1);
            if (m!^name:.nbsp.([A-Z0-9 \-'&/.]+)<!i)
            {
                print "$.\tRace Name :\t$1 \n" if ($test == 1);
                $result[3] = $1;
                next;
            }
        }   # End race name
        if (/^Race/i)
        {   # Meet name and date
            display_meet_name();
            next;
        }   # End meet name
        if (/^<span id/i)
        {   # Winner place price
            next if (/pbuc8/i);
            next if (/pbuc9/i);
            if (m!^<span id.+>([0-9.]*)<!i)
            {
                print "$.\tWinner place :\t$1\n" if ($test == 1);
                print "$_\n" if ($test == 1);
                store_data(1);
                next;
            }
        }   # End Winner place price
        if (/^<td class/i)
        {   # Place Number Horse name and payout for Super NSW UNI STAB
            next unless (/Row/i);
            next if (/Row2/i);
            print "$_ \n" if ($test == 1);
            if (m!^<td class.+Row1.+>Race no:.nbsp.(\d+).nbsp.-.nbsp!i)
            {   # Race number
                # Copy the capture straight away so it cannot be disturbed
                # by anything the subs called below do with regexes.
                my $number = $1;
                print "$.\tRace number :\t$number \n" if ($test == 1);
                if ($number ne $race_num)
                {   # If it is the next race number
                    write_record();
                    # Drop the per-runner fields of the race just written;
                    # otherwise a shorter following race would inherit the
                    # tail of this one. Elements 0..3 are kept.
                    splice(@result, 4) if (@result > 4);
                    $item_num = 4;
                    $race_num = $number;
                }
                $result[2] = $number;
                next;
            }   # End race number
            if (m!^<td class.+Row..>([1-3dnrst]+)<\/td>!i)
            {   # Horse result for 1st 2nd or 3rd
                print "$.\tHorse result :\t$1\n" if ($test == 1);
                store_data(1);
                next;
            }   # End horse result
            if (m!^<td class.+Row....+>(\d+)<\/span><\/td>!i)
            {   # Horse number
                print "$.\tHorse Number :\t$1\n" if ($test == 1);
                store_data(1);
                next;
            }   # End horse number
            if (m!^<td class.+Row.+Name..>([A-Z '\&]+)<!i)
            {   # Horse name
                print "$.\tHorse name :\t$1\n" if ($test == 1);
                store_data(1);
                next;
            }   # End horse name
            # The character class is [0-9.A-Z] in all three patterns. The
            # first two used to read A_Z, which cannot match the NPP / NTD
            # markers that store_data() converts to 0.00.
            if ((m!^<td class.+Row.+first.+>([0-9.A-Z]*)<\/span>!i) ||
                (m!^<td class.+Row.+econd.+>([0-9.A-Z]*)<\/span>!i) ||
                (m!^<td class.+Row.+third.+>([0-9.A-Z]*)<\/span>!i))
            {   # Result payout
                print "$.\tResult payout :\t$1\n" if ($test == 1);
                store_data(1);
                next;
            }   # End result payout
            # print "$_\n";
        }   # End td class
    }   # End while record processing
    # Flush the last race collected
    write_record();
}   # End of sub filter_results
sub display_meet_name
{
    # Debug helper: looks in the current line ($_) for the meeting name and
    # date and, when $test is 1, prints them prefixed by the input line
    # number. Nothing is stored.
    return unless (m!.+Name.>([A-Za-z ]+)<.+Date.>([0-9A-Za-z ]+)<!i);
    my ($venue, $held_on) = ($1, $2);
    if ($test == 1)
    {
        print "$.\tMeeting held at $venue on $held_on \n";
    }
}   # End of sub display_meet_name
sub get_url
{   # Scrape the web page and store it in web5.txt file for further processing
    #
    # Builds the results URL from $meet_date, $meet_name and $suffix,
    # downloads it with LWP::Simple::get and writes the body to $file_name_1.
    # Takes no parameters. Dies if the download returns nothing or the file
    # cannot be opened, written or closed.
    my $criteria = '&track=';
    # These literals must not contain line breaks, or the URL is corrupted
    my $scheme   = 'http://';
    my $server   = 'www.ozeform.com/';
    my $prefix   =
        'Default.aspx?pageid=e2632b85-fe1d-40fb-96b1-59d9800646e5&seldate=';
    # Sample page ref
    # http://www.ozeform.com/Default.aspx...d9800646e5&seldate=20070310&track=ASCOT&grt=r
    my $url =
        $scheme.$server.$prefix.$meet_date.$criteria.$meet_name.$suffix;
    print "\tSearching $url ...\n";
    # get() returns undef on failure and does not set $!, so report the URL
    my $html = get($url)
        or die "\tget_url : No reply from server for $url";
    print "\tReceived ", length($html), " bytes of data\n";
    print "\tOpening file $file_name_1 for output \n";
    open my $out, ">", $file_name_1
        or die "\tget_url : Cannot open file $file_name_1 : $!";
    print {$out} $html
        or die "\tget_url : Cannot write to $file_name_1 : $!";
    print "\tClosing file $file_name_1 \n";
    # "or", not "||": with "||" the die attached to the handle name and a
    # failed close (e.g. disk full) went unnoticed.
    close $out
        or die "\tget_url : Cannot close file $file_name_1 : $!";
    # print "$html \n" if ($test == 2);
}   # End of sub get_url
sub store_data
{   # Store the matched data into the result array
    #
    # Parameter: 1 or 2, selecting capture group $1 or $2 of the CALLER's
    # most recent successful pattern match. Call it straight after the match.
    # The value is appended at $result[$item_num] and $item_num is advanced.
    # Empty or missing captures and the markers NPP / NTD (any letter case)
    # are stored as '0.00'.
    # Dies on a missing or invalid parameter.
    my $select = shift;
    unless (defined($select))
    {
        print "\tstore_data: Parameter missing from sub call\n";
        die "\tstore_data: Internal program error : $!";
    }
    unless ($select == 1 || $select == 2)
    {
        print "store_data: Invalid sub call parameter $select\n";
        die "store_data: Internal program error : $!";
    }
    # Take a copy of the capture first
    my $data = ($select == 1) ? $1 : $2;
    # A group that did not take part in the match is undef; treat it like
    # an empty capture rather than storing undef.
    $data = '' unless (defined($data));
    my $marker = uc($data);
    if (length($data) == 0 || $marker eq "NPP" || $marker eq "NTD")
    {
        $data = '0.00';
    }
    print " $data " if ($test == 2);
    $result[$item_num++] = $data;
}   # End of sub store_data
sub write_record
{   # Extract the data from the result array and reformat to csv
    #
    # Writes one line to the OUF handle: the meet code followed by every
    # element of @result, each wrapped in double quotes and separated by
    # commas. Increments $result_count. Takes no parameters.
    # Dies if the line cannot be written.
    $result_count++;
    my @fields;
    for my $value ($meet_code, @result)
    {
        # An unset slot becomes an empty field
        my $text = defined($value) ? $value : '';
        # Double any embedded quote so the field stays valid csv
        $text =~ s/"/""/g;
        push @fields, '"'.$text.'"';
    }
    # "or", not "||": with "||" the die attached to the "\n" argument and a
    # failed write went unnoticed.
    print OUF join(',', @fields), "\n"
        or die "\twrite_record : Cannot write to file : $!";
}   # End of sub write_record
sub open_file_1
{
    # Open the input file
    # Opens $file_name_1 for reading on the INF handle; dies if it cannot.
    print "\tOpening file $file_name_1 for input \n";
    # "or", not "||": with "||" the die attached to the (always true) file
    # name, so a failed open went unnoticed.
    open(INF, "<", $file_name_1)
        or die "open_file_1 : Cannot open file $file_name_1 : $! ";
}   # End of sub open_file_1
sub open_file_2
{
    # Open the output file
    # Opens $file_name_2 in append mode on the OUF handle; dies if it cannot.
    print "\tOpening file $file_name_2 for output \n";
    # "or", not "||": with "||" the die attached to the (always true) file
    # name, so a failed open went unnoticed.
    open(OUF, ">>", $file_name_2)
        or die "open_file_2 : Cannot open file $file_name_2 : $! ";
}   # End of sub open_file_2
sub close_file_1
{
    # Report the number of lines read ($.) and close the INF handle.
    # Dies if the close fails.
    print "\n\n\tClosing input file $file_name_1 $. records\n";
    # "or", not "||": with "||" the die attached to the handle name and
    # could never run.
    close(INF) or die "close_file_1 : Can't close $file_name_1: $!";
}   # End of sub close_file_1
sub close_file_2
{
    # Report the number of records written ($result_count) and close the
    # OUF handle. Dies if the close fails, which is where buffered write
    # errors show up.
    print "\tClosing output file $file_name_2 $result_count records\n";
    # "or", not "||": with "||" the die attached to the handle name and
    # could never run.
    close(OUF) or die "close_file_2 : Can't close $file_name_2 : $!";
}   # End of sub close_file_2
sub validate_meet_date
{
    # Check that the meet date submitted is valid
    # ccyymmdd
    # Re format the date to dd/mm/yyyy
    #
    # Reads and rewrites the file-level $race_date. Takes no parameters.
    # Each problem found prints a numbered ERROR line; if there was any,
    # usage() is called, which ends the program.
    my $error_code = 0;

    # The date must be exactly 8 digits. Longer or non-numeric input used
    # to slip through here.
    if (length($race_date) < 8)
    {
        $error_code = 1;
        print "\n\tERROR $error_code: Meet date is less than 8 digits\n";
    }
    elsif ($race_date !~ /\A[0-9]{8}\z/)
    {
        $error_code = 1;
        print "\n\tERROR $error_code: Meet date must be exactly 8 digits\n";
    }
    # The field checks below are meaningless on a malformed string
    usage() if ($error_code != 0);

    my $century   = substr($race_date, 0, 2);
    my $year      = substr($race_date, 2, 2);
    my $cal_month = substr($race_date, 4, 2);
    my $month_day = substr($race_date, 6, 2);

    if (int($century) != 20)
    {
        $error_code = 2;
        print "\n\tERROR $error_code: Incorrect Century number\n";
    }
    # Year 00 is rejected, as it always has been
    if (int($year) < 1 || int($year) > 99)
    {
        $error_code = 3;
        print "\n\tERROR $error_code: Incorrect year number\n";
    }
    my $month_ok = (int($cal_month) >= 1 && int($cal_month) <= 12);
    unless ($month_ok)
    {
        $error_code = 4;
        print "\n\tERROR $error_code: Incorrect month number\n";
    }

    # Upper bound for the day: the real length of the month when the month
    # is usable, otherwise the generic limit of 31.
    my $max_day = 31;
    if ($month_ok)
    {
        my @days_in_month = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31);
        my $full_year = int($century) * 100 + int($year);
        my $is_leap   = ($full_year % 4 == 0 && $full_year % 100 != 0)
                        || $full_year % 400 == 0;
        $max_day = $days_in_month[int($cal_month) - 1];
        $max_day = 29 if (int($cal_month) == 2 && $is_leap);
    }
    if (int($month_day) < 1 || int($month_day) > $max_day)
    {
        $error_code = 5;
        print "\n\tERROR $error_code: Incorrect day number\n";
    }

    $race_date = $month_day.'/'.$cal_month.'/'.$century.$year;
    usage() if ($error_code != 0);
}   # End of sub validate_meet_date
sub usage
{
    # Operating instructions
    # Prints the command line help and then ends the program with die.
    # The script name comes from $0 so the text always matches the file
    # actually being run. The optional third argument is documented too.
    my $usage = <<"END_OF_USAGE";
USAGE:
perl $0 [race meeting date] [race meeting name] [meet type]
Race meeting date is an 8 digit date in the format CCyymmdd
Example: 20070117 for 17th January 2007
Race meeting name is alphabetic with no numbers or special characters
Example: COLAC NEW ZEALAND ASCOT WARRNAMBOOL
Meet type is optional: R for thoroughbred (the default), T for trotter
or G for greyhound
END_OF_USAGE
    print "$usage\n\n";
    die;
}   # End of sub usage
# End-of-run reporting. In debug mode ($test == 1) the whole csv file is
# echoed to the screen; the closing summary lines are always printed.
if ($test == 1)
{   # Print web5.csv for checking
    # "or", not "||": with "||" the die attached to the file name / handle,
    # so a failed open or close went unnoticed. A lexical handle is used so
    # the shared INF handle is left alone.
    open(my $csv, "<", $file_name_2)
        or die "Cannot open file $file_name_2 : $! ";
    while (my $line = <$csv>)
    {
        print $line;
    }
    close($csv) or die "Cannot close file $file_name_2 : $!";
}   # End of print web5.csv
my $num = @result;
print "\tNumber of items in result array is $num\n";
print "\t$0 Ended at ", scalar localtime, " \n";
I just need it to be automated, really;
in other words, it should go through each venue automatically.
Regards
Graham