J
Jay
I'm trying to get HTML:arser to strip out the comments using some of the
sample code from the man page. I'm using the ignore_elements and I still
get comments in the dtext. Am I doing something wrong?
Tia,
Jay
CODE:
use HTML:arser ();
# Create parser object
$p = HTML:arser->new( api_version => 3,
start_h => [\&start, "tagname, attr"],
end_h => [\&end, "tagname"],
comment_h => [\&comment, "self,text"],
text_h => [\&dtext, "self,text"],
marked_sections => 1,
);
$p->ignore_elements( qw(script, comment, style) );
$p->strict_comment( [1] );
# Parse directly from file
$p->parse_file("0");
sub start {
my($self, $tagname, $attr, $attrseq, $origtext) = @_;
#...
}
sub end {
my($self, $tagname, $origtext) = @_;
#...
}
sub text {
my($self, $origtext, $is_cdata) = @_;
#...
}
sub comment{
#my($self, $origtext, $is_cdata) = @_;
#...
}
sub dtext {
my($self, $dtext ) = @_;
$dtext=~s/\s+/ /g;
print "DTEXT: $dtext\n";
}
Example of some of the output from parsing some web page:
DTEXT: <!-- /* You may give each page an identifying name, server, and
channel on the next lines. */ var s_pageName="buy"; var s_server="CWEB15";
var s_channel="buy"; var s_pageTyp
e=""; var s_prop1="Autoweb Direct to Site"; var s_prop2="Autoweb Direct to
Site 10714"; var s_prop3=""; var s_prop4=""; var s_prop5=""; var s_prop6="";
var s_prop7="buy|"; var s_pr
op8=""; var s_prop9="buy|Autoweb Direct to Site|10714"; var s_prop10="buy|";
var s_prop11="Autoweb Direct to Site|10714|taweb"; var s_prop12="||"; var
s_prop13="||||||buy||No"; var
s_prop14="Autoweb Direct to Site|10714|taweb|||||buy||No"; var s_prop15="No
Article|No Article"; var s_prop16=""; var s_prop17=""; var s_prop18="Autoweb
Direct to Site|10714|buy";
var s_prop19="Autoweb Direct to Site|10714||buy"; var
s_prop20="buy||||sky|ban|Autoweb Direct to Site"; /* E-commerce Variables */
var s_campaign="10714"; var s_state=""; var s_zi
p=""; var s_events=""; var s_products=""; var s_purchaseID=""; var
s_eVar1="Autoweb Direct to Site"; var s_eVar2="Autoweb Direct to Site
10714"; var s_eVar3="NT-sky-ban"; var s_eVa
r4=""; var s_eVar5=""; /********* INSERT THE DOMAIN AND PATH TO YOUR CODE
BELOW ************/ /********** DO NOT ALTER ANYTHING ELSE BELOW THIS LINE!
*************/ var s_code=' '/
/-->
DTEXT:
DTEXT:
sample code from the man page. I'm using the ignore_elements and I still
get comments in the dtext. Am I doing something wrong?
Tia,
Jay
CODE:
use HTML:arser ();
# Create parser object
$p = HTML:arser->new( api_version => 3,
start_h => [\&start, "tagname, attr"],
end_h => [\&end, "tagname"],
comment_h => [\&comment, "self,text"],
text_h => [\&dtext, "self,text"],
marked_sections => 1,
);
$p->ignore_elements( qw(script, comment, style) );
$p->strict_comment( [1] );
# Parse directly from file
$p->parse_file("0");
sub start {
my($self, $tagname, $attr, $attrseq, $origtext) = @_;
#...
}
sub end {
my($self, $tagname, $origtext) = @_;
#...
}
sub text {
my($self, $origtext, $is_cdata) = @_;
#...
}
sub comment{
#my($self, $origtext, $is_cdata) = @_;
#...
}
sub dtext {
my($self, $dtext ) = @_;
$dtext=~s/\s+/ /g;
print "DTEXT: $dtext\n";
}
Example of some of the output from parsing some web page:
DTEXT: <!-- /* You may give each page an identifying name, server, and
channel on the next lines. */ var s_pageName="buy"; var s_server="CWEB15";
var s_channel="buy"; var s_pageTyp
e=""; var s_prop1="Autoweb Direct to Site"; var s_prop2="Autoweb Direct to
Site 10714"; var s_prop3=""; var s_prop4=""; var s_prop5=""; var s_prop6="";
var s_prop7="buy|"; var s_pr
op8=""; var s_prop9="buy|Autoweb Direct to Site|10714"; var s_prop10="buy|";
var s_prop11="Autoweb Direct to Site|10714|taweb"; var s_prop12="||"; var
s_prop13="||||||buy||No"; var
s_prop14="Autoweb Direct to Site|10714|taweb|||||buy||No"; var s_prop15="No
Article|No Article"; var s_prop16=""; var s_prop17=""; var s_prop18="Autoweb
Direct to Site|10714|buy";
var s_prop19="Autoweb Direct to Site|10714||buy"; var
s_prop20="buy||||sky|ban|Autoweb Direct to Site"; /* E-commerce Variables */
var s_campaign="10714"; var s_state=""; var s_zi
p=""; var s_events=""; var s_products=""; var s_purchaseID=""; var
s_eVar1="Autoweb Direct to Site"; var s_eVar2="Autoweb Direct to Site
10714"; var s_eVar3="NT-sky-ban"; var s_eVa
r4=""; var s_eVar5=""; /********* INSERT THE DOMAIN AND PATH TO YOUR CODE
BELOW ************/ /********** DO NOT ALTER ANYTHING ELSE BELOW THIS LINE!
*************/ var s_code=' '/
/-->
DTEXT:
DTEXT: