R
robic0
Version 0.1000
Here it is. About %85 complete.
Not quite there yet. I know how to finish it but too lazy
right now. I may get a hair up my butt and finish it in a 2
day timeframe.
Whats not finished yet:
- Entities.. (I don't care to explain why, has placeholder)
- Attlist.. (same, has placeholder
- Dtd.. (same, matter of what I'm going to do in the way of a combined html package or not)
- Encoding (same, I'm %90 on this subject, but looking at the %10 for overall integration)
- other misc.. (same, its in my head)
- Class/package interface.. (I haven't digested/read the total bullshit on this yet, have to cut aside a day for this)
The parse framework for Xml/Xhtml is completely there. The standard used is Xml 1.1
The code, though incomplete, is %100 compliant.
This is hard to benchmark because it depends on the entities and complexity. In general,
I used a very complex html file for harcore numbers that runs about 340k in size.
Actually the most complex html file I could find. Xml will be significantly faster.
A 380k file parsed in .25 second with debug turned off. Of course I have a 3 drive
Hitachi 750 gig (total) raid 0 array on a Dfi SLI-DR, Amd 3700+ San Diego, overclocked to 3ghz, 620 ddr.
I haven't tested it but this might be faster than the current dll (C) alternatives.
It dies with my custum, accurate, full error reporting, exact line/column numbers. Just like the big boys.
The regexp's are designed with speed in mind. There is no faster a regexp processor can go in an all Perl
solution.
The nice part is that its open. This is just the core. Its not complicated. Customization can occurr within
the module itself (hey u gonna customize my code?) in reference to not only handlers but the core itself.
I know VanderDick is going to steel it. I will work on completeness in my idea of what it could be. My idea
will always be better than anybody elses out there, thats why I'm posting it here.
If you don't think this is of any use, you may have been kicked by a mule at one point in your life.
I don't care to answer why I'm posting it. Lifes too short I guess, could drop dead tommorow...
(Hey but some of you are definetly 'not worthy')
use strict;
use warnings;
$|=1;
#######################
# Useage examples
#######################
use Benchmark ':hireswallclock';
my $t0 = new Benchmark;
my $p = new RXParse();
#my $fname = "some.html";
my $fname = "config.html";
if (1) {
open DATA, $fname or die "can't open config.html...";
my $parse_ln = "";
$p->setDebugMode(1);
$parse_ln = join ('', <DATA>);
$p->parse(\$parse_ln);
#$p->parse($parse_ln);
close DATA;
}
else {
open DATA, $fname or die "can't open config.html...";
$p->setDebugMode(1);
$p->parse(*DATA);
close DATA;
#open my $fref, "config.html" or die "can't open config.html...";
#$p->parse($fref, 1);
#close $fref;
}
my $t1 = new Benchmark;
my $td = timediff($t1, $t0);
print STDERR "the code took:",timestr($td),"\n";
exit;
############################################
# XML - RXParse module (by robic0)
# ------------------------------------
# Compliant w3c XML: 1.1
# Resources:
# Extensible Markup Language (XML) 1.1
# W3C Recommendation 04 February 2004,
# 15 April 2004
# http://www.w3.org/TR/xml11/#NT-PITarget
############################################
package RXParse;
use vars qw(@ISA);
@ISA = qw();
#############################
# ReXparse private globals
# =========================
my (
%ErrMsg,%Dflth,
$Nstrt,$Nchar,$Name,
@UC_Nstart,@UC_Nchar,
$RxParse,$RxAttr,
$RxAttr_DL1,
$RxAttr_DL2,
$RxAttr_RM,
$RxPi,
$RxENTITY,
$Entities,
$RxEntConv,
%ents
);
my $initflg = 0;
####################
# ReXparse methods
# =================
sub new {
my ($class, @args) = @_;
my $self = {};
if (!$initflg) {
InitVars();
$initflg = 1;
}
$self->{'InParse'} = 0;
$self->{'debug'} = 0;
$self->{'origcontent'} = undef;
setDfltHandlers ($self);
return bless ($self, $class);
}
sub original_content {
my $self = shift;
if (defined $self->{'origcontent'} &&
ref($self->{'origcontent'}) eq 'SCALAR') {
return ${$self->{'origcontent'}};
} else {return "";}
}
sub setDebugMode {
my $self = shift;
my $dmode = shift;
if (defined $dmode && $dmode) {$dmode = 1;}
else {$dmode = 0;}
$self->{'debug'} = $dmode;
}
sub setDfltHandlers {
my ($self,$name) = @_;
if (defined $name) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;
my $hname = "h".lc($name);
if (exists $Dflth{$hname}) {
$self->{$hname} = $Dflth{$hname};
}
} else {
foreach my $key (%Dflth) {
$self->{$key} = $Dflth{$key};
}
}
}
sub setHandlers {
my ($self, @args) = @_;
my %oldh = ();
if (!scalar(@args)) {
while (my ($name,$val) = splice (@args, 0, 2)) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;
my $hname = "h".lc($name);
if (exists $self->{$hname}) {
$oldh{$name} = $self->{$hname};
if (ref($val) eq 'CODE') {
$self->{$hname} = $val;
} else {
# if its not a CODE ref,
# just set default handler
$self->setDfltHandlers ($name);
}
}
}
}
return %oldh;
}
sub parse {
my ($self, $data) = @_;
throwX ('30') unless (!$self->{'InParse'});
throwX ('31') unless (defined $data);
$self->{'InParse'} = 1;
# call processor
if (ref($data) eq 'SCALAR') {
print "SCALAR ref\n" if ($self->{'debug'});
eval {Processor($self, 1, $data);};
if ($@) {
Cleanup($self); die $@;
}
}
elsif (ref(\$data) eq 'SCALAR') {
print "SCALAR string\n" if ($self->{'debug'});
eval {Processor($self, 1, \$data);};
if ($@) {
Cleanup($self); die $@;
}
} else {
if (ref($data) ne 'GLOB' && ref(\$data) ne 'GLOB') {
$self->{'InParse'} = 0;
die "rp_error_parse, data source not a string or filehandle nor reference to one\n";
}
print "GLOB ref or filehandle\n" if ($self->{'debug'});
eval {Processor($self, 0, $data);};
if ($@) {
Cleanup($self); die $@;
}
}
$self->{'InParse'} = 0;
}
##############################
# ReXparse private func's
# ========================
sub Cleanup
{
my $obj = shift;
$obj->{'origcontent'} = undef;
$obj->{'debug'} = 0;
$obj->{'InParse'} = 0;
$obj->setDfltHandlers();
}
sub Processor
{
my ($obj, $BUFFERED, $rpl_mk) = @_;
my ($markup_file);
my $parse_ln = '';
my $dyna_ln = '';
my $ref_parse_ln = \$parse_ln;
my $ref_dyna_ln = \$dyna_ln;
if ($BUFFERED) {
$ref_parse_ln = $rpl_mk;
$ref_dyna_ln = \$dyna_ln;
} else {
# assume its a ref to a global or global itself
$markup_file = $rpl_mk;
$ref_dyna_ln = $ref_parse_ln;
}
my $ln_cnt = 0;
my $complete_comment = 0;
my $complete_cdata = 0;
my @Tags = ();
my $havroot = 0;
my $last_cpos = 0;
my $done = 0;
my $content = '';
my $altcontent = undef;
$obj->{'origcontent'} = \$content;
while (!$done)
{
$ln_cnt++;
# stream processing (if not buffered)
if (!$BUFFERED) {
if (!($_ = <$markup_file>)) {
# just parse what we have
$done = 1;
# boundry check for runnaway
if (($complete_comment+$complete_cdata) > 0) {
$ln_cnt--;
}
} else {
$$ref_parse_ln .= $_;
## buffer if needing comment/cdata closure
next if ($complete_comment && !/-->/);
next if ($complete_cdata && !/\]\]>/);
## reset comment/cdata flags
$complete_comment = 0;
$complete_cdata = 0;
## flag serialized comments/cdata buffering
if (/(<!--)|(<!\[CDATA\[)/)
{
if (defined $1) { # complete comment
if ($$ref_parse_ln !~ /<!--.*?-->/s) {
$complete_comment = 1;
next;
}
}
elsif (defined $2) { # complete cdata
if ($$ref_parse_ln !~ /<!\[CDATA\[.*?\]\]>/s) {
$complete_cdata = 1;
next;
}
}
}
## buffer until '>' or eof
next if (!/>/);
}
} else {
$ln_cnt = 1;
$done = 1;
}
## REGEX Parsing loop
while ($$ref_parse_ln =~ /$RxParse/g)
{
## handle contents
if (defined $14) {
$content .= $14;
$last_cpos = pos($$ref_parse_ln);
next;
}
## valid content here ... can be taken off
print "-"x20,"\n" if ($obj->{'debug'});
if (length ($content)) {
## check reserved characters in content
if ($content =~ /[<>]/) {
#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
## mark-up characters in content
throwX('01', $ref_parse_ln, $last_cpos, $content, $ln_cnt);
}
if (!scalar(@Tags)) {
#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
if ($content =~ /[^\s]/s) {
## content at root level
throwX('02', $ref_parse_ln, $last_cpos, $content, $ln_cnt);
}
}
# substitute special xml characters, then call content handler with $content
# ------------------------------------------------------
# $content has to be a constant if xml reserved chars
# are found, copy altered string to pass to handler
# otherwise pass original $content
# ------------------------------------------------------
if (defined ($altcontent = convertEntities (\$content))) {
$obj->{'hchar'}($obj, $$altcontent, $obj->{'debug'});
} else {
$obj->{'hchar'}($obj, $content, $obj->{'debug'});
}
#print "14 $content\n" if ($obj->{'debug'});
print "-"x20,"\n" if ($obj->{'debug'});
$content = '';
}
#if ($show_pos && $debug) {
# my $rr = pos $$ref_parse_ln;
# print "$rr ";
#}
## <tag> or </tag> or <tag/>
if (defined $2)
{
my ($l1,$l3) = (length($1),length($3));
if (($l1+$l3)==0) { ## <tag>
if (!scalar(@Tags) && $havroot) {
## new root node <tag>
throwX('03', $ref_parse_ln, pos($$ref_parse_ln), $2, $ln_cnt);
}
push @Tags,$2;
$havroot = 1;
# call start tag handler with $2
$obj->{'hstart'}($obj, $2);
}
elsif ($l1==1 && $l3==0) { ## </tag>
my $pval = pop @Tags;
if (!defined $pval) {
## missing start tag </tag>
throwX('04', $ref_parse_ln, pos($$ref_parse_ln), $2, $ln_cnt);
}
if ($2 ne $pval) {
## expected closing tag </tag>
throwX('05', $ref_parse_ln, pos($$ref_parse_ln), $pval, $ln_cnt);
}
# call end tag handler with $2
$obj->{'hend'}($obj, $2);
}
elsif ($l1==0 && $l3==1) { ## <tag/>
if (!scalar(@Tags) && $havroot) {
## new root node <tag/>
throwX('06', $ref_parse_ln, pos($$ref_parse_ln), $2, $ln_cnt);
}
$havroot = 1; # first and only <root/>
# call start tag handler, then end tag handler, with $2
$obj->{'hstart'}($obj, $2);
$obj->{'hend'}($obj, $2);
} else {
## <//node//> errors
throwX('07', $ref_parse_ln, pos($$ref_parse_ln), "$1$2$3", $ln_cnt);
}
#print "2 TAG: $1$2$3\n" if ($obj->{'debug'});
}
## <tag attrib/> or <tag attrib>
elsif (defined $5)
{
my $l7 = length($7);
## attributes
my $attref = getAttrARRAY($6);
unless (ref($attref)) {
## missing or extra token
throwX('08', $ref_parse_ln, pos($$ref_parse_ln), $attref, $ln_cnt);
}
if ($l7==0) { ## <tag attrib>
if (!scalar(@Tags) && $havroot) {
## new root node
throwX('03', $ref_parse_ln, pos($$ref_parse_ln), $5, $ln_cnt);
}
push @Tags,$5;
$havroot = 1;
# call start tag handler with $5 and attributes @{$attref}
$obj->{'hstart'}($obj, $5, @{$attref});
}
elsif ($l7==1) { ## <tag attrib/>
if (!scalar(@Tags) && $havroot) {
## new root node
throwX('06', $ref_parse_ln, pos($$ref_parse_ln), $7, $ln_cnt);
}
$havroot = 1; # first and only <root attrib/>
# call start tag handler with $5 and attributes @{$attref}, then end tag handler with $5
$obj->{'hstart'}($obj, $5, @{$attref});
$obj->{'hend'}($obj, $5);
} else {
## syntax error
throwX('07', $ref_parse_ln, pos($$ref_parse_ln), "$5$6$7", $ln_cnt);
}
#if ($obj->{'debug'}) {
# print "5,6 TAG: $5 Attr: $6$7\n" ;
#}
}
## XMLDECL or PI (processing instruction)
elsif (defined $8)
{
my $pi = $8;
# xml declaration ?
if ($pi =~ /^xml(.*?)$/) {
my $attref = getAttrARRAY($1);
unless (ref($attref)) {
## missing or extra token in xmldecl
throwX('14', $ref_parse_ln, pos($$ref_parse_ln), $attref, $ln_cnt);
}
if (!scalar(@{$attref})) {
## missing xmldecl parameters
throwX('15', $ref_parse_ln, pos($$ref_parse_ln), $pi, $ln_cnt);
}
my ($version,$encoding,$standalone);
while (my ($name,$val) = splice (@{$attref}, 0, 2)) {
if ('version' eq lc($name) && !defined $version) {
if ($val !~ /^[0-9]\.[0-9]+$/) {
## invalid version character data in xmldecl
throwX('16', $ref_parse_ln, pos($$ref_parse_ln), "$name = '$val'", $ln_cnt);
}
$version = $val;
} elsif ('encoding' eq lc($name) && !defined $encoding) {
if ($val !~ /^[A-Za-z][\w\.-]*$/) {
## invalid encoding character data in xmldecl
throwX('17', $ref_parse_ln, pos($$ref_parse_ln), "$name = '$val'", $ln_cnt);
}
$encoding = $val;
} elsif ('standalone' eq lc($name) && !defined $standalone) {
if ($val !~ /^(?:yes)|(?:no)$/) {
## invalid standalone character data in xmldecl
throwX('18', $ref_parse_ln, pos($$ref_parse_ln), "$name = '$val'", $ln_cnt);
}
$standalone = ($val eq 'yes' ? 1 : 0);
} else {
## unknown xmldecl parameter
throwX('19', $ref_parse_ln, pos($$ref_parse_ln), "$name = '$val'", $ln_cnt);
}
}
if (!defined $version) {
# missing version in xmldecl
throwX('20', $ref_parse_ln, pos($$ref_parse_ln), $pi, $ln_cnt);
}
# call xmldecl handler
$obj->{'hxmldecl'}($obj, $version,$encoding,$standalone);
}
# PI - processing instruction
elsif ($pi =~ /$RxPi/) {
# call pi handler
$obj->{'hproc'}($obj, $1, $2);
} else {
# unknown PI data
throwX('21', $ref_parse_ln, pos($$ref_parse_ln), $pi, $ln_cnt);
}
#print "8 VERSION: $8\n" if ($obj->{'debug'});
}
## META
elsif (defined $4) {
# If doctype is HTML then META is not closed
# parse meta data, call handler
$obj->{'hmeta'}($obj, $4, $obj->{'debug'});
#print "4 META: $4\n" if ($obj->{'debug'});
}
## DOCTYPE
elsif (defined $9) {
# parese doctype, call handler
$obj->{'hdoctype'}($obj, $9, $obj->{'debug'});
#print "9 DOCTYPE: $9\n" if ($obj->{'debug'});
}
## CDATA
elsif (defined $10) {
if (!scalar(@Tags)) {
## CDATA content at root
throwX('09', $ref_parse_ln, pos($$ref_parse_ln), $10, $ln_cnt);
}
# call cdata handler
$obj->{'hcdata'}($obj, $10, $obj->{'debug'});
#print "10 CDATA: $10\n" if ($obj->{'debug'});
}
## COMMENT
elsif (defined $11) {
# call comment handler
$obj->{'hcomment'}($obj, $11, $obj->{'debug'});
#print "11 COMMENT: $11\n" if ($obj->{'debug'});
}
## ATTLIST
elsif (defined $12) {
# parese attlist, call handler
$obj->{'hattlist'}($obj, $12, $obj->{'debug'});
#print "12 ATTLIST: $12\n" if ($obj->{'debug'});
}
## ENTITY
elsif (defined $13) {
# parese entity, call handler
my $entdata = $13;
if ($entdata =~ /$RxENTITY/) {
if (defined $1) {
# general entity replacement
# add general entity $3 (EntityDef)
} else {
# parameter entity replacement
# add parameter entity $3 (PEDef)
}
}
else {
# unknown ENTITY data
# throwx
}
$obj->{'hentity'}($obj, $13, $obj->{'debug'});
#print "13 ENTITY: $13\n" if ($obj->{'debug'});
}
}
$$ref_dyna_ln = $content;
$content = '';
}
if (!$havroot) {
# not valid xml
throwX('10', undef, undef, undef, undef);
}
if (scalar(@Tags)) {
my $str = '';
while (defined (my $etag = pop @Tags)) {
$str .= ", /$etag";
}
$str =~ s/^, +//;
# missing end tag
throwX('11', undef, undef, $str, undef);
}
if ($$ref_dyna_ln =~ /[^\s]/s) {
if ($$ref_dyna_ln =~ /[<>]/) {
#mark-up characters in content
throwX('12',undef, undef, $$ref_dyna_ln, undef);
} else {
# content at root level (end)
throwX('13',undef, undef, $$ref_dyna_ln, undef);
}
}
$obj->{'origcontent'} = undef;
return 1;
}
sub getAttrARRAY
{
my $attrstr = shift;
my $aref = [];
my ($alt_attval,$attval,$rx);
while ($attrstr =~ s/$RxAttr//) {
push @{$aref},$1;
if ($2 eq "'") {
$rx = \$RxAttr_DL1;
} else {
$rx = \$RxAttr_DL2;
}
if ($attrstr =~ s/$$rx//) {
if (defined $1) {
push @{$aref},$1;
next;
}
$attval = $2;
if (defined ($alt_attval = convertEntities (\$attval))) {
push @{$aref},$$alt_attval;
next;
}
push @{$aref},$attval;
next;
}
return $attrstr;
}
if ($attrstr=~/$RxAttr_RM/) {
$attrstr =~ s/^\s+//s; $attrstr =~ s/\s+$//s;
return $attrstr if (length($attrstr));
}
return $aref;
}
sub convertEntities
{
my $str_ref = shift;
my $alt_str = '';
my $res = 0;
my ($entchr);
while ($$str_ref =~ /$RxEntConv/gc) {
if (defined $4) {
# decimal
$entchr = chr($4) ;
if (exists $ents{$entchr}) {
$alt_str .= "$1".$entchr;
}
} elsif (defined $5) {
if (length($5) < 9) {
# hex
$entchr = chr(hex($5));
if (exists $ents{$entchr}) {
$alt_str .= "$1".$entchr;
}
}
} else {
$alt_str .= "$1$ents{$3}";
$res = 1;
}
}
if ($res) {
$alt_str .= substr $$str_ref, pos($$str_ref);
return \$alt_str;
}
return undef;
}
sub throwX
{
my ($errno, $lrefseg, $cseg_err, $datastr, $l_tot) = @_;
my ($line,$col) = (0,0);
if (defined $lrefseg) {
($line,$col) = getRealColumn($lrefseg, $l_tot, $cseg_err);
}
die "No such error message ($errno)\n" if (!exists $ErrMsg{$errno});
my $estr = '';
my $ctmpl = "\$estr = sprintf ($ErrMsg{$errno});";
eval $ctmpl;
die $estr;
}
sub getRealColumn
{
my ($lrefseg, $l_tot, $cseg_err) = @_;
my $cseg_offset = 0;
pos($$lrefseg) = 0;
my ($lseg_tot, $lseg_offset) = (0,1);
while ($$lrefseg =~ /\n/g) {
$lseg_tot++;
if (pos($$lrefseg) < $cseg_err) {
$cseg_offset = pos($$lrefseg);
$lseg_offset++;
next;
}
if ($l_tot <= 1) {
$lseg_tot = $l_tot;
last;
}
}
return ($l_tot-$lseg_tot+$lseg_offset, $cseg_err-$cseg_offset);
}
# Globals Init
# =============
sub InitVars
{
%Dflth = (
'hstart' => \&dflt_start,
'hend' => \&dflt_end,
'hchar' => \&dflt_char,
'hcdata' => \&dflt_cdata,
'hcomment' => \&dflt_comment,
'hmeta' => \&dflt_meta,
'hattlist' => \&dflt_attlist,
'hentity' => \&dflt_entity,
'hdoctype' => \&dflt_doctype,
'helement' => \&dflt_element,
'hxmldecl' => \&dflt_xmldecl,
'hproc' => \&dflt_proc,
);
@UC_Nstart = (
"\\x{C0}-\\x{D6}",
"\\x{D8}-\\x{F6}",
"\\x{F8}-\\x{2FF}",
"\\x{370}-\\x{37D}",
"\\x{37F}-\\x{1FFF}",
"\\x{200C}-\\x{200D}",
"\\x{2070}-\\x{218F}",
"\\x{2C00}-\\x{2FEF}",
"\\x{3001}-\\x{D7FF}",
"\\x{F900}-\\x{FDCF}",
"\\x{FDF0}-\\x{FFFD}",
"\\x{10000}-\\x{EFFFF}",
);
@UC_Nchar = (
"\\x{B7}",
"\\x{0300}-\\x{036F}",
"\\x{203F}-\\x{2040}",
);
$Nstrt = "[A-Za-z_:".join ('',@UC_Nstart)."]";
$Nchar = "[-\\w:\\.".join ('',@UC_Nchar).join ('',@UC_Nstart)."]";
$Name = "(?:$Nstrt$Nchar*?)";
#die "$Name\n";
$RxParse =
qr/(?:<(??\/*)($Name)\s*(\/*))|(?:META(.*?))|(?$Name)((?:\s+$Name\s*=\s*["'][^<]*['"])+)\s*(\/*))|(?:\?(.*?)\?)|(?:!(??OCTYPE(.*?))|(?:\[CDATA\[(.*?)\]\])|(?:--(.*?[^-])--)|(?:ATTLIST(.*?))|(?:ENTITY(.*?)))))>)|(.+?)/s;
# ( <( ( 1 12 2 3 3)|( 4 4)|( 5 56( ) 6 7 7)|( 8 8 )|( !( ( 9 9)|( 0 0 )|( 1 1 )|(
2 2)|( 3 3))))>)|4 4
$RxAttr = qr/^\s+($Name)\s*=\s*("|')/;
$RxAttr_DL1 = qr/^(?[^'&]*?)|([^']*?))'/;
$RxAttr_DL2 = qr/^(?[^"&]*?)|([^"]*?))"/;
$RxAttr_RM = qr/[^\s\n]+/;
$RxPi = qr/^($Name)\s+(.*?)$/s;
#[52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
#[53] AttDef ::= S Name S AttType S DefaultDecl
$RxENTITY = qr/\s+($Name)|(?:%\s+($Name))\s+(.*?)/s;
# 1 1 ( 2 2) 3 3
$Entities = "(?:amp)|(?:gt)|(?:lt)|(?:apos)|(?:quot)|(?:#(?[0-9]+)|(x[0-9a-fA-F]+)))"; # cat more entities
# ( #( 4 4|5 5))
$RxEntConv = qr/(.*?)(&|%)($Entities);/s;
# 1 12 23 3
%ents = (
'amp' =>'&',
'gt' =>'>',
'lt' =>'<',
'apos'=>"'",
'quot'=>"\"",
'&' => '',
'<' => '',
'>' => '',
"'" => '',
"\"" => ''
);
%ErrMsg = (
'01' => "\"rp_error_01, mark-up or reserved characters in content (line %s, col %s), malformed element? '%s'\\n\", \$line, \$col, \$datastr",
'02' => "\"rp_error_02, content at root level (line %s, col %s): '%s'\\n\", \$line, \$col, \$datastr",
'03' => "\"rp_error_03, element wants to be new root node (line %s, col %s): '%s'\\n\", \$line, \$col, \$datastr",
'04' => "\"rp_error_04, missing start tag for '/%s' (line %s, col %s)\\n\", \$datastr, \$line, \$col",
'05' => "\"rp_error_05, expected closing tag '/%s' (line %s, col %s)\\n\", \$datastr, \$line, \$col",
'06' => "\"rp_error_06, element wants to be new root node (line %s, col %s): '%s/'\\n\", \$line, \$col, \$datastr",
'07' => "\"rp_error_07, tag syntax '%s' (line %s, col %s)\\n\", \$datastr, \$line, \$col",
'08' => "\"rp_error_08, invalid, missing or extra tokens in attribute asignment (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'09' => "\"rp_error_09, CDATA content at root level (line %s, col %s): '%s'\\n\", \$line, \$col, \$datastr",
'10' => "\"rp_error_10, not a valid xml document\\n\"",
'11' => "\"rp_error_11, missing end tag '%s'\\n\", \$datastr",
'12' => "\"rp_error_12, mark-up or reserved characters in content (end), malformed element? '%s'\\n\", \$datastr",
'13' => "\"rp_error_13, content at root level (end): '%s'\\n\", \$datastr",
'14' => "\"rp_error_14, invalid, missing or extra tokens in xmldecl asignment (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'15' => "\"rp_error_15, missing xmldecl parameters (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'16' => "\"rp_error_16, invalid 'version' character data in xmldecl (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'17' => "\"rp_error_17, invalid 'encoding' character data in xmldecl (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'18' => "\"rp_error_18, invalid 'standalone' character data in xmldecl (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'19' => "\"rp_error_19, unknown xmldecl parameter (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'20' => "\"rp_error_20, missing xmldecl 'version' parameter (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'21' => "\"rp_error_21, unknown or missing processing instruction parameters (line %s, col %s): '%s'\\n\", \$line, \$col, \$datastr",
'30' => "\"rp_error_30, already in parse\\n\"",
'31' => "\"rp_error_31, data source not defined\\n\"",
);
}
# Default Handlers
# =================
sub dflt_start {
my ($obj, $el, @attr) = @_;
if ($obj->{'debug'}) {
print "start _: $el\n";
while (my ($name,$val) = splice (@attr, 0,2)) {
print " "x12,"$name = $val\n";
}
}
}
sub dflt_end {my ($obj, $el) = @_;print "end _: /$el\n" if ($obj->{'debug'});}
sub dflt_char {my ($obj, $str) = @_;print "char _: $str\n" if ($obj->{'debug'});}
sub dflt_cdata {my ($obj, $str) = @_;print "cdata _: $str\n" if ($obj->{'debug'});}
sub dflt_comment {my ($obj, $str) = @_;print "comnt _: $str\n" if ($obj->{'debug'});}
sub dflt_meta {my ($obj, $str) = @_;print "meta _: $str\n" if ($obj->{'debug'});}
sub dflt_attlist {my ($obj, $parm) = @_;print "attlist_h _: $parm\n" if ($obj->{'debug'});}
sub dflt_entity {my ($obj, $parm) = @_;print "entity_h _: $parm\n" if ($obj->{'debug'});}
sub dflt_doctype {my ($obj, $parm) = @_;print "doctype_h _: $parm\n" if ($obj->{'debug'});}
sub dflt_element {my ($obj, $parm) = @_;print "element_h _: $parm\n" if ($obj->{'debug'});}
sub dflt_xmldecl {
my ($obj, $version,$encoding,$standalone) = @_;
if ($obj->{'debug'}) {
print "xmldecl_h _: version = $version\n" if (defined $encoding);
print " "x14,"encoding = $encoding\n" if (defined $encoding);
print " "x14,"standalone = $standalone\n" if (defined $standalone);
}
}
sub dflt_proc {
my ($obj, $target,$data) = @_;
if ($obj->{'debug'}) {
print "proc_h _: target = $target\n";
print " "x14,"data = $data\n";
}
}
__END__
#$RxParse =
qr/(?:<(??\/*)($Name)\s*(\/*))|(?:META(.*?))|(?$Name)((?:\s+$Name\s*=\s*["'][^<]*['"])+)\s*(\/*))|(?:\?(.*?)\?)|(?:!(??OCTYPE(.*?))|(?:\[CDATA\[(.*?)\]\])|(?:--(.*?[^-])--)|(?:ATTLIST(.*?))|(?:ENTITY(.*?)))))>)|(.+?)/s;
# ( <( ( 1 12 2 3 3)|( 4 4)|( 5 56( ) 6 7 7)|( 8 8 )|( !( ( 9 9)|( 0 0 )|( 1 1 )|(
2 2)|( 3 3))))>)|4 4
#$Entities = "(?:amp)|(?:gt)|(?:lt)|(?:apos)|(?:quot)|(?:#(?[0-9]+)|(x[0-9a-fA-F]+)))";
# ( #( 3 3 4 4))
#$RxEntConv = qr/(.*?)&($Entities);/s;
Here it is. About %85 complete.
Not quite there yet. I know how to finish it but too lazy
right now. I may get a hair up my butt and finish it in a 2
day timeframe.
Whats not finished yet:
- Entities.. (I don't care to explain why, has placeholder)
- Attlist.. (same, has placeholder
- Dtd.. (same, matter of what I'm going to do in the way of a combined html package or not)
- Encoding (same, I'm %90 on this subject, but looking at the %10 for overall integration)
- other misc.. (same, its in my head)
- Class/package interface.. (I haven't digested/read the total bullshit on this yet, have to cut aside a day for this)
The parse framework for Xml/Xhtml is completely there. The standard used is Xml 1.1
The code, though incomplete, is %100 compliant.
This is hard to benchmark because it depends on the entities and complexity. In general,
I used a very complex html file for harcore numbers that runs about 340k in size.
Actually the most complex html file I could find. Xml will be significantly faster.
A 380k file parsed in .25 second with debug turned off. Of course I have a 3 drive
Hitachi 750 gig (total) raid 0 array on a Dfi SLI-DR, Amd 3700+ San Diego, overclocked to 3ghz, 620 ddr.
I haven't tested it but this might be faster than the current dll (C) alternatives.
It dies with my custum, accurate, full error reporting, exact line/column numbers. Just like the big boys.
The regexp's are designed with speed in mind. There is no faster a regexp processor can go in an all Perl
solution.
The nice part is that its open. This is just the core. Its not complicated. Customization can occurr within
the module itself (hey u gonna customize my code?) in reference to not only handlers but the core itself.
I know VanderDick is going to steel it. I will work on completeness in my idea of what it could be. My idea
will always be better than anybody elses out there, thats why I'm posting it here.
If you don't think this is of any use, you may have been kicked by a mule at one point in your life.
I don't care to answer why I'm posting it. Lifes too short I guess, could drop dead tommorow...
(Hey but some of you are definetly 'not worthy')
use strict;
use warnings;
$|=1;
#######################
# Useage examples
#######################
use Benchmark ':hireswallclock';
my $t0 = new Benchmark;
my $p = new RXParse();
#my $fname = "some.html";
my $fname = "config.html";
if (1) {
open DATA, $fname or die "can't open config.html...";
my $parse_ln = "";
$p->setDebugMode(1);
$parse_ln = join ('', <DATA>);
$p->parse(\$parse_ln);
#$p->parse($parse_ln);
close DATA;
}
else {
open DATA, $fname or die "can't open config.html...";
$p->setDebugMode(1);
$p->parse(*DATA);
close DATA;
#open my $fref, "config.html" or die "can't open config.html...";
#$p->parse($fref, 1);
#close $fref;
}
my $t1 = new Benchmark;
my $td = timediff($t1, $t0);
print STDERR "the code took:",timestr($td),"\n";
exit;
############################################
# XML - RXParse module (by robic0)
# ------------------------------------
# Compliant w3c XML: 1.1
# Resources:
# Extensible Markup Language (XML) 1.1
# W3C Recommendation 04 February 2004,
# 15 April 2004
# http://www.w3.org/TR/xml11/#NT-PITarget
############################################
package RXParse;
use vars qw(@ISA);
@ISA = qw();
#############################
# ReXparse private globals
# =========================
my (
%ErrMsg,%Dflth,
$Nstrt,$Nchar,$Name,
@UC_Nstart,@UC_Nchar,
$RxParse,$RxAttr,
$RxAttr_DL1,
$RxAttr_DL2,
$RxAttr_RM,
$RxPi,
$RxENTITY,
$Entities,
$RxEntConv,
%ents
);
my $initflg = 0;
####################
# ReXparse methods
# =================
sub new {
my ($class, @args) = @_;
my $self = {};
if (!$initflg) {
InitVars();
$initflg = 1;
}
$self->{'InParse'} = 0;
$self->{'debug'} = 0;
$self->{'origcontent'} = undef;
setDfltHandlers ($self);
return bless ($self, $class);
}
sub original_content {
my $self = shift;
if (defined $self->{'origcontent'} &&
ref($self->{'origcontent'}) eq 'SCALAR') {
return ${$self->{'origcontent'}};
} else {return "";}
}
sub setDebugMode {
my $self = shift;
my $dmode = shift;
if (defined $dmode && $dmode) {$dmode = 1;}
else {$dmode = 0;}
$self->{'debug'} = $dmode;
}
sub setDfltHandlers {
my ($self,$name) = @_;
if (defined $name) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;
my $hname = "h".lc($name);
if (exists $Dflth{$hname}) {
$self->{$hname} = $Dflth{$hname};
}
} else {
foreach my $key (%Dflth) {
$self->{$key} = $Dflth{$key};
}
}
}
sub setHandlers {
my ($self, @args) = @_;
my %oldh = ();
if (!scalar(@args)) {
while (my ($name,$val) = splice (@args, 0, 2)) {
$name =~ s/^\s+//s; $name =~ s/\s+$//s;
my $hname = "h".lc($name);
if (exists $self->{$hname}) {
$oldh{$name} = $self->{$hname};
if (ref($val) eq 'CODE') {
$self->{$hname} = $val;
} else {
# if its not a CODE ref,
# just set default handler
$self->setDfltHandlers ($name);
}
}
}
}
return %oldh;
}
sub parse {
my ($self, $data) = @_;
throwX ('30') unless (!$self->{'InParse'});
throwX ('31') unless (defined $data);
$self->{'InParse'} = 1;
# call processor
if (ref($data) eq 'SCALAR') {
print "SCALAR ref\n" if ($self->{'debug'});
eval {Processor($self, 1, $data);};
if ($@) {
Cleanup($self); die $@;
}
}
elsif (ref(\$data) eq 'SCALAR') {
print "SCALAR string\n" if ($self->{'debug'});
eval {Processor($self, 1, \$data);};
if ($@) {
Cleanup($self); die $@;
}
} else {
if (ref($data) ne 'GLOB' && ref(\$data) ne 'GLOB') {
$self->{'InParse'} = 0;
die "rp_error_parse, data source not a string or filehandle nor reference to one\n";
}
print "GLOB ref or filehandle\n" if ($self->{'debug'});
eval {Processor($self, 0, $data);};
if ($@) {
Cleanup($self); die $@;
}
}
$self->{'InParse'} = 0;
}
##############################
# ReXparse private func's
# ========================
sub Cleanup
{
my $obj = shift;
$obj->{'origcontent'} = undef;
$obj->{'debug'} = 0;
$obj->{'InParse'} = 0;
$obj->setDfltHandlers();
}
sub Processor
{
my ($obj, $BUFFERED, $rpl_mk) = @_;
my ($markup_file);
my $parse_ln = '';
my $dyna_ln = '';
my $ref_parse_ln = \$parse_ln;
my $ref_dyna_ln = \$dyna_ln;
if ($BUFFERED) {
$ref_parse_ln = $rpl_mk;
$ref_dyna_ln = \$dyna_ln;
} else {
# assume its a ref to a global or global itself
$markup_file = $rpl_mk;
$ref_dyna_ln = $ref_parse_ln;
}
my $ln_cnt = 0;
my $complete_comment = 0;
my $complete_cdata = 0;
my @Tags = ();
my $havroot = 0;
my $last_cpos = 0;
my $done = 0;
my $content = '';
my $altcontent = undef;
$obj->{'origcontent'} = \$content;
while (!$done)
{
$ln_cnt++;
# stream processing (if not buffered)
if (!$BUFFERED) {
if (!($_ = <$markup_file>)) {
# just parse what we have
$done = 1;
# boundry check for runnaway
if (($complete_comment+$complete_cdata) > 0) {
$ln_cnt--;
}
} else {
$$ref_parse_ln .= $_;
## buffer if needing comment/cdata closure
next if ($complete_comment && !/-->/);
next if ($complete_cdata && !/\]\]>/);
## reset comment/cdata flags
$complete_comment = 0;
$complete_cdata = 0;
## flag serialized comments/cdata buffering
if (/(<!--)|(<!\[CDATA\[)/)
{
if (defined $1) { # complete comment
if ($$ref_parse_ln !~ /<!--.*?-->/s) {
$complete_comment = 1;
next;
}
}
elsif (defined $2) { # complete cdata
if ($$ref_parse_ln !~ /<!\[CDATA\[.*?\]\]>/s) {
$complete_cdata = 1;
next;
}
}
}
## buffer until '>' or eof
next if (!/>/);
}
} else {
$ln_cnt = 1;
$done = 1;
}
## REGEX Parsing loop
while ($$ref_parse_ln =~ /$RxParse/g)
{
## handle contents
if (defined $14) {
$content .= $14;
$last_cpos = pos($$ref_parse_ln);
next;
}
## valid content here ... can be taken off
print "-"x20,"\n" if ($obj->{'debug'});
if (length ($content)) {
## check reserved characters in content
if ($content =~ /[<>]/) {
#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
## mark-up characters in content
throwX('01', $ref_parse_ln, $last_cpos, $content, $ln_cnt);
}
if (!scalar(@Tags)) {
#$content =~ s/^\s+//s; $content =~ s/\s+$//s;
if ($content =~ /[^\s]/s) {
## content at root level
throwX('02', $ref_parse_ln, $last_cpos, $content, $ln_cnt);
}
}
# substitute special xml characters, then call content handler with $content
# ------------------------------------------------------
# $content has to be a constant if xml reserved chars
# are found, copy altered string to pass to handler
# otherwise pass original $content
# ------------------------------------------------------
if (defined ($altcontent = convertEntities (\$content))) {
$obj->{'hchar'}($obj, $$altcontent, $obj->{'debug'});
} else {
$obj->{'hchar'}($obj, $content, $obj->{'debug'});
}
#print "14 $content\n" if ($obj->{'debug'});
print "-"x20,"\n" if ($obj->{'debug'});
$content = '';
}
#if ($show_pos && $debug) {
# my $rr = pos $$ref_parse_ln;
# print "$rr ";
#}
## <tag> or </tag> or <tag/>
if (defined $2)
{
my ($l1,$l3) = (length($1),length($3));
if (($l1+$l3)==0) { ## <tag>
if (!scalar(@Tags) && $havroot) {
## new root node <tag>
throwX('03', $ref_parse_ln, pos($$ref_parse_ln), $2, $ln_cnt);
}
push @Tags,$2;
$havroot = 1;
# call start tag handler with $2
$obj->{'hstart'}($obj, $2);
}
elsif ($l1==1 && $l3==0) { ## </tag>
my $pval = pop @Tags;
if (!defined $pval) {
## missing start tag </tag>
throwX('04', $ref_parse_ln, pos($$ref_parse_ln), $2, $ln_cnt);
}
if ($2 ne $pval) {
## expected closing tag </tag>
throwX('05', $ref_parse_ln, pos($$ref_parse_ln), $pval, $ln_cnt);
}
# call end tag handler with $2
$obj->{'hend'}($obj, $2);
}
elsif ($l1==0 && $l3==1) { ## <tag/>
if (!scalar(@Tags) && $havroot) {
## new root node <tag/>
throwX('06', $ref_parse_ln, pos($$ref_parse_ln), $2, $ln_cnt);
}
$havroot = 1; # first and only <root/>
# call start tag handler, then end tag handler, with $2
$obj->{'hstart'}($obj, $2);
$obj->{'hend'}($obj, $2);
} else {
## <//node//> errors
throwX('07', $ref_parse_ln, pos($$ref_parse_ln), "$1$2$3", $ln_cnt);
}
#print "2 TAG: $1$2$3\n" if ($obj->{'debug'});
}
## <tag attrib/> or <tag attrib>
elsif (defined $5)
{
my $l7 = length($7);
## attributes
my $attref = getAttrARRAY($6);
unless (ref($attref)) {
## missing or extra token
throwX('08', $ref_parse_ln, pos($$ref_parse_ln), $attref, $ln_cnt);
}
if ($l7==0) { ## <tag attrib>
if (!scalar(@Tags) && $havroot) {
## new root node
throwX('03', $ref_parse_ln, pos($$ref_parse_ln), $5, $ln_cnt);
}
push @Tags,$5;
$havroot = 1;
# call start tag handler with $5 and attributes @{$attref}
$obj->{'hstart'}($obj, $5, @{$attref});
}
elsif ($l7==1) { ## <tag attrib/>
if (!scalar(@Tags) && $havroot) {
## new root node
throwX('06', $ref_parse_ln, pos($$ref_parse_ln), $7, $ln_cnt);
}
$havroot = 1; # first and only <root attrib/>
# call start tag handler with $5 and attributes @{$attref}, then end tag handler with $5
$obj->{'hstart'}($obj, $5, @{$attref});
$obj->{'hend'}($obj, $5);
} else {
## syntax error
throwX('07', $ref_parse_ln, pos($$ref_parse_ln), "$5$6$7", $ln_cnt);
}
#if ($obj->{'debug'}) {
# print "5,6 TAG: $5 Attr: $6$7\n" ;
#}
}
## XMLDECL or PI (processing instruction)
elsif (defined $8)
{
my $pi = $8;
# xml declaration ?
if ($pi =~ /^xml(.*?)$/) {
my $attref = getAttrARRAY($1);
unless (ref($attref)) {
## missing or extra token in xmldecl
throwX('14', $ref_parse_ln, pos($$ref_parse_ln), $attref, $ln_cnt);
}
if (!scalar(@{$attref})) {
## missing xmldecl parameters
throwX('15', $ref_parse_ln, pos($$ref_parse_ln), $pi, $ln_cnt);
}
my ($version,$encoding,$standalone);
while (my ($name,$val) = splice (@{$attref}, 0, 2)) {
if ('version' eq lc($name) && !defined $version) {
if ($val !~ /^[0-9]\.[0-9]+$/) {
## invalid version character data in xmldecl
throwX('16', $ref_parse_ln, pos($$ref_parse_ln), "$name = '$val'", $ln_cnt);
}
$version = $val;
} elsif ('encoding' eq lc($name) && !defined $encoding) {
if ($val !~ /^[A-Za-z][\w\.-]*$/) {
## invalid encoding character data in xmldecl
throwX('17', $ref_parse_ln, pos($$ref_parse_ln), "$name = '$val'", $ln_cnt);
}
$encoding = $val;
} elsif ('standalone' eq lc($name) && !defined $standalone) {
if ($val !~ /^(?:yes)|(?:no)$/) {
## invalid standalone character data in xmldecl
throwX('18', $ref_parse_ln, pos($$ref_parse_ln), "$name = '$val'", $ln_cnt);
}
$standalone = ($val eq 'yes' ? 1 : 0);
} else {
## unknown xmldecl parameter
throwX('19', $ref_parse_ln, pos($$ref_parse_ln), "$name = '$val'", $ln_cnt);
}
}
if (!defined $version) {
# missing version in xmldecl
throwX('20', $ref_parse_ln, pos($$ref_parse_ln), $pi, $ln_cnt);
}
# call xmldecl handler
$obj->{'hxmldecl'}($obj, $version,$encoding,$standalone);
}
# PI - processing instruction
elsif ($pi =~ /$RxPi/) {
# call pi handler
$obj->{'hproc'}($obj, $1, $2);
} else {
# unknown PI data
throwX('21', $ref_parse_ln, pos($$ref_parse_ln), $pi, $ln_cnt);
}
#print "8 VERSION: $8\n" if ($obj->{'debug'});
}
## META
elsif (defined $4) {
# If doctype is HTML then META is not closed
# parse meta data, call handler
$obj->{'hmeta'}($obj, $4, $obj->{'debug'});
#print "4 META: $4\n" if ($obj->{'debug'});
}
## DOCTYPE
elsif (defined $9) {
# parese doctype, call handler
$obj->{'hdoctype'}($obj, $9, $obj->{'debug'});
#print "9 DOCTYPE: $9\n" if ($obj->{'debug'});
}
## CDATA
elsif (defined $10) {
if (!scalar(@Tags)) {
## CDATA content at root
throwX('09', $ref_parse_ln, pos($$ref_parse_ln), $10, $ln_cnt);
}
# call cdata handler
$obj->{'hcdata'}($obj, $10, $obj->{'debug'});
#print "10 CDATA: $10\n" if ($obj->{'debug'});
}
## COMMENT
elsif (defined $11) {
# call comment handler
$obj->{'hcomment'}($obj, $11, $obj->{'debug'});
#print "11 COMMENT: $11\n" if ($obj->{'debug'});
}
## ATTLIST
elsif (defined $12) {
# parese attlist, call handler
$obj->{'hattlist'}($obj, $12, $obj->{'debug'});
#print "12 ATTLIST: $12\n" if ($obj->{'debug'});
}
## ENTITY
elsif (defined $13) {
# parese entity, call handler
my $entdata = $13;
if ($entdata =~ /$RxENTITY/) {
if (defined $1) {
# general entity replacement
# add general entity $3 (EntityDef)
} else {
# parameter entity replacement
# add parameter entity $3 (PEDef)
}
}
else {
# unknown ENTITY data
# throwx
}
$obj->{'hentity'}($obj, $13, $obj->{'debug'});
#print "13 ENTITY: $13\n" if ($obj->{'debug'});
}
}
$$ref_dyna_ln = $content;
$content = '';
}
if (!$havroot) {
# not valid xml
throwX('10', undef, undef, undef, undef);
}
if (scalar(@Tags)) {
my $str = '';
while (defined (my $etag = pop @Tags)) {
$str .= ", /$etag";
}
$str =~ s/^, +//;
# missing end tag
throwX('11', undef, undef, $str, undef);
}
if ($$ref_dyna_ln =~ /[^\s]/s) {
if ($$ref_dyna_ln =~ /[<>]/) {
#mark-up characters in content
throwX('12',undef, undef, $$ref_dyna_ln, undef);
} else {
# content at root level (end)
throwX('13',undef, undef, $$ref_dyna_ln, undef);
}
}
$obj->{'origcontent'} = undef;
return 1;
}
sub getAttrARRAY
{
my $attrstr = shift;
my $aref = [];
my ($alt_attval,$attval,$rx);
while ($attrstr =~ s/$RxAttr//) {
push @{$aref},$1;
if ($2 eq "'") {
$rx = \$RxAttr_DL1;
} else {
$rx = \$RxAttr_DL2;
}
if ($attrstr =~ s/$$rx//) {
if (defined $1) {
push @{$aref},$1;
next;
}
$attval = $2;
if (defined ($alt_attval = convertEntities (\$attval))) {
push @{$aref},$$alt_attval;
next;
}
push @{$aref},$attval;
next;
}
return $attrstr;
}
if ($attrstr=~/$RxAttr_RM/) {
$attrstr =~ s/^\s+//s; $attrstr =~ s/\s+$//s;
return $attrstr if (length($attrstr));
}
return $aref;
}
sub convertEntities
{
my $str_ref = shift;
my $alt_str = '';
my $res = 0;
my ($entchr);
while ($$str_ref =~ /$RxEntConv/gc) {
if (defined $4) {
# decimal
$entchr = chr($4) ;
if (exists $ents{$entchr}) {
$alt_str .= "$1".$entchr;
}
} elsif (defined $5) {
if (length($5) < 9) {
# hex
$entchr = chr(hex($5));
if (exists $ents{$entchr}) {
$alt_str .= "$1".$entchr;
}
}
} else {
$alt_str .= "$1$ents{$3}";
$res = 1;
}
}
if ($res) {
$alt_str .= substr $$str_ref, pos($$str_ref);
return \$alt_str;
}
return undef;
}
sub throwX
{
my ($errno, $lrefseg, $cseg_err, $datastr, $l_tot) = @_;
my ($line,$col) = (0,0);
if (defined $lrefseg) {
($line,$col) = getRealColumn($lrefseg, $l_tot, $cseg_err);
}
die "No such error message ($errno)\n" if (!exists $ErrMsg{$errno});
my $estr = '';
my $ctmpl = "\$estr = sprintf ($ErrMsg{$errno});";
eval $ctmpl;
die $estr;
}
sub getRealColumn
{
my ($lrefseg, $l_tot, $cseg_err) = @_;
my $cseg_offset = 0;
pos($$lrefseg) = 0;
my ($lseg_tot, $lseg_offset) = (0,1);
while ($$lrefseg =~ /\n/g) {
$lseg_tot++;
if (pos($$lrefseg) < $cseg_err) {
$cseg_offset = pos($$lrefseg);
$lseg_offset++;
next;
}
if ($l_tot <= 1) {
$lseg_tot = $l_tot;
last;
}
}
return ($l_tot-$lseg_tot+$lseg_offset, $cseg_err-$cseg_offset);
}
# Globals Init
# =============
sub InitVars
{
%Dflth = (
'hstart' => \&dflt_start,
'hend' => \&dflt_end,
'hchar' => \&dflt_char,
'hcdata' => \&dflt_cdata,
'hcomment' => \&dflt_comment,
'hmeta' => \&dflt_meta,
'hattlist' => \&dflt_attlist,
'hentity' => \&dflt_entity,
'hdoctype' => \&dflt_doctype,
'helement' => \&dflt_element,
'hxmldecl' => \&dflt_xmldecl,
'hproc' => \&dflt_proc,
);
@UC_Nstart = (
"\\x{C0}-\\x{D6}",
"\\x{D8}-\\x{F6}",
"\\x{F8}-\\x{2FF}",
"\\x{370}-\\x{37D}",
"\\x{37F}-\\x{1FFF}",
"\\x{200C}-\\x{200D}",
"\\x{2070}-\\x{218F}",
"\\x{2C00}-\\x{2FEF}",
"\\x{3001}-\\x{D7FF}",
"\\x{F900}-\\x{FDCF}",
"\\x{FDF0}-\\x{FFFD}",
"\\x{10000}-\\x{EFFFF}",
);
@UC_Nchar = (
"\\x{B7}",
"\\x{0300}-\\x{036F}",
"\\x{203F}-\\x{2040}",
);
$Nstrt = "[A-Za-z_:".join ('',@UC_Nstart)."]";
$Nchar = "[-\\w:\\.".join ('',@UC_Nchar).join ('',@UC_Nstart)."]";
$Name = "(?:$Nstrt$Nchar*?)";
#die "$Name\n";
$RxParse =
qr/(?:<(??\/*)($Name)\s*(\/*))|(?:META(.*?))|(?$Name)((?:\s+$Name\s*=\s*["'][^<]*['"])+)\s*(\/*))|(?:\?(.*?)\?)|(?:!(??OCTYPE(.*?))|(?:\[CDATA\[(.*?)\]\])|(?:--(.*?[^-])--)|(?:ATTLIST(.*?))|(?:ENTITY(.*?)))))>)|(.+?)/s;
# ( <( ( 1 12 2 3 3)|( 4 4)|( 5 56( ) 6 7 7)|( 8 8 )|( !( ( 9 9)|( 0 0 )|( 1 1 )|(
2 2)|( 3 3))))>)|4 4
$RxAttr = qr/^\s+($Name)\s*=\s*("|')/;
$RxAttr_DL1 = qr/^(?[^'&]*?)|([^']*?))'/;
$RxAttr_DL2 = qr/^(?[^"&]*?)|([^"]*?))"/;
$RxAttr_RM = qr/[^\s\n]+/;
$RxPi = qr/^($Name)\s+(.*?)$/s;
#[52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
#[53] AttDef ::= S Name S AttType S DefaultDecl
$RxENTITY = qr/\s+($Name)|(?:%\s+($Name))\s+(.*?)/s;
# 1 1 ( 2 2) 3 3
$Entities = "(?:amp)|(?:gt)|(?:lt)|(?:apos)|(?:quot)|(?:#(?[0-9]+)|(x[0-9a-fA-F]+)))"; # cat more entities
# ( #( 4 4|5 5))
$RxEntConv = qr/(.*?)(&|%)($Entities);/s;
# 1 12 23 3
%ents = (
'amp' =>'&',
'gt' =>'>',
'lt' =>'<',
'apos'=>"'",
'quot'=>"\"",
'&' => '',
'<' => '',
'>' => '',
"'" => '',
"\"" => ''
);
%ErrMsg = (
'01' => "\"rp_error_01, mark-up or reserved characters in content (line %s, col %s), malformed element? '%s'\\n\", \$line, \$col, \$datastr",
'02' => "\"rp_error_02, content at root level (line %s, col %s): '%s'\\n\", \$line, \$col, \$datastr",
'03' => "\"rp_error_03, element wants to be new root node (line %s, col %s): '%s'\\n\", \$line, \$col, \$datastr",
'04' => "\"rp_error_04, missing start tag for '/%s' (line %s, col %s)\\n\", \$datastr, \$line, \$col",
'05' => "\"rp_error_05, expected closing tag '/%s' (line %s, col %s)\\n\", \$datastr, \$line, \$col",
'06' => "\"rp_error_06, element wants to be new root node (line %s, col %s): '%s/'\\n\", \$line, \$col, \$datastr",
'07' => "\"rp_error_07, tag syntax '%s' (line %s, col %s)\\n\", \$datastr, \$line, \$col",
'08' => "\"rp_error_08, invalid, missing or extra tokens in attribute asignment (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'09' => "\"rp_error_09, CDATA content at root level (line %s, col %s): '%s'\\n\", \$line, \$col, \$datastr",
'10' => "\"rp_error_10, not a valid xml document\\n\"",
'11' => "\"rp_error_11, missing end tag '%s'\\n\", \$datastr",
'12' => "\"rp_error_12, mark-up or reserved characters in content (end), malformed element? '%s'\\n\", \$datastr",
'13' => "\"rp_error_13, content at root level (end): '%s'\\n\", \$datastr",
'14' => "\"rp_error_14, invalid, missing or extra tokens in xmldecl asignment (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'15' => "\"rp_error_15, missing xmldecl parameters (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'16' => "\"rp_error_16, invalid 'version' character data in xmldecl (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'17' => "\"rp_error_17, invalid 'encoding' character data in xmldecl (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'18' => "\"rp_error_18, invalid 'standalone' character data in xmldecl (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'19' => "\"rp_error_19, unknown xmldecl parameter (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'20' => "\"rp_error_20, missing xmldecl 'version' parameter (line %s, col %s): %s\\n\", \$line, \$col, \$datastr",
'21' => "\"rp_error_21, unknown or missing processing instruction parameters (line %s, col %s): '%s'\\n\", \$line, \$col, \$datastr",
'30' => "\"rp_error_30, already in parse\\n\"",
'31' => "\"rp_error_31, data source not defined\\n\"",
);
}
# Default Handlers
# =================
sub dflt_start {
my ($obj, $el, @attr) = @_;
if ($obj->{'debug'}) {
print "start _: $el\n";
while (my ($name,$val) = splice (@attr, 0,2)) {
print " "x12,"$name = $val\n";
}
}
}
sub dflt_end {my ($obj, $el) = @_;print "end _: /$el\n" if ($obj->{'debug'});}
sub dflt_char {my ($obj, $str) = @_;print "char _: $str\n" if ($obj->{'debug'});}
sub dflt_cdata {my ($obj, $str) = @_;print "cdata _: $str\n" if ($obj->{'debug'});}
sub dflt_comment {my ($obj, $str) = @_;print "comnt _: $str\n" if ($obj->{'debug'});}
sub dflt_meta {my ($obj, $str) = @_;print "meta _: $str\n" if ($obj->{'debug'});}
sub dflt_attlist {my ($obj, $parm) = @_;print "attlist_h _: $parm\n" if ($obj->{'debug'});}
sub dflt_entity {my ($obj, $parm) = @_;print "entity_h _: $parm\n" if ($obj->{'debug'});}
sub dflt_doctype {my ($obj, $parm) = @_;print "doctype_h _: $parm\n" if ($obj->{'debug'});}
sub dflt_element {my ($obj, $parm) = @_;print "element_h _: $parm\n" if ($obj->{'debug'});}
sub dflt_xmldecl {
my ($obj, $version,$encoding,$standalone) = @_;
if ($obj->{'debug'}) {
print "xmldecl_h _: version = $version\n" if (defined $encoding);
print " "x14,"encoding = $encoding\n" if (defined $encoding);
print " "x14,"standalone = $standalone\n" if (defined $standalone);
}
}
sub dflt_proc {
my ($obj, $target,$data) = @_;
if ($obj->{'debug'}) {
print "proc_h _: target = $target\n";
print " "x14,"data = $data\n";
}
}
__END__
#$RxParse =
qr/(?:<(??\/*)($Name)\s*(\/*))|(?:META(.*?))|(?$Name)((?:\s+$Name\s*=\s*["'][^<]*['"])+)\s*(\/*))|(?:\?(.*?)\?)|(?:!(??OCTYPE(.*?))|(?:\[CDATA\[(.*?)\]\])|(?:--(.*?[^-])--)|(?:ATTLIST(.*?))|(?:ENTITY(.*?)))))>)|(.+?)/s;
# ( <( ( 1 12 2 3 3)|( 4 4)|( 5 56( ) 6 7 7)|( 8 8 )|( !( ( 9 9)|( 0 0 )|( 1 1 )|(
2 2)|( 3 3))))>)|4 4
#$Entities = "(?:amp)|(?:gt)|(?:lt)|(?:apos)|(?:quot)|(?:#(?[0-9]+)|(x[0-9a-fA-F]+)))";
# ( #( 3 3 4 4))
#$RxEntConv = qr/(.*?)&($Entities);/s;