R
robic0
On Tue, 20 Dec 2005 23:59:06 -0800, robic0 wrote:
Final edition .904
This is the last, it will not be continued. Further processing could
have been done for all tag entities and alot of specialized handling.
However due to the recent awareness of the latencies involved in regexp
(re)searches, the "inner" substitution method is not viable in this context.
Thats all I will say on this. Content is not examined for xml reserved
characters (ie: &, etc..), although its easily added. Kind of too bad,
this method has alot of potential.
Anyway, for now its just an exercise in xml/html form and structure.
This status could change in the future as I do a more detailed post-mortum.
Should it change significantly, you won't hear about it in the forums.
I don't anticipate that being the case, but who knows.
-robic0-
Changes:
- Added to regexp, white space to account for <tag />
print <<EOM;
# -----------------------
# XML Regex Parser
# Version .904 - 12/31/05
# Copyright 2005,
# by robic0-At-yahoo.com
# -----------------------
EOM
use strict;
use warnings;
use Data:umper;
open DATA, "your.html" or die "can't open your.html...";
my $gabage1 = join ('', <DATA>);
close DATA;
print "here\n";
my @xml_strings = ($gabage1);
my $alt_debug = 0;
my $VERSION = .904;
my $debug = 0;
my $rmv_white_space = 0;
my $ForceArray = 0;
my $KeepRoot = 0;
my $KeepComments = 0;
my $KeepContentOrder = 0;
## -- XML, start & end regexp substitution delimiter chars --
## match side , substitution side
## -------------------------/-------------------------------
my (@S_dlim, @E_dlim);
if ($debug) {
@S_dlim = ('\[' , '['); # use these for debug
@E_dlim = ('\]' , ']');
} else {
@S_dlim = (chr(140) , chr(140)); # use these for production
@E_dlim = (chr(141) , chr(141));
}
## -- Process xml data --
##
for (@xml_strings)
{
#print "\n",'*'x30,"\nXML string:\n",'-'x15,"\n$_\n\nOutput:\n",'-'x15,"\n\n" if ($debug);
if ($alt_debug) {
ProcessAltDebugInfo ($_) ;
print "\n";
}
my $ROOT = {}; # container
my %cdata_elements = ();
my ($last_cnt, $cnt, $i, $attr_error) = (-1, 1, 0, 0);
## Comment/CDATA block ==================================
#### To be done first -
# -- Questionable Comments --
while (s/(<!--(.*?)-->)/$S_dlim[1]$cnt$E_dlim[1]/) {
#print "$cnt = Questionable comment: $1\n" if ($debug);
$ROOT->{$cnt} = $1;
$cnt++;
}
#### To be done second -
# -- Real CDATA --
while (s/<!\[CDATA\[(.*?)\]\]>/$S_dlim[1]$cnt$E_dlim[1]/s)
{
# reconstitute cdata contents
my $cdata_contents = $1;
my $str = '';
while ($cdata_contents =~ s/([^$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//) {
if (defined $1) {
$str .= $1;
} elsif (defined $2 && exists $ROOT->{$2}) {
$str .= $ROOT->{$2};
delete $ROOT->{$2};
} else {} # shouldn't get here
}
print "$cnt CDATA = $str\n" if ($debug);
$ROOT->{$cnt} = $str;
$cdata_elements{$cnt} = '';
$cnt++;
}
#### To be done third -
# -- Real Comments are left --
foreach my $key (sort {$a <=> $b} keys %{$ROOT}) {
if (!exists $cdata_elements{$key}) {
$ROOT->{$key} =~ s/^<!--(.*?)-->$/$1/s;
print "$key Comment = $1\n" if ($debug);
if ($KeepComments) {
$ROOT->{$key} = { comment => $1 };
} else {delete $ROOT->{$key};}
}
}
## End Comment/CDATA block ==============================
#### Non-tag markups go here -
####
# -- Versioning -- <?XML-Version ?> - Placeholder, voided
while (s/<\?([^<>]*)\?>//) {
#while (s/<\?([^<>]*)\?>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <? ?> = $1\n" if ($debug);
$ROOT->{$cnt} = { 'XMLV' => $1 };
$cnt++;
}
# -- DOCTYPE -- <!DOCTYPE info> - Placeholder, voided
while (s/<!DOCTYPE([^<>]*)>//) {
#while (s/<!DOCTYPE([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt DOCTYPE = $1\n" if ($debug);
$ROOT->{$cnt} = { 'DOCTYPE' => $1 };
$cnt++;
}
# -- META -- <META info> - Placeholder, voided
while (s/<META([^<>]*)>//) {
#while (s/<META([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt META = $1\n" if ($debug);
$ROOT->{$cnt} = { 'META' => $1 };
$cnt++;
}
#### White space removal before tags ? .. TBD -
if ($rmv_white_space) {
s/>[\s]+</></g;
s/[\s]+</</g;
s/>[\s]+/>/g;
}
#### Tags here - should only need 2 iterations max
my $finished = 0;
while ($cnt != $last_cnt && $i < 20)
{
$last_cnt = $cnt;
## <Tag/> , no content
while (s/<([0-9a-zA-Z]+)[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = \n" if ($debug);
$ROOT->{$cnt} = { $1 => '' };
$cnt++;
}
## <Tag Attributes/> , no content
while (s/<([0-9a-zA-Z]+)([\s]+[0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"])+[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = attr: $2\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
$ROOT->{$cnt} = { $1 => $hattrib };
$cnt++;
}
## <Tag> Content </Tag>
while (s/<([0-9a-zA-Z]+)>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = $2\n" if ($debug);
my $unknown = '';
if (length($2) > 0) {
my $hcontent = getContentHash($2, $ROOT, \%cdata_elements);
$unknown = $hcontent;
if (keys (%{$hcontent}) > 1) {
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
} else {
if (exists $hcontent->{'content'}) {
my ($key);
if (!$ForceArray ) {
if (ref($hcontent->{'content'}) eq "ARRAY" && scalar(@{$hcontent->{'content'}}) == 1) {
$unknown = ${$hcontent->{'content'}}[0];
}
else {$unknown = $hcontent->{'content'}; }
}
}
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
}
}
$ROOT->{$cnt} = { $1 => $unknown };
$cnt++;
}
last if ($attr_error);
## <Tag Attributes> Content </Tag>
while (s/<([0-9a-zA-Z]+)([\s]+[0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"][\s]*)+[\s]*>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = attr: $2, content: $3\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
if (length($3) > 0) {
my $hcontent = getContentHash($3, $ROOT, \%cdata_elements);
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
while (my ($key,$val) = each (%{$hcontent})) {
$hattrib->{$key} = $val;
}
}
$ROOT->{$cnt} = { $1 => $hattrib };
$cnt++;
}
last if ($attr_error);
if ($last_cnt != $cnt) {
$i++; print "** End pass $i\n" if ($debug);
} else {
last if ($finished);
## Encapsulate the xml with a "root"
$_ = "<root>$_</root>";
$last_cnt--;
$finished = 1;
}
}
next if ($attr_error);
if (/<|>/) {
print "($i) XML problem: malformed, syntax or tag closure:\n$_";
} else {
print "** Itterations = $i\n".
"** Debug = $debug\n".
"** Rmv white space = $rmv_white_space\n".
"** ForceArray = $ForceArray\n".
"** KeepRoot = $KeepRoot\n".
"** KeepComments = $KeepComments\n".
"** KeepContentOrder = $KeepContentOrder\n";
#print Dumper($ROOT);
print "The remaining string is:\n$_\n\n" if ($debug);
## Strip off the outer element (our root) to
## examine the contents for errors.
## ---------------------------------------
my $outer_element = $cnt-1;
if (exists $ROOT->{$outer_element})
{
my $hroot = $ROOT->{$outer_element};
my ($key,$val) = each (%{$hroot});
my $htodump = $val;
# check for errors in root
if (ref($htodump) ne "HASH" || (!$KeepContentOrder && exists $htodump->{'content'})) {
my $msg = 'Error';
$msg = 'Warning' if ($KeepContentOrder);
print "$msg, bare content at root level ..\n";
} else {
my $dmp_keys = keys (%{$htodump});
if ($dmp_keys > 1) {
print "Warning, multiple elements at root level ..\n";
} else {
($key,$val) = each (%{$htodump});
my $val_type = ref($val);
if ($dmp_keys == 0 || (exists $htodump->{'comment'})) {
print "Warning, no elements at root level ..\n";
}
if ($dmp_keys == 1) {
if ($val_type eq "HASH") {
$htodump = $val if (!$KeepRoot);
}
elsif ($val_type eq "ARRAY") {
$htodump = $val if (!$KeepRoot && $KeepContentOrder);
if (!$ForceArray || scalar(@{$val}) > 1) {
print "Warning, multiple elements at root level ..\n";
}
}
}
}
}
print "\n";
my $tmp = undef;
if (ref($htodump) eq "HASH") {
$tmp = {};
%{$tmp} = %{$htodump};
} elsif (ref($htodump) eq "ARRAY") {
$tmp = [];
@{$tmp} = @{$htodump};
} else {
print "Not a hash or array!\n";
}
print Dumper($tmp) if (defined $tmp);
} else {
print "nothing to output!\n";
}
}
}
##
sub adjustForSingleItemArrays
{
my $href = shift;
## if $val is an array ref and has one element
## set $href->{$key} equal to the element
while (my ($key,$val) = each (%{$href})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1) {
$href->{$key} = $val->[0];
}
}
}
}
##
sub getAttrHash
{
my $attrstr = shift;
my $ahref = {};
return $ahref unless (defined $attrstr);
while ($attrstr =~ s/[\s]*([0-9a-zA-Z]+)[\s]*=[\s]*("|')([^=]*)\2[\s]*//) {
$ahref->{$1} = $3;
}
if ($attrstr=~/=/) {
$attrstr =~ s/^\s+//s;
$attrstr =~ s/\s+$//s;
return $attrstr
}
return $ahref;
}
##
sub getContentHash
{
my ($contstr,$hStore,$hcdata_elements) = @_;
my $ahref = {};
return $ahref unless (defined $contstr && defined $hStore && defined $hcdata_elements);
my @ary = ();
my $append_flag = 0;
while ($contstr =~ s/^([^<$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//s)
{
## -- $1 is text contents --
if (defined $1) {
my $tmp1 = $1;
# if flagged, append it to $ary[last]
if ($append_flag && scalar(@ary) > 0) {
my $size = scalar(@ary);
$ary[$size-1] .= $tmp1;
} else {
push (@ary, $1);
}
$append_flag = 0;
}
## -- $2 is substitution index --
elsif (defined $2) {
## Exist check (Comments stripped?),
# turn on append flag.
# -----------------------------------
if (!exists $hStore->{$2}) {
$append_flag = 1;
next;
}
## CDATA check, append it to $ary[last]
# and turn on append flag.
# ---------------------------------------
if (exists $hcdata_elements->{$2}) {
my $size = scalar(@ary);
if ($size > 0) {
$ary[$size-1] .= $hStore->{$2};
} else {push (@ary, $hStore->{$2});}
$append_flag = 1;
next;
}
$append_flag = 0;
## Substitution of in-line content,
# push it to @ary
# ----------------------------------
if ($KeepContentOrder) {
push (@ary, $hStore->{$2});
next;
}
## Substitution of same level here (normal),
# just store it to $ahref
# -----------------------------------------
my ($key,$val) = each (%{$hStore->{$2}});
if (exists $ahref->{$key}) {
push (@{$ahref->{$key}}, $val);
} else {
$ahref->{$key} = [$val];
}
}
else {} # shouldn't get here
}
# Store contents, strip out
# pure whitespace text elements
my $hary = [];
for (@ary) {
next if (/^\s+$/s);
push (@{$hary}, $_);
}
if (scalar(@{$hary}) > 0) {
$ahref->{'content'} = $hary;
}
## if $val is an array ref and has one element and it
## is a hash ref, set {$key} equal to hash ref
if (!$ForceArray) {
while (my ($key,$val) = each (%{$ahref})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1 && ref($val->[0]) eq "HASH") {
$ahref->{$key} = $val->[0];
}
}
}
}
return $ahref;
}
sub ProcessAltDebugInfo
{
}
Final edition .904
This is the last, it will not be continued. Further processing could
have been done for all tag entities and alot of specialized handling.
However due to the recent awareness of the latencies involved in regexp
(re)searches, the "inner" substitution method is not viable in this context.
Thats all I will say on this. Content is not examined for xml reserved
characters (ie: &, etc..), although its easily added. Kind of too bad,
this method has alot of potential.
Anyway, for now its just an exercise in xml/html form and structure.
This status could change in the future as I do a more detailed post-mortum.
Should it change significantly, you won't hear about it in the forums.
I don't anticipate that being the case, but who knows.
-robic0-
Changes:
- Added to regexp, white space to account for <tag />
print <<EOM;
# -----------------------
# XML Regex Parser
# Version .904 - 12/31/05
# Copyright 2005,
# by robic0-At-yahoo.com
# -----------------------
EOM
use strict;
use warnings;
use Data:umper;
open DATA, "your.html" or die "can't open your.html...";
my $gabage1 = join ('', <DATA>);
close DATA;
print "here\n";
my @xml_strings = ($gabage1);
my $alt_debug = 0;
my $VERSION = .904;
my $debug = 0;
my $rmv_white_space = 0;
my $ForceArray = 0;
my $KeepRoot = 0;
my $KeepComments = 0;
my $KeepContentOrder = 0;
## -- XML, start & end regexp substitution delimiter chars --
## match side , substitution side
## -------------------------/-------------------------------
my (@S_dlim, @E_dlim);
if ($debug) {
@S_dlim = ('\[' , '['); # use these for debug
@E_dlim = ('\]' , ']');
} else {
@S_dlim = (chr(140) , chr(140)); # use these for production
@E_dlim = (chr(141) , chr(141));
}
## -- Process xml data --
##
for (@xml_strings)
{
#print "\n",'*'x30,"\nXML string:\n",'-'x15,"\n$_\n\nOutput:\n",'-'x15,"\n\n" if ($debug);
if ($alt_debug) {
ProcessAltDebugInfo ($_) ;
print "\n";
}
my $ROOT = {}; # container
my %cdata_elements = ();
my ($last_cnt, $cnt, $i, $attr_error) = (-1, 1, 0, 0);
## Comment/CDATA block ==================================
#### To be done first -
# -- Questionable Comments --
while (s/(<!--(.*?)-->)/$S_dlim[1]$cnt$E_dlim[1]/) {
#print "$cnt = Questionable comment: $1\n" if ($debug);
$ROOT->{$cnt} = $1;
$cnt++;
}
#### To be done second -
# -- Real CDATA --
while (s/<!\[CDATA\[(.*?)\]\]>/$S_dlim[1]$cnt$E_dlim[1]/s)
{
# reconstitute cdata contents
my $cdata_contents = $1;
my $str = '';
while ($cdata_contents =~ s/([^$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//) {
if (defined $1) {
$str .= $1;
} elsif (defined $2 && exists $ROOT->{$2}) {
$str .= $ROOT->{$2};
delete $ROOT->{$2};
} else {} # shouldn't get here
}
print "$cnt CDATA = $str\n" if ($debug);
$ROOT->{$cnt} = $str;
$cdata_elements{$cnt} = '';
$cnt++;
}
#### To be done third -
# -- Real Comments are left --
foreach my $key (sort {$a <=> $b} keys %{$ROOT}) {
if (!exists $cdata_elements{$key}) {
$ROOT->{$key} =~ s/^<!--(.*?)-->$/$1/s;
print "$key Comment = $1\n" if ($debug);
if ($KeepComments) {
$ROOT->{$key} = { comment => $1 };
} else {delete $ROOT->{$key};}
}
}
## End Comment/CDATA block ==============================
#### Non-tag markups go here -
####
# -- Versioning -- <?XML-Version ?> - Placeholder, voided
while (s/<\?([^<>]*)\?>//) {
#while (s/<\?([^<>]*)\?>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <? ?> = $1\n" if ($debug);
$ROOT->{$cnt} = { 'XMLV' => $1 };
$cnt++;
}
# -- DOCTYPE -- <!DOCTYPE info> - Placeholder, voided
while (s/<!DOCTYPE([^<>]*)>//) {
#while (s/<!DOCTYPE([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt DOCTYPE = $1\n" if ($debug);
$ROOT->{$cnt} = { 'DOCTYPE' => $1 };
$cnt++;
}
# -- META -- <META info> - Placeholder, voided
while (s/<META([^<>]*)>//) {
#while (s/<META([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt META = $1\n" if ($debug);
$ROOT->{$cnt} = { 'META' => $1 };
$cnt++;
}
#### White space removal before tags ? .. TBD -
if ($rmv_white_space) {
s/>[\s]+</></g;
s/[\s]+</</g;
s/>[\s]+/>/g;
}
#### Tags here - should only need 2 iterations max
my $finished = 0;
while ($cnt != $last_cnt && $i < 20)
{
$last_cnt = $cnt;
## <Tag/> , no content
while (s/<([0-9a-zA-Z]+)[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = \n" if ($debug);
$ROOT->{$cnt} = { $1 => '' };
$cnt++;
}
## <Tag Attributes/> , no content
while (s/<([0-9a-zA-Z]+)([\s]+[0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"])+[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = attr: $2\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
$ROOT->{$cnt} = { $1 => $hattrib };
$cnt++;
}
## <Tag> Content </Tag>
while (s/<([0-9a-zA-Z]+)>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = $2\n" if ($debug);
my $unknown = '';
if (length($2) > 0) {
my $hcontent = getContentHash($2, $ROOT, \%cdata_elements);
$unknown = $hcontent;
if (keys (%{$hcontent}) > 1) {
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
} else {
if (exists $hcontent->{'content'}) {
my ($key);
if (!$ForceArray ) {
if (ref($hcontent->{'content'}) eq "ARRAY" && scalar(@{$hcontent->{'content'}}) == 1) {
$unknown = ${$hcontent->{'content'}}[0];
}
else {$unknown = $hcontent->{'content'}; }
}
}
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
}
}
$ROOT->{$cnt} = { $1 => $unknown };
$cnt++;
}
last if ($attr_error);
## <Tag Attributes> Content </Tag>
while (s/<([0-9a-zA-Z]+)([\s]+[0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"][\s]*)+[\s]*>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = attr: $2, content: $3\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
if (length($3) > 0) {
my $hcontent = getContentHash($3, $ROOT, \%cdata_elements);
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
while (my ($key,$val) = each (%{$hcontent})) {
$hattrib->{$key} = $val;
}
}
$ROOT->{$cnt} = { $1 => $hattrib };
$cnt++;
}
last if ($attr_error);
if ($last_cnt != $cnt) {
$i++; print "** End pass $i\n" if ($debug);
} else {
last if ($finished);
## Encapsulate the xml with a "root"
$_ = "<root>$_</root>";
$last_cnt--;
$finished = 1;
}
}
next if ($attr_error);
if (/<|>/) {
print "($i) XML problem: malformed, syntax or tag closure:\n$_";
} else {
print "** Itterations = $i\n".
"** Debug = $debug\n".
"** Rmv white space = $rmv_white_space\n".
"** ForceArray = $ForceArray\n".
"** KeepRoot = $KeepRoot\n".
"** KeepComments = $KeepComments\n".
"** KeepContentOrder = $KeepContentOrder\n";
#print Dumper($ROOT);
print "The remaining string is:\n$_\n\n" if ($debug);
## Strip off the outer element (our root) to
## examine the contents for errors.
## ---------------------------------------
my $outer_element = $cnt-1;
if (exists $ROOT->{$outer_element})
{
my $hroot = $ROOT->{$outer_element};
my ($key,$val) = each (%{$hroot});
my $htodump = $val;
# check for errors in root
if (ref($htodump) ne "HASH" || (!$KeepContentOrder && exists $htodump->{'content'})) {
my $msg = 'Error';
$msg = 'Warning' if ($KeepContentOrder);
print "$msg, bare content at root level ..\n";
} else {
my $dmp_keys = keys (%{$htodump});
if ($dmp_keys > 1) {
print "Warning, multiple elements at root level ..\n";
} else {
($key,$val) = each (%{$htodump});
my $val_type = ref($val);
if ($dmp_keys == 0 || (exists $htodump->{'comment'})) {
print "Warning, no elements at root level ..\n";
}
if ($dmp_keys == 1) {
if ($val_type eq "HASH") {
$htodump = $val if (!$KeepRoot);
}
elsif ($val_type eq "ARRAY") {
$htodump = $val if (!$KeepRoot && $KeepContentOrder);
if (!$ForceArray || scalar(@{$val}) > 1) {
print "Warning, multiple elements at root level ..\n";
}
}
}
}
}
print "\n";
my $tmp = undef;
if (ref($htodump) eq "HASH") {
$tmp = {};
%{$tmp} = %{$htodump};
} elsif (ref($htodump) eq "ARRAY") {
$tmp = [];
@{$tmp} = @{$htodump};
} else {
print "Not a hash or array!\n";
}
print Dumper($tmp) if (defined $tmp);
} else {
print "nothing to output!\n";
}
}
}
##
sub adjustForSingleItemArrays
{
my $href = shift;
## if $val is an array ref and has one element
## set $href->{$key} equal to the element
while (my ($key,$val) = each (%{$href})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1) {
$href->{$key} = $val->[0];
}
}
}
}
##
sub getAttrHash
{
my $attrstr = shift;
my $ahref = {};
return $ahref unless (defined $attrstr);
while ($attrstr =~ s/[\s]*([0-9a-zA-Z]+)[\s]*=[\s]*("|')([^=]*)\2[\s]*//) {
$ahref->{$1} = $3;
}
if ($attrstr=~/=/) {
$attrstr =~ s/^\s+//s;
$attrstr =~ s/\s+$//s;
return $attrstr
}
return $ahref;
}
##
sub getContentHash
{
my ($contstr,$hStore,$hcdata_elements) = @_;
my $ahref = {};
return $ahref unless (defined $contstr && defined $hStore && defined $hcdata_elements);
my @ary = ();
my $append_flag = 0;
while ($contstr =~ s/^([^<$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//s)
{
## -- $1 is text contents --
if (defined $1) {
my $tmp1 = $1;
# if flagged, append it to $ary[last]
if ($append_flag && scalar(@ary) > 0) {
my $size = scalar(@ary);
$ary[$size-1] .= $tmp1;
} else {
push (@ary, $1);
}
$append_flag = 0;
}
## -- $2 is substitution index --
elsif (defined $2) {
## Exist check (Comments stripped?),
# turn on append flag.
# -----------------------------------
if (!exists $hStore->{$2}) {
$append_flag = 1;
next;
}
## CDATA check, append it to $ary[last]
# and turn on append flag.
# ---------------------------------------
if (exists $hcdata_elements->{$2}) {
my $size = scalar(@ary);
if ($size > 0) {
$ary[$size-1] .= $hStore->{$2};
} else {push (@ary, $hStore->{$2});}
$append_flag = 1;
next;
}
$append_flag = 0;
## Substitution of in-line content,
# push it to @ary
# ----------------------------------
if ($KeepContentOrder) {
push (@ary, $hStore->{$2});
next;
}
## Substitution of same level here (normal),
# just store it to $ahref
# -----------------------------------------
my ($key,$val) = each (%{$hStore->{$2}});
if (exists $ahref->{$key}) {
push (@{$ahref->{$key}}, $val);
} else {
$ahref->{$key} = [$val];
}
}
else {} # shouldn't get here
}
# Store contents, strip out
# pure whitespace text elements
my $hary = [];
for (@ary) {
next if (/^\s+$/s);
push (@{$hary}, $_);
}
if (scalar(@{$hary}) > 0) {
$ahref->{'content'} = $hary;
}
## if $val is an array ref and has one element and it
## is a hash ref, set {$key} equal to hash ref
if (!$ForceArray) {
while (my ($key,$val) = each (%{$ahref})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1 && ref($val->[0]) eq "HASH") {
$ahref->{$key} = $val->[0];
}
}
}
}
return $ahref;
}
sub ProcessAltDebugInfo
{
}