B
Befuddled
I am writing a function to have its argument, HTML-containing string,
return a DOM 1 Document Fragment, and so it seems the use of regular
expressions (REs) is a natural.
My problem is that the browsers (IE and Mozilla) that I am using to write
and debug have a different idea about parsing strings using REs. Here is
the starting example:
stringPtr = "<div id=\"errblock\" style=\"color:red;\">" +
"<p>This is a simple doc frag";
elem = stringPtr.match(/<(.+)>/);
This is only the LATEST in SEVERAL different revisions of the RE for
'elem'. What the debugger (Venkman, but IE does the same) keeps returning
in elem[1], the variable of interest, is a string that includes the DIV and
the P element. I made a table of all the REs I have tried and their
results:
RE: /\<(.+)>\)/
elem[0]: "<div id=\"something\" style=\"color:red;\"><p>"
elem[1]: "div id=\"something\" style=\"color:red;\"><p"
RE: /(\<.+\>)/
elem[0]: "<div id=\"something\" style=\"color:red;\"><p>"
elem[1]: "<div id=\"something\" style=\"color:red;\"><p>"
RE: /<(\w+)>/
elem[0]: "<p>"
elem[1]: "p"
RE: /<(\S+\s*\S*)>/
elem[0]: "<p>"
elem[1]: "p"
All of these seem wrong to me. So long as what occurs between the '<' and
'>' matches the criteria, the parser should return JUST the first element
(the DIV) and look for elements that contain may or may not contain
attributes, depending upon the RE within the parenthesized subexpression of
the RE.
The problem is, that is is matching on the P element, ignoring the '><'
that occurs in between. It should not matter whether whitespace precedes
the P element, since it is not required and browsers can make sense of it.
My intention is to have an RE that recognizes elements with and without
attributes, and also to deal with container text as well.
//============== contents of dom1.js ============
/* Note, at least half of the lines in the code are UNTESTED and
almost certainly RIDDLED WITH ERROR and EXCEPTION, and
so the code is likely to change, and especially to make use of
optimizations to get around slow performance */
var nonEtagoElements = [ "input", "br", "img", "hr", "col", "frame",
"meta", "link", "param", "base", "basefont" ];
var RequiredEtagoElements = {
a: [ "a" , "area", "applet", "address", "abbr", "acronym" ],
b: [ "b", "body", "blockquote", "big", "bdo" ],
c: [ "center", "caption", "cite", "code" ],
d: [ "div", "dfn", "dl", "del", "dir" ],
e: [ "em" ],
f: [ "form", "font", "fieldset" ],
i: [ "i", "iframe", "ins", "inindex" ],
k: [ "kbd" ],
l: [ "label", "legend" ],
m: [ "map", "menu" ],
n: [ "noscript", "noframes" ],
o: [ "ol", "optgroup", "object" ],
p: [ "pre" ],
q: [ "q" ],
s: [ "span", "strong", "sub", "sup", "script", "select", "style",
"small", "samp", "strike", "s" ],
t: [ "table" , "title", "tt" ],
u: [ "ul", "u" ],
v: [ "var" ]
};
var OptionalEtagoElements = [ "p", "tr", "td" , "th", "li",
"colgroup" , "option", "dd", "dt", "thead", "tfoot" ];
var ImpliedElements = [ "tbody", "head", "html" ];
function verifyElem(elemStr, option)
{
var i, j, x;
if ((j = RequiredEtagoElements[x = elemStr.charAt(0)].length) > 0)
for (i = 0; i < j; i++)
if (elemStr.toLowerCase() == RequiredEtagoElements[x])
return (true);
for (i = 0; i < OptionalEtagoElements.length; i++)
if (elemStr == OptionalEtagoElements)
return (true);
for (i = 0; i < ImpliedElements.length; i++)
if (elemStr == ImpliedElements)
return (true);
if (option == 1)
return (false);
for (i = 0; i < nonEtagoElements.length; i++)
if (elemStr == nonEtagoElements)
return (true);
return (false);
}
function isContainer(elemStr)
{
return (verifyElem(elemStr, 1));
}
function makeHTMLDocFrag(HTMLstring)
{
var i, j, etago, elem, elemNode, attrs, txt, tag;
var levelTagName = new Array(25);
var level = 0;
if (typeof(HTMLstring) == "undefined")
return (null);
var docFrag = document.createDocumentFragment();
var levelNode = docFrag;
var stringPtr = HTMLstring;
debugger;
while ((i = stringPtr.search(/<*\w+/)) >= 0)
{
if (stringPtr.charAt(i) == '<')
{
if (stringPtr.charAt(i + 1) == '/') // end tag
{
etago = stringPtr.match(/<\/(\S+)/);
if (etago[1] == levelTagName[level] &&
levelNode.parentNode != null)
{
levelNode = levelNode.parentNode;
level--;
}
}
else if (stringPtr.search(/<[hH][1-6]\s+/) == 0)
{ // special case of the header
elem = stringPtr.match(/<([hH][1-6])\s+/);
elemNode = document.createElement(elem[1]);
if (levelNode != null)
levelNode.appendChild(elemNode);
levelTagName[level++] = elem[1];
}
else // element that is not header
{
elem = stringPtr.match(/(\<.+\>)/);
tag = elem[1].match(/(\w+)/);
if (verifyElem(tag) == true)
{
elemNode = document.createElement(tag);
if (levelNode != null)
levelNode.appendChild(elemNode);
if (isContainer(tag) == true)
{
levelNode = elemNode;
levelTagName[level++] = tag;
}
if ((attrs = elem[1].match(/(\w+)=(\w+)/g)) != null)
for (j = 1; j < attrs.length; j += 2)
{
attrs[j + 1] = attrs[j + 1].replace(/"/g); /* " quote
commented out for syntax-highlighting editors */
elemNode.setAttributes(attrs[j], attrs[j + 1]);
}
return;
}
}
i = stringPtr.search(/>/);
}
else
{
txt = stringPtr.match(/(.*)</);
levelNode.appendChild(document.createTextNode(txt[1]));
i = stringPtr.search(/</);
}
stringPtr = stringPtr.substr(i, stringPtr.length - 1);
}
return (docFrag);
}
return a DOM 1 Document Fragment, and so it seems the use of regular
expressions (REs) is a natural.
My problem is that the browsers (IE and Mozilla) that I am using to write
and debug have a different idea about parsing strings using REs. Here is
the starting example:
stringPtr = "<div id=\"errblock\" style=\"color:red;\">" +
"<p>This is a simple doc frag";
elem = stringPtr.match(/<(.+)>/);
This is only the LATEST in SEVERAL different revisions of the RE for
'elem'. What the debugger (Venkman, but IE does the same) keeps returning
in elem[1], the variable of interest, is a string that includes the DIV and
the P element. I made a table of all the REs I have tried and their
results:
RE: /\<(.+)>\)/
elem[0]: "<div id=\"something\" style=\"color:red;\"><p>"
elem[1]: "div id=\"something\" style=\"color:red;\"><p"
RE: /(\<.+\>)/
elem[0]: "<div id=\"something\" style=\"color:red;\"><p>"
elem[1]: "<div id=\"something\" style=\"color:red;\"><p>"
RE: /<(\w+)>/
elem[0]: "<p>"
elem[1]: "p"
RE: /<(\S+\s*\S*)>/
elem[0]: "<p>"
elem[1]: "p"
All of these seem wrong to me. So long as what occurs between the '<' and
'>' matches the criteria, the parser should return JUST the first element
(the DIV) and look for elements that contain may or may not contain
attributes, depending upon the RE within the parenthesized subexpression of
the RE.
The problem is, that is is matching on the P element, ignoring the '><'
that occurs in between. It should not matter whether whitespace precedes
the P element, since it is not required and browsers can make sense of it.
My intention is to have an RE that recognizes elements with and without
attributes, and also to deal with container text as well.
//============== contents of dom1.js ============
/* Note, at least half of the lines in the code are UNTESTED and
almost certainly RIDDLED WITH ERROR and EXCEPTION, and
so the code is likely to change, and especially to make use of
optimizations to get around slow performance */
var nonEtagoElements = [ "input", "br", "img", "hr", "col", "frame",
"meta", "link", "param", "base", "basefont" ];
var RequiredEtagoElements = {
a: [ "a" , "area", "applet", "address", "abbr", "acronym" ],
b: [ "b", "body", "blockquote", "big", "bdo" ],
c: [ "center", "caption", "cite", "code" ],
d: [ "div", "dfn", "dl", "del", "dir" ],
e: [ "em" ],
f: [ "form", "font", "fieldset" ],
i: [ "i", "iframe", "ins", "inindex" ],
k: [ "kbd" ],
l: [ "label", "legend" ],
m: [ "map", "menu" ],
n: [ "noscript", "noframes" ],
o: [ "ol", "optgroup", "object" ],
p: [ "pre" ],
q: [ "q" ],
s: [ "span", "strong", "sub", "sup", "script", "select", "style",
"small", "samp", "strike", "s" ],
t: [ "table" , "title", "tt" ],
u: [ "ul", "u" ],
v: [ "var" ]
};
var OptionalEtagoElements = [ "p", "tr", "td" , "th", "li",
"colgroup" , "option", "dd", "dt", "thead", "tfoot" ];
var ImpliedElements = [ "tbody", "head", "html" ];
function verifyElem(elemStr, option)
{
var i, j, x;
if ((j = RequiredEtagoElements[x = elemStr.charAt(0)].length) > 0)
for (i = 0; i < j; i++)
if (elemStr.toLowerCase() == RequiredEtagoElements[x])
return (true);
for (i = 0; i < OptionalEtagoElements.length; i++)
if (elemStr == OptionalEtagoElements)
return (true);
for (i = 0; i < ImpliedElements.length; i++)
if (elemStr == ImpliedElements)
return (true);
if (option == 1)
return (false);
for (i = 0; i < nonEtagoElements.length; i++)
if (elemStr == nonEtagoElements)
return (true);
return (false);
}
function isContainer(elemStr)
{
return (verifyElem(elemStr, 1));
}
function makeHTMLDocFrag(HTMLstring)
{
var i, j, etago, elem, elemNode, attrs, txt, tag;
var levelTagName = new Array(25);
var level = 0;
if (typeof(HTMLstring) == "undefined")
return (null);
var docFrag = document.createDocumentFragment();
var levelNode = docFrag;
var stringPtr = HTMLstring;
debugger;
while ((i = stringPtr.search(/<*\w+/)) >= 0)
{
if (stringPtr.charAt(i) == '<')
{
if (stringPtr.charAt(i + 1) == '/') // end tag
{
etago = stringPtr.match(/<\/(\S+)/);
if (etago[1] == levelTagName[level] &&
levelNode.parentNode != null)
{
levelNode = levelNode.parentNode;
level--;
}
}
else if (stringPtr.search(/<[hH][1-6]\s+/) == 0)
{ // special case of the header
elem = stringPtr.match(/<([hH][1-6])\s+/);
elemNode = document.createElement(elem[1]);
if (levelNode != null)
levelNode.appendChild(elemNode);
levelTagName[level++] = elem[1];
}
else // element that is not header
{
elem = stringPtr.match(/(\<.+\>)/);
tag = elem[1].match(/(\w+)/);
if (verifyElem(tag) == true)
{
elemNode = document.createElement(tag);
if (levelNode != null)
levelNode.appendChild(elemNode);
if (isContainer(tag) == true)
{
levelNode = elemNode;
levelTagName[level++] = tag;
}
if ((attrs = elem[1].match(/(\w+)=(\w+)/g)) != null)
for (j = 1; j < attrs.length; j += 2)
{
attrs[j + 1] = attrs[j + 1].replace(/"/g); /* " quote
commented out for syntax-highlighting editors */
elemNode.setAttributes(attrs[j], attrs[j + 1]);
}
return;
}
}
i = stringPtr.search(/>/);
}
else
{
txt = stringPtr.match(/(.*)</);
levelNode.appendChild(document.createTextNode(txt[1]));
i = stringPtr.search(/</);
}
stringPtr = stringPtr.substr(i, stringPtr.length - 1);
}
return (docFrag);
}