A
apiringmvp
All,
I am creating a function that extracts a short blurb of HTML from a
blog post. I would like to retain all HTML formatting and images. The
code below works well, with the exception of one issue.
My issue:
---------------------
When a blog's HTML has attributes with no quotes, I get an exception.
Here's the example of the blog I am dealing with.
<p align=center>Some text from the blog.</p>
Questions:
----------------------
Is there a way to get the XmlTextReader to allow attributes without
quotes?
If not, would you recommend using a regex to add the missing quotes?
If so, does anyone know a regex that could do this replacement?
Code:
----------------------
/// <summary>
/// Builds a short blurb from a piece of blog HTML, keeping the markup
/// (elements and attributes) around roughly the first <paramref name="len"/>
/// characters of text.
/// </summary>
/// <param name="content">
/// The HTML to shorten. Content that does not start with a tag is wrapped in
/// a paragraph element first. A null value yields an empty string.
/// </param>
/// <param name="len">Approximate maximum number of text characters to keep.</param>
/// <returns>
/// The shortened markup when <paramref name="content"/> parses as XML;
/// otherwise a plain-text fallback with all tags stripped, wrapped in
/// paragraph tags when it had to be truncated.
/// </returns>
public static string GetContentShortBlurb(string content, int len)
{
    if (content == null)
    {
        return string.Empty;
    }

    try
    {
        if (!content.TrimStart(' ', '\r', '\n').StartsWith("<", StringComparison.Ordinal))
        {
            content = "<p>" + content + "</p>";
        }

        System.Text.StringBuilder sb = new System.Text.StringBuilder();
        XmlWriterSettings xws = new XmlWriterSettings();
        // Fragment level lets several top-level elements be written.
        xws.ConformanceLevel = ConformanceLevel.Fragment;
        xws.Encoding = new System.Text.UTF8Encoding(false);

        // A synthetic <doc> root gives the reader a single root element even
        // when the content holds several sibling elements.
        using (XmlTextReader xr = new XmlTextReader(
            new System.IO.StringReader("<doc>" + content + "</doc>")))
        using (XmlWriter xw = XmlWriter.Create(sb, xws))
        {
            xr.Read(); // consume the synthetic <doc> start tag

            int strCount = 0;     // text characters written so far
            int openElements = 0; // elements started but not yet ended

            while (strCount < len && xr.Read())
            {
                if (xr.NodeType == XmlNodeType.EndElement)
                {
                    if (xr.Name == "doc")
                    {
                        break;
                    }
                    xw.WriteEndElement();
                    openElements--;
                }
                else if (xr.NodeType == XmlNodeType.Element)
                {
                    // Must be captured before moving onto the attributes;
                    // a self-closing element never produces an EndElement node.
                    bool isEmpty = xr.IsEmptyElement;

                    xw.WriteStartElement(xr.Name);
                    while (xr.MoveToNextAttribute())
                    {
                        xw.WriteAttributeString(xr.Name, xr.Value);
                    }

                    if (isEmpty)
                    {
                        xw.WriteEndElement();
                    }
                    else
                    {
                        openElements++;
                    }
                }
                else if (xr.NodeType == XmlNodeType.Text)
                {
                    string inner = xr.Value;
                    int remaining = len - strCount;

                    if (inner.Length > remaining)
                    {
                        // Prefer cutting at a word boundary; fall back to a
                        // hard cut when the text has no space in range.
                        int cut = inner.LastIndexOf(' ', remaining);
                        if (cut < 0)
                        {
                            cut = remaining;
                            // Do not split a surrogate pair with the hard cut.
                            if (cut > 0 && char.IsHighSurrogate(inner[cut - 1]))
                            {
                                cut--;
                            }
                        }

                        xw.WriteString(inner.Substring(0, cut) + " ...");
                        // The blurb ends at the truncation marker; nothing
                        // may be appended after it.
                        break;
                    }

                    xw.WriteString(inner);
                    strCount += inner.Length;
                }
            }

            // Close whatever was still open when the budget ran out.
            for (int i = 0; i < openElements; i++)
            {
                xw.WriteEndElement();
            }
        }

        // The writer is disposed (and therefore flushed) at this point.
        return Regex.Replace(sb.ToString(), "<\\?xml\\b[^>]*>", "");
    }
    catch (Exception)
    {
        // Best-effort fallback for markup that is not well-formed XML
        // (e.g. unquoted attributes): strip the tags and trim the text.
        string stripHtmlEx = "</?([A-Z][A-Z0-9]*)\\b[^>]*>";
        // IgnoreCase: the pattern is written in upper case but real-world
        // tags are usually lower case.
        string output = Regex.Replace(content, stripHtmlEx, "", RegexOptions.IgnoreCase);

        int limit = len < 0 ? 0 : len;
        if (output.Length > limit)
        {
            int cut = output.LastIndexOf(' ', limit);
            if (cut < 0)
            {
                cut = limit;
            }
            output = "<p>" + output.Substring(0, cut).Replace("\r\n", "</p>\r\n<p>") + " ....</p>";
        }
        return output;
    }
}
I am creating a function that extracts a short blurb of HTML from a
blog post. I would like to retain all HTML formatting and images. The
code below works well, with the exception of one issue.
My issue:
---------------------
When a blog's HTML has attributes with no quotes, I get an exception.
Here's the example of the blog I am dealing with.
<p align=center>Some text from the blog.</p>
Questions:
----------------------
Is there a way to get the XmlTextReader to allow attributes without
quotes?
If not, would you recommend using a regex to add the missing quotes?
If so, does anyone know a regex that could do this replacement?
Code:
----------------------
/// <summary>
/// Builds a short blurb from a piece of blog HTML, keeping the markup
/// (elements and attributes) around roughly the first <paramref name="len"/>
/// characters of text.
/// </summary>
/// <param name="content">
/// The HTML to shorten. Content that does not start with a tag is wrapped in
/// a paragraph element first. A null value yields an empty string.
/// </param>
/// <param name="len">Approximate maximum number of text characters to keep.</param>
/// <returns>
/// The shortened markup when <paramref name="content"/> parses as XML;
/// otherwise a plain-text fallback with all tags stripped, wrapped in
/// paragraph tags when it had to be truncated.
/// </returns>
public static string GetContentShortBlurb(string content, int len)
{
    if (content == null)
    {
        return string.Empty;
    }

    try
    {
        if (!content.TrimStart(' ', '\r', '\n').StartsWith("<", StringComparison.Ordinal))
        {
            content = "<p>" + content + "</p>";
        }

        System.Text.StringBuilder sb = new System.Text.StringBuilder();
        XmlWriterSettings xws = new XmlWriterSettings();
        // Fragment level lets several top-level elements be written.
        xws.ConformanceLevel = ConformanceLevel.Fragment;
        xws.Encoding = new System.Text.UTF8Encoding(false);

        // A synthetic <doc> root gives the reader a single root element even
        // when the content holds several sibling elements.
        using (XmlTextReader xr = new XmlTextReader(
            new System.IO.StringReader("<doc>" + content + "</doc>")))
        using (XmlWriter xw = XmlWriter.Create(sb, xws))
        {
            xr.Read(); // consume the synthetic <doc> start tag

            int strCount = 0;     // text characters written so far
            int openElements = 0; // elements started but not yet ended

            while (strCount < len && xr.Read())
            {
                if (xr.NodeType == XmlNodeType.EndElement)
                {
                    if (xr.Name == "doc")
                    {
                        break;
                    }
                    xw.WriteEndElement();
                    openElements--;
                }
                else if (xr.NodeType == XmlNodeType.Element)
                {
                    // Must be captured before moving onto the attributes;
                    // a self-closing element never produces an EndElement node.
                    bool isEmpty = xr.IsEmptyElement;

                    xw.WriteStartElement(xr.Name);
                    while (xr.MoveToNextAttribute())
                    {
                        xw.WriteAttributeString(xr.Name, xr.Value);
                    }

                    if (isEmpty)
                    {
                        xw.WriteEndElement();
                    }
                    else
                    {
                        openElements++;
                    }
                }
                else if (xr.NodeType == XmlNodeType.Text)
                {
                    string inner = xr.Value;
                    int remaining = len - strCount;

                    if (inner.Length > remaining)
                    {
                        // Prefer cutting at a word boundary; fall back to a
                        // hard cut when the text has no space in range.
                        int cut = inner.LastIndexOf(' ', remaining);
                        if (cut < 0)
                        {
                            cut = remaining;
                            // Do not split a surrogate pair with the hard cut.
                            if (cut > 0 && char.IsHighSurrogate(inner[cut - 1]))
                            {
                                cut--;
                            }
                        }

                        xw.WriteString(inner.Substring(0, cut) + " ...");
                        // The blurb ends at the truncation marker; nothing
                        // may be appended after it.
                        break;
                    }

                    xw.WriteString(inner);
                    strCount += inner.Length;
                }
            }

            // Close whatever was still open when the budget ran out.
            for (int i = 0; i < openElements; i++)
            {
                xw.WriteEndElement();
            }
        }

        // The writer is disposed (and therefore flushed) at this point.
        return Regex.Replace(sb.ToString(), "<\\?xml\\b[^>]*>", "");
    }
    catch (Exception)
    {
        // Best-effort fallback for markup that is not well-formed XML
        // (e.g. unquoted attributes): strip the tags and trim the text.
        string stripHtmlEx = "</?([A-Z][A-Z0-9]*)\\b[^>]*>";
        // IgnoreCase: the pattern is written in upper case but real-world
        // tags are usually lower case.
        string output = Regex.Replace(content, stripHtmlEx, "", RegexOptions.IgnoreCase);

        int limit = len < 0 ? 0 : len;
        if (output.Length > limit)
        {
            int cut = output.LastIndexOf(' ', limit);
            if (cut < 0)
            {
                cut = limit;
            }
            output = "<p>" + output.Substring(0, cut).Replace("\r\n", "</p>\r\n<p>") + " ....</p>";
        }
        return output;
    }
}