Patrick said:
Hi,
I'm looking for a function to split URLs into their component parts, i.e.
protocol, host, path, filename, extension. I'm really only looking for
path and hostname (so I can download a webpage over sockets using C++).
Something equivalent to PHP's 'explode' function would be fine, or
even better PHP's 'spliturl' function.
Alternatively, if someone could recommend a better way to download data
(i.e. ASCII) into an array of some type (preferably char) or even a string,
given a URL, that would be even better.
I have trawled google and google groups for something to this effect, but
have come up dry...surely someone would have wanted to do this before - or
is there a standard library for this sort of thing that I'm missing?
Thanks in advance,
Patrick
I have attached part of a C++ utility library that I am working on.
It is licensed under the GPL.
Usage:
#include "at_url_parse.h"
#include <iostream>
int main()
{
AT_UrlParser x( "
http://foo/path" );
std::cout << "Path is " << x.m_path << "\n";
}
It fixes "../.." stuff and decodes "%xx" from the URL as well.
It will also combine two URLs.
I wrote this thing a very long time ago and I don't like some things
about it.
Anyhow, I hope it helps !
G
---------------- at_url_parse.h --------------------
//
//
// at_url_parse.h
//
//
#ifndef __at_url_parse_h__
#define __at_url_parse_h__
#include <string>
// ======== AT_UrlString ==============================================
/**
 * AT_UrlString is a std::string that additionally carries the flag
 * m_is_set, which distinguishes a "null" (never assigned) string from
 * one that was assigned an empty value.
 *
 * The constructors, assign(), operator= and the assign_encoded()
 * overloads declared here keep m_is_set up to date. The set of
 * wrappers is by no means complete: mutating the object through other
 * inherited std::string members does not touch m_is_set.
 *
 * AT_UrlParser uses this type to report whether a URL field was present.
 *
 * NOTE: std::string has no virtual destructor, so objects of this type
 * must not be deleted through a pointer to std::string.
 */
class AT_UrlString
  : public std::string
{
    public:

    /** true once a value has been assigned, false for a "null" string. */
    bool m_is_set;

    /**
     * Construct a set string holding a copy of i_string.
     * The base class is listed first in the initializer list because
     * that is the order in which initialization actually happens.
     */
    AT_UrlString( const std::string & i_string )
      : std::string( i_string ),
        m_is_set( true )
    {
    }

    /** Construct an empty string that is marked as not set. */
    AT_UrlString()
      : std::string(),
        m_is_set( false )
    {
    }

    /**
     * Replace the contents with the characters in [i1, i2) and mark
     * the string as set.
     *
     * @return a reference to this object
     */
    AT_UrlString & assign( AT_UrlString::const_iterator i1,
                           AT_UrlString::const_iterator i2 )
    {
        m_is_set = true;
        std::string::assign( i1, i2 );
        return * this;
    }

    /**
     * Replace the contents with a copy of i_str and mark the string
     * as set.
     *
     * @return a reference to this object
     */
    AT_UrlString & operator=( const std::string & i_str )
    {
        m_is_set = true;
        std::string::assign( i_str );
        return * this;
    }

    /**
     * UrlDecode
     * This will decode this string in place, replacing %xx and +
     * characters with the pre-encoded equivalents.
     */
    void UrlDecode();

    /**
     * Assign i_str, mark the string as set, then URL-decode it in place.
     *
     * @return a reference to this object
     */
    AT_UrlString & assign_encoded( const std::string & i_str )
    {
        * this = i_str;
        UrlDecode();
        return * this;
    }

    /**
     * Assign the characters in [i1, i2), mark the string as set, then
     * URL-decode it in place.
     *
     * @return a reference to this object
     */
    AT_UrlString & assign_encoded( AT_UrlString::const_iterator i1,
                                   AT_UrlString::const_iterator i2 )
    {
        assign( i1, i2 );
        UrlDecode();
        return * this;
    }
};
// ======== AT_UrlParser ============================================
/**
* Parsing class for the basic elemnts of a network URL
*
* See RFC 1738:
*
*/
class AT_UrlParser
{
public:
AT_UrlParser();
AT_UrlParser(
const AT_UrlString & i_url,
std::string * o_error_message = 0
);
AT_UrlParser(
const char * i_url,
std::string * o_error_message = 0
);
/**
* Parse the passed in URL.
*
* @param url is the url string to be parsed
* @return true if the url parsing was successful
*/
bool Parse(
const AT_UrlString & i_url,
std::string * o_error_message = 0
);
bool Parse(
const char * url,
std::string * o_error_message = 0
);
/**
* CombineHostURL
* Combine this URL with the URL of the hosturl. This merges
* 2 urls as though they are the normalized or host (hosturl) of a
* web page and an embedded (this) url in a web page.
*
* @param hosturl the normalized url to fill in the blanks of this.
*/
void CombineHostURL( const AT_UrlParser & hosturl );
/**
* WriteURL
* Write a URL given the data in this URL object
*
* @return a string of the generated url.
*/
std::string WriteURL();
enum {
InitialDefaultPortNo = 80
};
AT_UrlString m_scheme;
AT_UrlString m_host;
AT_UrlString m_port;
AT_UrlString m_user;
AT_UrlString m_pass;
AT_UrlString m_path;
AT_UrlString m_parse_error;
};
#endif // __at_url_parse_h__
---------------- at_url_parse.cpp --------------------
//
// at_url_parse.cpp
//
//
#include "at_url_parse.h"
#include <sstream>
// Report whether i_char is one of the hexadecimal digit characters
// '0'-'9', 'A'-'F' or 'a'-'f'.
static bool UrlIsXDigit( AT_UrlString::value_type i_char )
{
    const bool l_is_decimal = ( i_char >= '0' ) && ( i_char <= '9' );
    const bool l_is_upper_hex = ( i_char >= 'A' ) && ( i_char <= 'F' );
    const bool l_is_lower_hex = ( i_char >= 'a' ) && ( i_char <= 'f' );

    return l_is_decimal || l_is_upper_hex || l_is_lower_hex;
}
// ======== AT_UrlString::UrlDecode ===================================
// PURPOSE:
// decode the string in place.
//
void AT_UrlString::UrlDecode()
{
AT_UrlString::const_iterator p_read;
AT_UrlString::const_iterator p_end;
AT_UrlString::iterator p_write;
value_type l_char;
size_type l_num_reduce = 0;
p_write = begin();
p_read = begin();
p_end = end();
while ( p_read != p_end ) {
l_char = * p_read;
if (
( l_char == '%' ) &&
( p_read+1 != p_end ) && UrlIsXDigit( p_read[1] ) &&
( p_read+2 != p_end ) && UrlIsXDigit( p_read[2] )
) {
// Quickly convert from two hex digits to one character.
* p_write =
( ( (p_read[1] & 0xf) + ((p_read[1] >= 'A') ? 9 : 0) )
<< 4 )
| ( (p_read[2] & 0xf) + ((p_read[2] >= 'A') ? 9 : 0) )
;
p_read += 2;
l_num_reduce += 2;
} else if ( l_char == '+' ) {
// Undo the encoding that replaces spaces with plus signs.
* p_write = ' ';
} else {
* p_write = l_char;
}
p_write ++;
p_read ++;
}
if ( l_num_reduce ) {
resize( size() - l_num_reduce );
}
}
// Construct a parser and parse i_url straight away. The boolean result
// of Parse is discarded here; o_error_message is simply passed through
// to Parse.
AT_UrlParser::AT_UrlParser(
    const AT_UrlString & i_url,
    std::string * o_error_message
)
{
    static_cast<void>( this->Parse( i_url, o_error_message ) );
}
// Construct a parser and parse the C string i_url straight away. The
// boolean result of Parse is discarded here; o_error_message is simply
// passed through to Parse.
AT_UrlParser::AT_UrlParser(
    const char * i_url,
    std::string * o_error_message
)
{
    static_cast<void>( this->Parse( i_url, o_error_message ) );
}
// ======== RemoveDotDot ======================================
//
// Remove "/./" and "xxx/../" parts of the path. Some servers don't
// allow '/..' sequences since it's a potential security threat. This
// mimics what netscape does. The string passed is modified in place.
//
// Rules applied while copying the path over itself:
//   "/./"   -> the "/." is dropped and scanning resumes at the second '/'.
//   "/../"  -> the "/.." is dropped together with the most recently
//              kept path segment and the '/' in front of it; if no
//              such '/' exists everything kept so far is dropped.
// Scanning always resumes at the trailing '/' of the removed sequence,
// so consecutive sequences such as "/../../" are all processed.
// A "/." or "/.." that is not followed by '/' (for example at the very
// end of the path) is left untouched.
//
// All accesses are index based and bounds checked, so nothing before
// the first or past the last character is ever read.
//
static void RemoveDotDot( AT_UrlString & path )
{
    typedef AT_UrlString::size_type size_type;

    const size_type length = path.length();
    size_type in = 0;       // index of the next character to examine
    size_type out = 0;      // number of characters kept so far

    while ( in < length ) {

        if ( ( path[ in ] == '/' ) &&
             ( in + 2 < length ) &&
             ( path[ in + 1 ] == '.' ) ) {

            if ( path[ in + 2 ] == '/' ) {
                // "/./" - skip the "/." and look at the next '/' again
                in += 2;
                continue;
            }

            if ( ( path[ in + 2 ] == '.' ) &&
                 ( in + 3 < length ) &&
                 ( path[ in + 3 ] == '/' ) ) {
                // "/../" - skip the "/.." and drop the last kept segment.
                in += 3;

                // Search backwards for the '/' that starts the segment
                // to drop. The search begins one character before the
                // last kept character.
                size_type keep = 0;
                if ( out >= 2 ) {
                    size_type k = out - 1;
                    while ( k > 0 ) {
                        -- k;
                        if ( path[ k ] == '/' ) {
                            keep = k;   // the '/' itself is dropped as well
                            break;
                        }
                    }
                }
                out = keep;

                // Re-examine the trailing '/' so that a directly
                // following "/./" or "/../" is handled too.
                continue;
            }
        }

        // ordinary character - keep it
        path[ out ] = path[ in ];
        ++ out;
        ++ in;
    }

    // discard the leftover tail
    path.erase( out );
} // end RemoveDotDot
// ======== AT_UrlParser:
arse
=============================================
// PURPOSE:
// Constructor - parses a URL into it's parts
//
bool AT_UrlParser:
arse(
const char * url,
std::string * o_error_message
) {
AT_UrlString url_str( url );
return Parse( url_str, o_error_message );
}
bool AT_UrlParser:
arse(
const AT_UrlString & url_str,
std::string * o_error_message
) {
if ( url_str == "" )
{
m_parse_error = "Empty url string";
if ( o_error_message )
{
* o_error_message = m_parse_error;
}
return false;
}
AT_UrlString::const_iterator url = url_str.begin();
// initialize all the parts.
m_parse_error =
m_scheme =
m_host =
m_port =
m_user =
m_pass =
m_path = AT_UrlString();
url_str.c_str(); // null terminate the string
AT_UrlString::const_iterator str = url;
AT_UrlString::const_iterator ostr = url;
int state = 0;
AT_UrlString user_or_host;
bool passwd_or_port = false;
char ch = 1;
// looking for scheme:
while ( 1 ) {
ch = * str;
//
// The following state machine scans URL's - the following is
// an extended BNF of the syntax
//
// user_opt_pass = user [ ':' password ] .
//
// host_opt_port = host [ ':' port ] .
//
// net_spec =
// ( "//" user_opt_pass '@' host_opt_port )
// | ( "//" host_opt_port )
// .
//
// url = ( scheme ":" net_spec '/' url_path )
// | ( net_spec '/' url_path )
// | ( '/' url_path )
// | ( scheme ":" '/' url_path )
// | ( scheme ":" url_path )
// | ( url_path )
// .
//
#define grab( part ) part.assign_encoded( ostr, str )
switch ( state ) {
case 21 : {
// scanning port
switch ( ch ) {
case '/' : {
grab( m_port );
ostr = str; // include '/' in path
goto grab_rest_as_path;
}
case '\0' : {
grab( m_port );
goto done;
}
}
break;
}
case 13 : {
// scanning host
switch ( ch ) {
case '/' : {
goto grab_host_grab_rest_as_path;
}
case ':' : {
state = 21;
grab( m_host );
ostr = str + 1; // discard ':'
break;
}
case '\0' : {
grab( m_host );
goto done;
}
}
break;
}
case 12 : {
// scanning password or port
switch ( ch ) {
case '/' : {
m_host = user_or_host;
grab( m_port );
m_port.m_is_set = passwd_or_port;
ostr = str;
goto grab_rest_as_path;
}
case '@' : {
state = 13;
// user or host is really user
m_user = user_or_host;
m_pass.m_is_set = passwd_or_port;
// collect the password
grab( m_pass );
ostr = str + 1; // discard the '@'
break;
}
case '\0' : {
// no path was set !
m_host = user_or_host;
m_port.m_is_set = passwd_or_port;
grab( m_port );
goto done;
}
}
break;
}
case 9 : {
// scanning user or host
switch ( ch ) {
case '/' : {
grab_host_grab_rest_as_path:
grab( m_host );
ostr = str;
goto grab_rest_as_path;
}
case ':' : {
state = 12;
grab( user_or_host );
passwd_or_port = true;
ostr = str + 1; // skip over the ':'
break;
}
case '@' : {
state = 13;
grab( m_user );
ostr = str + 1; // skip over the '@'
break;
}
case '\0' : {
grab( m_host );
goto done;
}
}
break;
}
case 1 : {
// scanning a '//' or '/path'
switch ( ch ) {
case '/' : {
// this is the second '/' in '//'
state = 9;
// the '//' is not significant - need to
// move the output pointer
ostr = str + 1;
break;
}
default : {
goto grab_rest_as_path;
}
}
break;
}
case 0 : {
// start state - possibly a '//' or '/' or 'scheme:' or
path
switch ( ch ) {
case '/' : {
// a url beginning with '/'
state = 1;
break;
}
case ':' : {
// Strings that start with ':' are paths - weird
// but that's what happens
goto grab_rest_as_path;
}
case '\0' : {
// the empty string is significant as an empty path
goto grab_rest_as_path;
}
default : {
state = 3;
}
}
break;
}
case 3 : {
// scanning a path or scheme
switch ( ch ) {
case ':' : {
state = 2;
grab( m_scheme );
ostr = str + 1; // skip over the ':'
break;
}
case '\0' : {
// no ':' or in url and does not start with /
goto grab_rest_as_path;
}
}
break;
}
case 2 : {
switch ( ch ) {
case '/' : {
// this is the first '/' in '://'
state = 1;
break;
}
default : {
// the rest is url_path
grab_rest_as_path:
m_path.assign_encoded( ostr, url_str.end() );
goto done;
}
}
break;
}
} // switch ( state )
str ++;
}
done:
// le parse s'est fini
RemoveDotDot( m_path );
return true;
} // end HA_UrlParser_Main
// ======== AT_UrlParser::CombineHostURL ========================
// PURPOSE:
//  Complete the bits of this url that are not set by taking them
//  from the host url:
//   - the scheme is copied when this url has none;
//   - host, port, user and password are copied together when this
//     url has neither a host nor a user and the host url has a host;
//   - the path is copied when this url has none; a relative path
//     (one not starting with '/') is appended to the directory part
//     of the host path (everything up to and including its last '/')
//     and the result is cleaned with RemoveDotDot.
//
// RETURNS:
//  Nothing; this object is updated in place.
//
void AT_UrlParser::CombineHostURL( const AT_UrlParser & host )
{
    // use the host scheme if one is not defined
    if ( ( ! m_scheme.m_is_set ) && host.m_scheme.m_is_set ) {
        m_scheme = host.m_scheme;
    }

    // use the host network specifier if one is not defined
    if ( ( ! m_host.m_is_set && ! m_user.m_is_set ) &&
         host.m_host.m_is_set ) {
        m_host = host.m_host;
        if ( host.m_port.m_is_set ) {
            m_port = host.m_port;
        } else {
            m_port = AT_UrlString();
        }
        m_user = host.m_user;
        // use the same password as the host.
        if ( host.m_pass.m_is_set ) {
            m_pass = host.m_pass;
        } else {
            m_pass = AT_UrlString();
        }
    }

    // Path is special since we need to combine it by using
    // file path rules.
    if ( ! m_path.m_is_set ) {
        m_path = host.m_path;
    } else if ( host.m_path.m_is_set ) {
        // An empty path counts as relative.
        if ( m_path.empty() || ( m_path[ 0 ] != '/' ) ) {
            // we have a relative path - need to combine it with the
            // host path.
            std::string newpath;

            // Keep the host path up to and including its last '/'.
            // rfind copes with an empty host path and with one that
            // contains no '/' at all; in both cases nothing is kept.
            const std::string::size_type last_slash =
                host.m_path.rfind( '/' );
            if ( last_slash != std::string::npos ) {
                newpath.assign( host.m_path, 0, last_slash + 1 );
            }

            newpath.append( m_path );
            m_path = newpath;
            RemoveDotDot( m_path );
        }
    }
} // end AT_UrlParser::CombineHostURL
// ======== AT_UrlParser::WriteURL ==============================
// PURPOSE:
//  Create a string that reflects this URL. Only the parts that are
//  set are written, in the order scheme, user, password, host, port,
//  path. The stored values are written as they are; nothing is
//  re-encoded.
//
//  "//" is written in front of the first network part (user,
//  password, host or port), but only when a scheme is set. '@' is
//  written in front of the host or port when a user or password
//  precedes it.
//
// RETURNS:
//  std::string that contains url (returned by value).
//
std::string AT_UrlParser::WriteURL()
{
    std::ostringstream l_ostrm;

    // Need to construct a url string
    std::string l_slashes;      // pending "//", emitted at most once
    std::string l_atsign;       // pending "@", emitted before host or port

    if ( m_scheme.m_is_set ) {
        l_ostrm << m_scheme << ":";
        l_slashes = "//";
    }

    if ( m_user.m_is_set ) {
        l_ostrm << l_slashes;
        l_slashes = "";
        l_atsign = "@";
        l_ostrm << m_user;
    }

    if ( m_pass.m_is_set ) {
        l_ostrm << l_slashes;
        l_slashes = "";
        l_atsign = "@";
        l_ostrm << ":" << m_pass;
    }

    if ( m_host.m_is_set ) {
        l_ostrm << l_slashes;
        l_slashes = "";
        l_ostrm << l_atsign << m_host;
        l_atsign = "";
    }

    if ( m_port.m_is_set ) {
        l_ostrm << l_slashes;
        l_slashes = "";
        l_ostrm << l_atsign << ":" << m_port;
    }

    if ( m_path.m_is_set ) {
        l_ostrm << m_path;
    }

    return l_ostrm.str();
} // end AT_UrlParser::WriteURL
----------------- end -----------------------------------