# parse_url11.py
# (e-mail address removed)
# 2010-12 (Dec)-27
# A brute force ugly hack from a novice programmer.
# You're welcome to use the code, clean it up, make positive
# suggestions for improvement.
"""Parse a url string into a list using a generator."""

# Delimiter tokens. "//" is the only multi-character entry; runs of
# special characters are matched greedily against this list, so "//"
# comes out as a single token while e.g. "?=" is two tokens.
special_item = [";", "?", ":", "@", "=", "&", "#", ".", "/", "//"]
# Drop urls with obviously bad formatting - NOT IMPLEMENTED.
# Only single-character entries ("|") can ever match, since lookups
# are done one character at a time.
drop_item = ["|", "localhost", "..", "///"]
# Skip urls containing these substrings - NOT IMPLEMENTED (same
# single-character caveat applies, so this list is effectively inert).
ignore_urls_containing = ["php", "cgi"]
def url_parser_generator(url, specials=None, drops=None, ignores=None):
    """Yield successive tokens of *url*.

    The string is split into maximal runs of two kinds:

    * runs of delimiter characters, grown greedily while the accumulated
      slice is itself an entry of *specials* (so ``"//"`` is one token,
      but ``"?="`` is two, and ``"///"`` yields ``"//"`` then ``"/"``);
    * runs of ordinary characters, ended by the next delimiter.

    Parameters
    ----------
    url : str
        The string to tokenize.  An empty string yields nothing.
    specials, drops, ignores : sequence of str, optional
        Override the module-level ``special_item``, ``drop_item`` and
        ``ignore_urls_containing`` lists.  Defaults to those globals.

    Raises
    ------
    NotImplementedError
        When a run starts with a character found in *drops* or
        *ignores* (only single-character entries such as ``"|"`` can
        ever match, since the check is per-character).
    """
    if specials is None:
        specials = special_item
    if drops is None:
        drops = drop_item
    if ignores is None:
        ignores = ignore_urls_containing

    n = len(url)
    i = 0
    while i < n:
        ch = url[i]
        if ch in specials:
            # Greedily extend the slice while it is still a known
            # delimiter token ("//" is the only multi-char entry).
            j = i + 1
            while j < n and url[i:j + 1] in specials:
                j += 1
            yield url[i:j]
            i = j
        elif ch in drops:
            # Bad-format handling was never written; fail loudly.
            # (Original raised the constant NotImplemented, which is
            # not an exception class -- fixed to NotImplementedError.)
            raise NotImplementedError(
                "Processing items in the drop_item list is not "
                "implemented.", ch)
        elif ch in ignores:
            raise NotImplementedError(
                "Processing items in the ignore_urls_containing list "
                "is not implemented.", ch)
        else:
            # Ordinary run: stop at the next delimiter character.
            # drop/ignore characters inside a run do NOT end it; they
            # are only checked at run starts, as in the original.
            j = i
            while j < n and url[j] not in specials:
                j += 1
            yield url[i:j]
            i = j
def parse(url):
    """Return the tokens of *url* as a list.

    Thin wrapper that exhausts :func:`url_parser_generator`; the manual
    append loop of the original is replaced by the idiomatic ``list()``
    conversion.
    """
    return list(url_parser_generator(url))
def test():
    """Parse a batch of sample urls, print each token list, and return
    the token list of the last url processed.

    Each entry maps a case number to ``(enabled, url)``; set the flag
    to False to skip a case when debugging a single url.  The url
    strings are re-joined here -- the original source had them broken
    across lines by e-mail wrapping.
    """
    urls = {
        0: (True, "http://docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        1: (True, "/http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        2: (True, "//http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        3: (True, "///http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        4: (True, "/http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition/"),
        5: (True, "//http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition//"),
        6: (True, "///http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition///"),
        7: (True, "/#/http:///#docs.python..org/dev//////library/stdtypes./html??highlight=p=partition#str.partition///"),
        8: (True, "httpdocspythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition"),
        9: (True, "httpdocs.pythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition"),
        10: (True, ":httpdocspythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition"),
        11: (True, "httpdocspythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition/"),
        12: (True, "///:;#.???"),   # only special_items
        13: (True, "///a:;#.???"),  # only 1 non special_item
        14: (True, "///:;#.???a"),  # only 1 non special_item
        15: (True, "a///:;#.???"),  # only 1 non special_item
        16: (True, "http://docs.python.php"),
        17: (True, "http://php.python.org"),
        18: (True, "http://www.localhost.com"),
    }
    # Initialized so the return is defined even if every case is disabled.
    mylist = []
    # Iterate in key order directly instead of range(len(urls)).
    for url_num in sorted(urls):
        # "enabled" was named "test" originally, shadowing this function.
        enabled, url = urls[url_num]
        if enabled:  # allow for single testing
            mylist = parse(url)
            print()
            print()
            print("url:", url_num, " ", url)
            print()
            print(mylist)
            print()
    return mylist
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    test()