G
golu
The following function retrieves pages from the web and saves them in
a specified directory. I want to derive each filename from its URL —
e.g. the page code.google.com should be saved as code-google.htm or
something similar. Could you suggest a way to do this?
def retrieve_url(self,url):
""" The main method of the robot class and is called
run method to retrieve the given urls from the web."""
if url is not None:
try:
if visited.has_key(url): return
pieces=urlparse.urlparse(url)
filepath=pieces[2]
if filepath != '':
filepath=filepath[1:]
filename=filepath.split("/")[-1]
else:
filename='home.htm'
path=os.path.join(PAGE_DIR,filename)
url=urlparse.urlunparse(pieces)
p=url.rfind('#') #temporary
if p!=-1:
url=url[]
visited=1
m=urllib2.urlopen(url)
fopen=open(path,'wb')
fopen.seek(0)
fopen.write(url+'|')
fopen.write(m.read())
fopen.close()
print url ,'retrieved'
except IOError:
print url
print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"
return
a specified directory. I want to derive each filename from its URL —
e.g. the page code.google.com should be saved as code-google.htm or
something similar. Could you suggest a way to do this?
def retrieve_url(self,url):
""" The main method of the robot class and is called
run method to retrieve the given urls from the web."""
if url is not None:
try:
if visited.has_key(url): return
pieces=urlparse.urlparse(url)
filepath=pieces[2]
if filepath != '':
filepath=filepath[1:]
filename=filepath.split("/")[-1]
else:
filename='home.htm'
path=os.path.join(PAGE_DIR,filename)
url=urlparse.urlunparse(pieces)
p=url.rfind('#') #temporary
if p!=-1:
url=url[]
visited=1
m=urllib2.urlopen(url)
fopen=open(path,'wb')
fopen.seek(0)
fopen.write(url+'|')
fopen.write(m.read())
fopen.close()
print url ,'retrieved'
except IOError:
print url
print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"
return