code debugging

golu · Jul 26, 2009

here is a code which crawls links sent to it. theres some problem with
the retrieve_url function ,plz help me out in debugging the fuction
retrive_url. This function retrives pages and saves them in file
#TODO:Visited dict grows in size it needs to be handled smartly
#Moreover server program needs to be in sync with the client eg.
Myrobot
#Take care of tag - 'if modified since',repeated links,hash links
#This is the client side of the distributed crawling framework
#It gets the list of urls to be crawled
#Then crawls the urls and stores the pages in a temporary archive
#which is then transferred to the server or grey_matter
import httplib
import os
import sys
import urlparse
import urllib2
import urllib
import zipfile
import threading

from socket import *
PAGE_DIR="C:/users/jayesh/
pages/" # directory where the
web pages are stored temporarily

# before transfer to the grey_matter
visited=
{} # a
dict to remember visited urls
ROBOT_COUNT=4

def fget():
""" This function retrieves the zipped file
containing the list of urls from the grey_matter and
saves them in a local file 'list.txt'. """

httplib.HTTPConnection.debuglevel=1
request=urllib2.Request('http://192.168.153.57/list.zip')
#Requesting the zipped file
request.add_header('Accept-encoding','gzip') #containing
the list of urls
opener=urllib2.build_opener()
flag=1
s='Waiting for server'
while flag==1:
try:
op=opener.open(request)
flag=0
except:
s=s+'*'
print s
f=open('list.zip',"wb")
f.write(op.read())
f.close()
z=zipfile.ZipFile('list.zip')
p=z.namelist()
g=open('list.txt',"wb")
g.write(z.read(p[0]))
g.close()
print 'got zipped file'

def compress():
""" This function compresses the crawled pages and stores them in
a single compressed file ready to be sent to the
grey_matter."""

zfile=zipfile.ZipFile('C:/xampp/htdocs/pages.zip',mode='w')
for fil in os.listdir(PAGE_DIR):
full=os.path.join(PAGE_DIR,fil)
zfile.write(full,fil)
os.remove(full)
os.rmdir(PAGE_DIR) #Removing the directory after
transfer to grey_matter

x=0
class robot(threading.Thread):
""" The main robot class which does the crawling of listed
urls it recieves from the grey matter. It uses 3 threads which
crawl the listed urls synchronously."""

def __init__(self,urllist,urllistlock,dblock):
threading.Thread.__init__(self)
self.urllist=urllist
self.urllistlock=urllistlock
self.dblock=dblock

def popurl(self):
""" This method pops out urls from the urls file one by one
and sends them for retrieval."""

self.urllistlock.acquire(1)
if(len(self.urllist)<1):
Nexturl=None
else:
Nexturl=self.urllist[0]
if Nexturl[-1]=='\n':Nexturl=Nexturl[:-1]
del self.urllist[0]
self.urllistlock.release()
return Nexturl

def retrieve_url(self,url):
""" The main method of the robot class and is called
run method to retrieve the given urls from the web."""
global x
if url is not None:

try:
if visited.has_key(url): return
pieces=urlparse.urlparse(url)
filepath=pieces[2]
if filepath != '':
filepath=filepath[1:]
filename=filepath.split("/")[-1]
else:
filename=x+'.htm'
x+=1

path=os.path.join(PAGE_DIR,filename)
url=urlparse.urlunparse(pieces)
p=url.rfind('#') #temporary
if p!=-1:
url=url[

]

visited=1
m=urllib2.urlopen(url)

fopen=open(path,'wb')

fopen.seek(0)
fopen.write(url+'|')

fopen.write(m.read())
fopen.close()
print url ,'retrieved'

except IOError:
print url
print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"

return

def run(self):
while(1):
url=self.popurl()
if url is None:
break
try:
self.retrieve_url(url)
except:sys.exit()

if __name__=='__main__':

s=socket(AF_INET,SOCK_STREAM)
s.bind(('',444))
s.listen(5)
q,v=s.accept()
count=1
print 'Connecting...'
while 1:
print 'Phase: %s' %(count)
message=q.recv(3)

if(message!='yes'):continue
print 'Connected'
count=count+1
fget() # Calling the fget method to get the url list
from
# grey_matter(server).
try:
os.mkdir(PAGE_DIR)
except: print 'Cant make dir'
try:
f=open('list.txt','r')
urllist=f.readlines()
f.close()
except:
print 'Error opening urls file'
sys.exit()
print 'startting threads'
urllistlock=threading.Lock()
dblock=threading.Lock()
botlist=[]
for X in range(0,ROBOT_COUNT):
newbot=robot(urllist,urllistlock,dblock)
newbot.setName('X')
botlist.append(newbot)
newbot.start()

for X in range(0,ROBOT_COUNT):
botlist[X].join()

compress()
try:
q.send('yes')
except:
print 'socket disconnected'
print sys.exit()

Chris Rebert · Jul 26, 2009

here is a code which crawls links sent to it. theres some problem with
the retrieve_url function ,plz help me out in debugging the fuction
retrive_url. This function retrives pages and saves them in file

Please specify exactly what the problem is that you are experiencing.
If you are getting an error, please provide the error message and full
traceback.

Cheers,
Chris

golu · Jul 26, 2009

Please specify exactly what the problem is that you are experiencing.
If you are getting an error, please provide the error message and full
traceback.

Cheers,
Chris
--http://blog.rebertia.com

i want to save pages in a directory and i m using the urls to get
filenames. The program gets stuck in the saving step.can u suggest me
a way to save a page e.g google.com as a file google.html

Gabriel Genellina · Jul 26, 2009

i want to save pages in a directory and i m using the urls to get
filenames. The program gets stuck in the saving step.can u suggest me
a way to save a page e.g google.com as a file google.html

You may use str.translate to replace/remove all undesired characters:

py> import string
py> valid = string.ascii_letters+string.digits+'.'
py> invalid = ''.join(chr(x) for x in range(256) if chr(x) not in valid)
py> table = string.maketrans(invalid, '_'*len(invalid))
py> x = 'http://docs.python.org/library/string.html'
py> x.translate(table)
'http___docs.python.org_library_string.html'

See http://docs.python.org/library/stdtypes.html#str.translate

Code sharing	2	Oct 15, 2024
I made a blockchain and want to make a cryptocurrency, but my code doesn't verify hash of each block	2	Jun 2, 2024
Crawling	1	Mar 10, 2021
Python code problem	2	Apr 23, 2023
Code suggestions?	0	Sep 21, 2013
Remote SSH and Configuring code help	0	Dec 13, 2023
I Need Fix In Code	1	Apr 12, 2023
Esp8266 Code problem	0	Mar 10, 2023

code debugging

golu

Chris Rebert

golu

Gabriel Genellina

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads