G
golu
here is a code which crawls links sent to it. theres some problem with
the retrieve_url function ,plz help me out in debugging the fuction
retrive_url. This function retrives pages and saves them in file
#TODO:Visited dict grows in size it needs to be handled smartly
#Moreover server program needs to be in sync with the client eg.
Myrobot
#Take care of tag - 'if modified since',repeated links,hash links
#This is the client side of the distributed crawling framework
#It gets the list of urls to be crawled
#Then crawls the urls and stores the pages in a temporary archive
#which is then transferred to the server or grey_matter
import httplib
import os
import sys
import urlparse
import urllib2
import urllib
import zipfile
import threading
from socket import *
PAGE_DIR="C:/users/jayesh/
pages/" # directory where the
web pages are stored temporarily
# before transfer to the grey_matter
visited=
{} # a
dict to remember visited urls
ROBOT_COUNT=4
def fget():
""" This function retrieves the zipped file
containing the list of urls from the grey_matter and
saves them in a local file 'list.txt'. """
httplib.HTTPConnection.debuglevel=1
request=urllib2.Request('http://192.168.153.57/list.zip')
#Requesting the zipped file
request.add_header('Accept-encoding','gzip') #containing
the list of urls
opener=urllib2.build_opener()
flag=1
s='Waiting for server'
while flag==1:
try:
op=opener.open(request)
flag=0
except:
s=s+'*'
print s
f=open('list.zip',"wb")
f.write(op.read())
f.close()
z=zipfile.ZipFile('list.zip')
p=z.namelist()
g=open('list.txt',"wb")
g.write(z.read(p[0]))
g.close()
print 'got zipped file'
def compress():
""" This function compresses the crawled pages and stores them in
a single compressed file ready to be sent to the
grey_matter."""
zfile=zipfile.ZipFile('C:/xampp/htdocs/pages.zip',mode='w')
for fil in os.listdir(PAGE_DIR):
full=os.path.join(PAGE_DIR,fil)
zfile.write(full,fil)
os.remove(full)
os.rmdir(PAGE_DIR) #Removing the directory after
transfer to grey_matter
x=0
class robot(threading.Thread):
""" The main robot class which does the crawling of listed
urls it recieves from the grey matter. It uses 3 threads which
crawl the listed urls synchronously."""
def __init__(self,urllist,urllistlock,dblock):
threading.Thread.__init__(self)
self.urllist=urllist
self.urllistlock=urllistlock
self.dblock=dblock
def popurl(self):
""" This method pops out urls from the urls file one by one
and sends them for retrieval."""
self.urllistlock.acquire(1)
if(len(self.urllist)<1):
Nexturl=None
else:
Nexturl=self.urllist[0]
if Nexturl[-1]=='\n':Nexturl=Nexturl[:-1]
del self.urllist[0]
self.urllistlock.release()
return Nexturl
def retrieve_url(self,url):
""" The main method of the robot class and is called
run method to retrieve the given urls from the web."""
global x
if url is not None:
try:
if visited.has_key(url): return
pieces=urlparse.urlparse(url)
filepath=pieces[2]
if filepath != '':
filepath=filepath[1:]
filename=filepath.split("/")[-1]
else:
filename=x+'.htm'
x+=1
path=os.path.join(PAGE_DIR,filename)
url=urlparse.urlunparse(pieces)
p=url.rfind('#') #temporary
if p!=-1:
url=url[]
visited=1
m=urllib2.urlopen(url)
fopen=open(path,'wb')
fopen.seek(0)
fopen.write(url+'|')
fopen.write(m.read())
fopen.close()
print url ,'retrieved'
except IOError:
print url
print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"
return
def run(self):
while(1):
url=self.popurl()
if url is None:
break
try:
self.retrieve_url(url)
except:sys.exit()
if __name__=='__main__':
s=socket(AF_INET,SOCK_STREAM)
s.bind(('',444))
s.listen(5)
q,v=s.accept()
count=1
print 'Connecting...'
while 1:
print 'Phase: %s' %(count)
message=q.recv(3)
if(message!='yes'):continue
print 'Connected'
count=count+1
fget() # Calling the fget method to get the url list
from
# grey_matter(server).
try:
os.mkdir(PAGE_DIR)
except: print 'Cant make dir'
try:
f=open('list.txt','r')
urllist=f.readlines()
f.close()
except:
print 'Error opening urls file'
sys.exit()
print 'startting threads'
urllistlock=threading.Lock()
dblock=threading.Lock()
botlist=[]
for X in range(0,ROBOT_COUNT):
newbot=robot(urllist,urllistlock,dblock)
newbot.setName('X')
botlist.append(newbot)
newbot.start()
for X in range(0,ROBOT_COUNT):
botlist[X].join()
compress()
try:
q.send('yes')
except:
print 'socket disconnected'
print sys.exit()
the retrieve_url function ,plz help me out in debugging the fuction
retrive_url. This function retrives pages and saves them in file
#TODO:Visited dict grows in size it needs to be handled smartly
#Moreover server program needs to be in sync with the client eg.
Myrobot
#Take care of tag - 'if modified since',repeated links,hash links
#This is the client side of the distributed crawling framework
#It gets the list of urls to be crawled
#Then crawls the urls and stores the pages in a temporary archive
#which is then transferred to the server or grey_matter
import httplib
import os
import sys
import urlparse
import urllib2
import urllib
import zipfile
import threading
from socket import *
PAGE_DIR="C:/users/jayesh/
pages/" # directory where the
web pages are stored temporarily
# before transfer to the grey_matter
visited=
{} # a
dict to remember visited urls
ROBOT_COUNT=4
def fget():
""" This function retrieves the zipped file
containing the list of urls from the grey_matter and
saves them in a local file 'list.txt'. """
httplib.HTTPConnection.debuglevel=1
request=urllib2.Request('http://192.168.153.57/list.zip')
#Requesting the zipped file
request.add_header('Accept-encoding','gzip') #containing
the list of urls
opener=urllib2.build_opener()
flag=1
s='Waiting for server'
while flag==1:
try:
op=opener.open(request)
flag=0
except:
s=s+'*'
print s
f=open('list.zip',"wb")
f.write(op.read())
f.close()
z=zipfile.ZipFile('list.zip')
p=z.namelist()
g=open('list.txt',"wb")
g.write(z.read(p[0]))
g.close()
print 'got zipped file'
def compress():
""" This function compresses the crawled pages and stores them in
a single compressed file ready to be sent to the
grey_matter."""
zfile=zipfile.ZipFile('C:/xampp/htdocs/pages.zip',mode='w')
for fil in os.listdir(PAGE_DIR):
full=os.path.join(PAGE_DIR,fil)
zfile.write(full,fil)
os.remove(full)
os.rmdir(PAGE_DIR) #Removing the directory after
transfer to grey_matter
x=0
class robot(threading.Thread):
""" The main robot class which does the crawling of listed
urls it recieves from the grey matter. It uses 3 threads which
crawl the listed urls synchronously."""
def __init__(self,urllist,urllistlock,dblock):
threading.Thread.__init__(self)
self.urllist=urllist
self.urllistlock=urllistlock
self.dblock=dblock
def popurl(self):
""" This method pops out urls from the urls file one by one
and sends them for retrieval."""
self.urllistlock.acquire(1)
if(len(self.urllist)<1):
Nexturl=None
else:
Nexturl=self.urllist[0]
if Nexturl[-1]=='\n':Nexturl=Nexturl[:-1]
del self.urllist[0]
self.urllistlock.release()
return Nexturl
def retrieve_url(self,url):
""" The main method of the robot class and is called
run method to retrieve the given urls from the web."""
global x
if url is not None:
try:
if visited.has_key(url): return
pieces=urlparse.urlparse(url)
filepath=pieces[2]
if filepath != '':
filepath=filepath[1:]
filename=filepath.split("/")[-1]
else:
filename=x+'.htm'
x+=1
path=os.path.join(PAGE_DIR,filename)
url=urlparse.urlunparse(pieces)
p=url.rfind('#') #temporary
if p!=-1:
url=url[]
visited=1
m=urllib2.urlopen(url)
fopen=open(path,'wb')
fopen.seek(0)
fopen.write(url+'|')
fopen.write(m.read())
fopen.close()
print url ,'retrieved'
except IOError:
print url
print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"
return
def run(self):
while(1):
url=self.popurl()
if url is None:
break
try:
self.retrieve_url(url)
except:sys.exit()
if __name__=='__main__':
s=socket(AF_INET,SOCK_STREAM)
s.bind(('',444))
s.listen(5)
q,v=s.accept()
count=1
print 'Connecting...'
while 1:
print 'Phase: %s' %(count)
message=q.recv(3)
if(message!='yes'):continue
print 'Connected'
count=count+1
fget() # Calling the fget method to get the url list
from
# grey_matter(server).
try:
os.mkdir(PAGE_DIR)
except: print 'Cant make dir'
try:
f=open('list.txt','r')
urllist=f.readlines()
f.close()
except:
print 'Error opening urls file'
sys.exit()
print 'startting threads'
urllistlock=threading.Lock()
dblock=threading.Lock()
botlist=[]
for X in range(0,ROBOT_COUNT):
newbot=robot(urllist,urllistlock,dblock)
newbot.setName('X')
botlist.append(newbot)
newbot.start()
for X in range(0,ROBOT_COUNT):
botlist[X].join()
compress()
try:
q.send('yes')
except:
print 'socket disconnected'
print sys.exit()