Alex
Hi all.
To understand the concept of a thread pool in Python, I'm working on a
simple single-site web crawler.
I would like the program to stop once the thread pool has downloaded all
the internal links from a web site, but right now my program keeps
waiting forever even when there are no more links left to download.
Here's my code; I'd appreciate any comments. I'm programming just for
fun and to learn ;-)
Thanks in advance.
from BeautifulSoup import BeautifulSoup
import urllib
from pprint import pprint
import string
from urlparse import urlparse
import sys
from threading import Thread
import time
from Queue import Queue

# dirty hack: set default encoding to utf-8
reload(sys)
sys.setdefaultencoding('utf-8')

opener = urllib.FancyURLopener({})


class Crawler:

    def __init__(self):
        """
        Constructor
        """
        self.missed = 0
        self.url_list = []
        self.urls_queue = Queue()
        self.num_threads = 5
        self._create_threads()

    def get_internal_links(self, url):
        """
        Get all internal links from a web page and feed the queue
        """
        self.url = url
        url_netloc = urlparse(self.url).netloc
        print "Downloading... ", self.url
        time.sleep(5)
        try:
            p = opener.open(self.url)
            #print p.info()
        except IOError:
            print "error connecting to ", self.url
            print "wait..."
            time.sleep(5)
            print "retry..."
            try:
                p = urllib.urlopen(self.url)
            except IOError:
                self.missed = self.missed + 1
                return None
        html = p.read()
        soup = BeautifulSoup(html)
        anchors = soup.findAll('a')
        links = [str(anchor['href']) for anchor in anchors]
        internal_links = [link for link in links
                          if urlparse(link).netloc == url_netloc]
        for link in internal_links:
            if link not in self.url_list and link != self.url:
                self.url_list.append(link)
                self.urls_queue.put(link)
        print "Queue size: ", self.urls_queue.qsize()
        print "List size: ", str(len(self.url_list))
        print "Errors: ", str(self.missed)
        self._queue_consumer()

    def _queue_consumer(self):
        """
        Consume the queue
        """
        while True:
            url = self.urls_queue.get()
            print 'Next url: ', url
            self.get_internal_links(url)
            self.urls_queue.task_done()

    def _create_threads(self):
        """
        Set up some threads to fetch pages
        """
        for i in range(self.num_threads):
            worker = Thread(target=self._queue_consumer, args=())
            worker.setDaemon(True)
            worker.start()


#-----------------------------------------------------------------------------
#
if __name__ == '__main__':
    c = Crawler()
    c.get_internal_links('http://www.thinkpragmatic.net/')
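To make clearer what I'm trying to achieve, here is a minimal toy sketch
(not my crawler; the time.sleep stands in for the real download work) of
the producer/consumer pattern I think I should end up with: daemon
workers consume the Queue, the producer only put()s work, and the main
thread blocks on join() until every item is marked task_done(), then
simply exits.

from threading import Thread
from Queue import Queue
import time

queue = Queue()

def worker():
    while True:
        item = queue.get()      # blocks until an item is available
        print "processing", item
        time.sleep(0.1)         # pretend to download something
        queue.task_done()       # one task_done() per get()

# start a small pool of daemon threads; they die with the main thread
for i in range(5):
    t = Thread(target=worker)
    t.setDaemon(True)
    t.start()

# feed the queue from the main thread only
for n in range(20):
    queue.put("item-%d" % n)

queue.join()    # returns once all queued items have been processed
print "all done"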