X
xreload
Hello !
I have some class for getting html documents :
"""
Wrapper for Python sockets lib
"""
import socket
import urlparse
import random
import io
import re
import sys
# socket wrapper class
class sock:
def __init__(self,url):
parse = urlparse.urlparse(url)
self.req = [] # request tuple
self.response = "" # response data
self.port = socket.getservbyname("www","tcp") # remote host
port
if parse[2] is not '':
if parse[4] is not '':
self.path = parse[2] + "?" + parse[4]
else:
self.path = parse[2]
else:
self.path = "/" # request path
if parse[1] is not '':
self.host = parse[1] # remote host name
else:
self.host = ""
self.req.append("GET " + self.path + " HTTP/1.1")
self.req.append("Host: " + self.host)
# set user-agent
def useragent(self, useragent):
self.req.append("User-Agent: " + useragent)
# set document max size in bytes
def range(self, size=0):
self.range = size
# get response document body
def get_body(self):
body = self.response.split("\r\n\r\n", 2)
try:
return body[1]
except:
return self.response
# do http request
def request(self, timeout=60,chunk=1024):
self.req.append("Accept: */*")
self.req.append("Pragma: no-cache")
self.req.append("Connection: close")
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(timeout)
try:
s.connect((self.host,self.port))
except:
print "Cant connect to remote host: "+self.host
try:
s.sendall("\r\n".join(self.req)+"\r\n\r\n")
except:
print "Cant write data to socket"
while 1:
try:
buffer = s.recv(chunk)
except:
print "Cant read data from socket."
break
if not buffer :
break
self.response = self.response+buffer
if len(self.response) > self.range and self.range != 0:
print "Document is too big"
break
try:
s.close()
except:
print "Cant close socket"
if __name__ == '__main__':
if len(sys.argv) < 2:
print '\nNo URL specified for module test.\nUsage: sock.py
<URL>'
sys.exit()
test = sock(sys.argv[1])
test.useragent("Mozilla/4.0 (compatible; MSIE 5.5; Windows NT
4.0)")
test.range()
test.request()
print test.get_body()
-----------
So, lets do :
sock.py "http://forums.childrenwithdiabetes.com/showthread.php?t=5030"
- it not ok , only some part of document.
wget "http://forums.childrenwithdiabetes.com/showthread.php?t=5030" -
it ok !
sock.py "http://www.google.com/" - it ok !
Why i got only some part of document ? This is some bug in sockets
module or i do something wrong in my code?
Help me please , iam "googled" several hours , but not found any
related information.
All my bests.Igor.
I have some class for getting html documents :
"""
Wrapper for Python sockets lib
"""
import socket
import urlparse
import random
import io
import re
import sys
# socket wrapper class
class sock:
def __init__(self,url):
parse = urlparse.urlparse(url)
self.req = [] # request tuple
self.response = "" # response data
self.port = socket.getservbyname("www","tcp") # remote host
port
if parse[2] is not '':
if parse[4] is not '':
self.path = parse[2] + "?" + parse[4]
else:
self.path = parse[2]
else:
self.path = "/" # request path
if parse[1] is not '':
self.host = parse[1] # remote host name
else:
self.host = ""
self.req.append("GET " + self.path + " HTTP/1.1")
self.req.append("Host: " + self.host)
# set user-agent
def useragent(self, useragent):
self.req.append("User-Agent: " + useragent)
# set document max size in bytes
def range(self, size=0):
self.range = size
# get response document body
def get_body(self):
body = self.response.split("\r\n\r\n", 2)
try:
return body[1]
except:
return self.response
# do http request
def request(self, timeout=60,chunk=1024):
self.req.append("Accept: */*")
self.req.append("Pragma: no-cache")
self.req.append("Connection: close")
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(timeout)
try:
s.connect((self.host,self.port))
except:
print "Cant connect to remote host: "+self.host
try:
s.sendall("\r\n".join(self.req)+"\r\n\r\n")
except:
print "Cant write data to socket"
while 1:
try:
buffer = s.recv(chunk)
except:
print "Cant read data from socket."
break
if not buffer :
break
self.response = self.response+buffer
if len(self.response) > self.range and self.range != 0:
print "Document is too big"
break
try:
s.close()
except:
print "Cant close socket"
if __name__ == '__main__':
if len(sys.argv) < 2:
print '\nNo URL specified for module test.\nUsage: sock.py
<URL>'
sys.exit()
test = sock(sys.argv[1])
test.useragent("Mozilla/4.0 (compatible; MSIE 5.5; Windows NT
4.0)")
test.range()
test.request()
print test.get_body()
-----------
So, lets do :
sock.py "http://forums.childrenwithdiabetes.com/showthread.php?t=5030"
- it not ok , only some part of document.
wget "http://forums.childrenwithdiabetes.com/showthread.php?t=5030" -
it ok !
sock.py "http://www.google.com/" - it ok !
Why i got only some part of document ? This is some bug in sockets
module or i do something wrong in my code?
Help me please , iam "googled" several hours , but not found any
related information.
All my bests.Igor.