Python urllib库

释放双眼,带上耳机,听听看~!

urllib是python内置的HTTP请求库:

urllib.request 请求模块
urllib.error 异常处理模块
urllib.parse url解析模块
urllib.robotparser robots.txt解析模块


1
2
3
1urllib.request.urlopen(url,data=None,[timeout,]*,cafile=None,capath=None,cadefault=False,context=None)
2
3

decode("utf-8"):将响应的字节数据按utf-8编码解码为字符串


1
2
3
4
5
1import urllib.request
2html=urllib.request.urlopen("http://www.baidu.com/")
3print(html.read().decode("utf-8"))
4
5

以POST方式访问(其中 http://httpbin.org 是供我们做HTTP测试的网址):


1
2
3
4
5
6
7
1import urllib.parse
2import urllib.request
3data=bytes(urllib.parse.urlencode({"word":"hello"}),encoding="utf-8") #encoding=""以指定的编码方式
4response=urllib.request.urlopen("http://httpbin.org/post",data=data)
5print(response.read())
6
7

结果


1
2
3
1b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "word": "hello"\n  }, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Content-Length": "10", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "json": null, \n  "origin": "113.105.12.153, 113.105.12.153", \n  "url": "https://httpbin.org/post"\n}\n'
2
3

设置超时


1
2
3
4
5
6
7
8
9
10
1   import urllib.error
2   import socket
3   import urllib.request
4   try:
5       response=urllib.request.urlopen("http://httpbin.org/get",timeout=0.1)
6   except urllib.error.URLError as e:
7       if isinstance(e.reason,socket.timeout):
8           print("TIME OUT")
9
10

结果


1
2
3
1   TIME OUT
2
3

响应

响应类型


1
2
3
4
5
6
7
1import urllib.request
2html=urllib.request.urlopen("http://www.baidu.com/")
3print(html)
4
5<http.client.HTTPResponse object at 0x0000024B3B676080>
6
7

响应码,响应头


1
2
3
4
5
6
7
8
9
10
11
12
1import urllib.request
2html=urllib.request.urlopen("http://www.baidu.com/")
3print(html.status)
4print(html.getheaders())
5print(html.getheader('server'))
6
7
8200
9[('Bdpagetype', '1'), ('Bdqid', '0x8afac3a8000c1dba'), ('Cache-Control', 'private'), ('Content-Type', 'text/html'), ('Cxy_all', 'baidu+8ec69d29edd1ec53e9faabc8051e2fd7'), ('Date', 'Sun, 17 Mar 2019 07:12:33 GMT'), ('Expires', 'Sun, 17 Mar 2019 07:11:47 GMT'), ('P3p', 'CP=" OTI DSP COR IVA OUR IND COM "'), ('Server', 'BWS/1.1'), ('Set-Cookie', 'BAIDUID=5F61E86C65F2F415AE669543617A67B2:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'BIDUPSID=5F61E86C65F2F415AE669543617A67B2; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'PSTM=1552806753; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'delPer=0; path=/; domain=.baidu.com'), ('Set-Cookie', 'BDSVRTM=0; path=/'), ('Set-Cookie', 'BD_HOME=0; path=/'), ('Set-Cookie', 'H_PS_PSSID=1438_21118_28558_28607_28584_26350_28604_28606; path=/; domain=.baidu.com'), ('Vary', 'Accept-Encoding'), ('X-Ua-Compatible', 'IE=Edge,chrome=1'), ('Connection', 'close'), ('Transfer-Encoding', 'chunked')]
10BWS/1.1
11
12

read() 获取响应体的内容:


1
2
3
1html.read()
2
3

Request

request.Request(url=url,data=data,headers=headers,method="POST")
url:网址地址
data:提交的表单数据
headers:请求头
method:请求方式


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
1from urllib import parse,request
2
3url='http://httpbin.org/post'
4headers={
5    'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36',
6    'Host':'httpbin.org'
7}
8dict={
9    'name':'Germey'
10}
11data=bytes(parse.urlencode(dict),encoding="utf-8")
12req=request.Request(url=url,data=data,headers=headers,method='POST')
13response=request.urlopen(req)
14print(response.read().decode('utf-8'))
15
16

结果


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
1{
2  "args": {},
3  "data": "",
4  "files": {},
5  "form": {
6    "name": "Germey"
7  },
8  "headers": {
9    "Accept-Encoding": "identity",
10    "Content-Length": "11",
11    "Content-Type": "application/x-www-form-urlencoded",
12    "Host": "httpbin.org",
13    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36"
14  },
15  "json": null,
16  "origin": "113.105.12.153, 113.105.12.153",
17  "url": "https://httpbin.org/post"
18}
19
20

handler

代理

方法一:


1
2
3
4
5
6
7
8
9
10
11
1import urllib.request
2proxy_handler=urllib.request.ProxyHandler(
3{
4   'https':'219.131.240.200:9797'  #(千万注意http后面没有点)
5
6})
7opener=urllib.request.build_opener(proxy_handler,urllib.request.HTTPHandler)
8response=opener.open("https://httpbin.org/get")
9print(response.read())
10
11

结果


1
2
3
1b'{\n  "args": {}, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "origin": "219.131.240.200, 219.131.240.200", \n  "url": "https://httpbin.org/get"\n}\n'
2
3

方法二:


1
2
3
4
5
6
7
8
9
10
11
1import urllib.request
2proxy_handler=urllib.request.ProxyHandler(
3{
4   'https':'219.131.240.200:9797'
5})
6opener=urllib.request.build_opener(proxy_handler)
7urllib.request.install_opener(opener)
8response=urllib.request.urlopen("https://httpbin.org/get")
9print(response.read())
10
11

结果


1
2
3
1b'{\n  "args": {}, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "origin": "219.131.240.200, 219.131.240.200", \n  "url": "https://httpbin.org/get"\n}\n'
2
3

!!!注意:'http'对应的代理只能代理以http://开头的网址,'https'对应的代理只能代理以https://开头的网址

cookie

cookie 可以保持登录会话信息
导入处理cookie 的库 http.cookiejar


1
2
3
4
5
6
7
8
9
10
1import http.cookiejar,urllib.request
2
3cookie =http.cookiejar.CookieJar()#注意大小写
4handler=urllib.request.HTTPCookieProcessor(cookie)
5opener=urllib.request.build_opener(handler)
6response=opener.open('http://www.baidu.com')
7for item in cookie:
8    print(item.name+'*'+item.value)
9
10

结果:


1
2
3
4
5
6
7
8
9
1BAIDUID*C31837787335FED26959A1D8CCE1030F:FG=1
2BIDUPSID*C31837787335FED26959A1D8CCE1030F
3H_PS_PSSID*1450_21085_28557_28608_28584_26350_28603_28606
4PSTM*1552816088
5delPer*0
6BDSVRTM*0
7BD_HOME*0
8
9

cookie保存为文本文件
第一种方法:


1
2
3
4
5
6
7
8
9
10
1import http.cookiejar,urllib.request
2
3filename='C:/Users/hanson/Desktop/1/cookie.txt' #保存的文件位置和文件名,默认为工程目录
4cookie=http.cookiejar.MozillaCookieJar(filename) #cookie声明为http.cookiejar.CookieJar的子类MozillaCookieJar的对象,因为其带有save()方法
5handler=urllib.request.HTTPCookieProcessor(cookie)
6opener=urllib.request.build_opener(handler)
7response=opener.open('http://www.baidu.com')
8cookie.save(ignore_discard=True,ignore_expires=True)
9
10

结果


1
2
3
4
5
6
7
8
9
10
11
12
13
1# Netscape HTTP Cookie File
2# http://curl.haxx.se/rfc/cookie_spec.html
3# This is a generated file!  Do not edit.
4
5.baidu.com TRUE    /   FALSE   3700363349  BAIDUID D3E2F4A0A280B33C6E7C5558F8A6DB34:FG=1
6.baidu.com TRUE    /   FALSE   3700363349  BIDUPSID    D3E2F4A0A280B33C6E7C5558F8A6DB34
7.baidu.com TRUE    /   FALSE       H_PS_PSSID  28629_1444_21119_28558_28607_28584_28603_28626_28605
8.baidu.com TRUE    /   FALSE   3700363349  PSTM    1552879705
9.baidu.com TRUE    /   FALSE       delPer  0
10www.baidu.com FALSE   /   FALSE       BDSVRTM 0
11www.baidu.com FALSE   /   FALSE       BD_HOME 0
12
13

第二种方法:


1
2
3
4
5
6
7
8
9
10
1import http.cookiejar,urllib.request
2
3filename='C:/Users/hanson/Desktop/1/cookie1.txt'
4cookie=http.cookiejar.LWPCookieJar(filename)   #把MozillaCookieJar改为LWPCookieJar
5handler=urllib.request.HTTPCookieProcessor(cookie)
6opener=urllib.request.build_opener(handler)
7response=opener.open('http://www.baidu.com')
8cookie.save(ignore_discard=True,ignore_expires=True)
9
10

结果


1
2
3
4
5
6
7
8
9
10
1#LWP-Cookies-2.0
2Set-Cookie3: BAIDUID="AFA15173D5BB3D6F2CA1645B51A149C4:FG=1"; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2087-04-05 06:49:31Z"; version=0
3Set-Cookie3: BIDUPSID=AFA15173D5BB3D6F2CA1645B51A149C4; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2087-04-05 06:49:31Z"; version=0
4Set-Cookie3: H_PS_PSSID=1438_21113_28558_28607_28584_28604_28625_28606; path="/"; domain=".baidu.com"; path_spec; domain_dot; discard; version=0
5Set-Cookie3: PSTM=1552880127; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2087-04-05 06:49:31Z"; version=0
6Set-Cookie3: delPer=0; path="/"; domain=".baidu.com"; path_spec; domain_dot; discard; version=0
7Set-Cookie3: BDSVRTM=0; path="/"; domain="www.baidu.com"; path_spec; discard; version=0
8Set-Cookie3: BD_HOME=0; path="/"; domain="www.baidu.com"; path_spec; discard; version=0
9
10

用cookie打开网址
用哪种cookie保存就用哪种打开


1
2
3
4
5
6
7
8
9
10
1import http.cookiejar,urllib.request
2
3cookie=http.cookiejar.LWPCookieJar() #用哪种cookie保存方式保存,就用哪种cookie类加载
4cookie.load('C:/Users/hanson/Desktop/1/cookie1.txt',ignore_discard=True,ignore_expires=True)
5handler=urllib.request.HTTPCookieProcessor(cookie)
6opener=urllib.request.build_opener(handler)
7response=opener.open('http://www.baidu.com')
8print(response.read().decode('utf-8'))
9
10

异常处理:

父类:URLError
子类:HTTPError


1
2
3
4
1try:
2except
3
4

URL解析

给TA打赏
共{{data.count}}人
人已打赏
安全技术

c++ vector

2022-1-11 12:36:11

安全运维

nginx隐藏版本号

2021-8-18 16:36:11

个人中心
购物车
优惠劵
今日签到
有新私信 私信列表
搜索