10/08/2017 – 一江春水向东流

python3爬虫学习笔记之urllib库的使用

python3爬虫学习笔记之urllib库的使用基本使用

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Basic usage: fetch a page with urllib.request.urlopen and print its body.

__author__ = 'JustFantasy'

import urllib.request

request_url = 'http://www.baidu.com'  # URL to request

# Send the request.  The with-block closes the underlying connection even
# if reading or decoding the body raises.
with urllib.request.urlopen(request_url) as response:
    # read() returns raw bytes; decode them as UTF-8 before printing.
    print(response.read().decode('utf-8'))

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Basic usage: request a URL and print the response body as text.
# (The coding declaration must be on the first or second line to take effect.)

__author__ = 'JustFantasy'

import urllib.request

request_url = 'http://www.baidu.com'  # URL to request

# Use the response as a context manager so the connection is always closed.
with urllib.request.urlopen(request_url) as response:
    body = response.read()        # raw bytes of the response body
    print(body.decode('utf-8'))   # decode as UTF-8 and print

urlopen返回对象提供方法： read(), readline(), readlines(), fileno(), close()：对HTTPResponse类型数据进行操作 info()：返回HTTPMessage对象，表示远程服务器返回的头信息 getcode()：返回Http状态码。如果是http请求，200请求成功完成、404网址未找到等等 geturl()：返回请求的url 请求的数据传送 GET数据

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sending GET data: encode a dict of parameters into the URL's query string.

__author__ = 'JustFantasy'

import urllib.request
import urllib.parse

get_data = {'username': 'aaa', 'password': 'bbb'}   # GET parameters as a dict
get_data_encode = urllib.parse.urlencode(get_data)  # encode as "key=value&key=value"

# The query string has to come BEFORE the "#fragment".  Everything after
# "#" is a client-side fragment that is not sent to the server, so
# appending "?..." to ".../#signin" would silently drop the parameters.
# Split the URL into components and fill in the query part instead.
url_parts = urllib.parse.urlsplit('https://www.zhihu.com/#signin')
request_url = urllib.parse.urlunsplit(url_parts._replace(query=get_data_encode))

# https://www.zhihu.com/?username=aaa&password=bbb#signin
print(request_url)

# Send the request; the with-block guarantees the connection is closed.
with urllib.request.urlopen(request_url) as response:
    print(response.read().decode('utf-8'))  # decode the body as UTF-8 and print

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sending GET data: the parameters travel in the query string of the URL.

__author__ = 'JustFantasy'

import urllib.request
import urllib.parse

get_data = {'username': 'aaa', 'password': 'bbb'}   # GET parameters as a dict
get_data_encode = urllib.parse.urlencode(get_data)  # URL-encode the parameters

# A query placed after "#signin" would be part of the fragment and never
# reach the server, so insert it into the query component of the URL
# (which sits before the fragment).
base_parts = urllib.parse.urlsplit('https://www.zhihu.com/#signin')
request_url = urllib.parse.urlunsplit(base_parts._replace(query=get_data_encode))

# https://www.zhihu.com/?username=aaa&password=bbb#signin
print(request_url)

# Send the request and always release the connection afterwards.
with urllib.request.urlopen(request_url) as response:
    print(response.read().decode('utf-8'))  # decode the body as UTF-8 and print

POST数据

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sending POST data: pass the encoded form body as urlopen's `data` argument.

__author__ = 'JustFantasy'

import urllib.request
import urllib.parse

post_data = {'first': 'true', 'pn': 1, 'kd': 'Python'}  # POST fields as a dict
post_data_encode = urllib.parse.urlencode(post_data)    # encode as a form string

# urlopen requires the body as bytes; passing a str raises:
# "POST data should be bytes or an iterable of bytes. It cannot be of type str."
post_data_encode = post_data_encode.encode(encoding='utf-8')
request_url = 'http://www.lagou.com/jobs/positionAjax.json?'  # URL to request

# Supplying `data` (default None) turns the request into a POST.
# `timeout` is in seconds; the default is socket._GLOBAL_DEFAULT_TIMEOUT.
# Keyword arguments make the meaning of the bare "3" explicit, and the
# with-block closes the connection even on error.
with urllib.request.urlopen(request_url, data=post_data_encode, timeout=3) as response:
    print(response.read().decode('utf-8'))  # decode the body as UTF-8 and print

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sending POST data with urlopen(url, data=..., timeout=...).

__author__ = 'JustFantasy'

import urllib.request
import urllib.parse

post_data = {'first': 'true', 'pn': 1, 'kd': 'Python'}  # POST fields as a dict

# URL-encode the fields, then convert to bytes: urlopen rejects a str body
# with "POST data should be bytes or an iterable of bytes. It cannot be of
# type str."
post_data_encode = urllib.parse.urlencode(post_data).encode(encoding='utf-8')

request_url = 'http://www.lagou.com/jobs/positionAjax.json?'  # URL to request

# data: the POST body (None, the default, would issue a GET).
# timeout: seconds to wait; defaults to socket._GLOBAL_DEFAULT_TIMEOUT.
# The response is used as a context manager so it is always closed.
with urllib.request.urlopen(request_url, data=post_data_encode, timeout=3) as response:
    print(response.read().decode('utf-8'))  # decode the body as UTF-8 and print

使用Request &&　设置Headers属性很多网站对非浏览器的访问都做了限制，所以如果我们要完全模拟浏览器去访问网站的话，必须要先设置Headers的属性使用chrome浏览器按F12，点击访问的链接，Headers -> Request Headers User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 如果服务器对页面访问来源做了限制，则需要设置Headers的Referer属性

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Using urllib.request.Request to send custom headers (User-Agent, Referer)
# together with POST data.

__author__ = 'JustFantasy'

import urllib.request
import urllib.parse

user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87'
referer = 'http://www.lagou.com/jobs/positionAjax.json?'
post_data = {'first': 'true', 'pn': 1, 'kd': 'Python'}    # POST fields as a dict
headers = {'User-Agent': user_agent, 'Referer': referer}  # request headers
post_data_encode = urllib.parse.urlencode(post_data)      # encode as a form string

# The body must be bytes; passing a str raises:
# "POST data should be bytes or an iterable of bytes. It cannot be of type str."
post_data_encode = post_data_encode.encode(encoding='utf-8')
request_url = 'http://www.lagou.com/zhaopin/Python/?labelWords=label'  # URL to request

# Build a Request object so the headers can be attached; keyword arguments
# make it clear which value is the body and which the headers.
request = urllib.request.Request(request_url, data=post_data_encode, headers=headers)

# The with-block closes the connection even if reading/decoding fails.
with urllib.request.urlopen(request) as response:
    print(response.read().decode('utf-8'))  # decode the body as UTF-8 and print

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Attach browser-like headers to a POST request via urllib.request.Request.

__author__ = 'JustFantasy'

import urllib.request
import urllib.parse

user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87'
referer = 'http://www.lagou.com/jobs/positionAjax.json?'

post_data = {'first': 'true', 'pn': 1, 'kd': 'Python'}    # POST fields as a dict
headers = {'User-Agent': user_agent, 'Referer': referer}  # request headers

# URL-encode the fields and convert to bytes; a str body is rejected with
# "POST data should be bytes or an iterable of bytes. It cannot be of type str."
post_data_encode = urllib.parse.urlencode(post_data).encode(encoding='utf-8')

request_url = 'http://www.lagou.com/zhaopin/Python/?labelWords=label'  # URL to request

# Headers can only be set through a Request object.
request = urllib.request.Request(request_url, data=post_data_encode, headers=headers)

# Use the response as a context manager so it is always closed.
with urllib.request.urlopen(request) as response:
    print(response.read().decode('utf-8'))  # decode the body as UTF-8 and print

Proxy（代理）的设置如果网站限制了IP访问的次数，则需要更换代理服务器，以免被禁止访问

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Routing requests through an HTTP proxy with ProxyHandler.

__author__ = 'JustFantasy'

from urllib import request

request_url = 'http://www.lagou.com/jobs/positionAjax.json?'
proxy = request.ProxyHandler({'http': '61.136.115.147:3128'})  # proxy for http:// URLs
opener = request.build_opener(proxy)                           # opener that uses the proxy handler
request.install_opener(opener)                                 # make it the default for urlopen()

# The with-block closes the connection even if reading/decoding fails.
with request.urlopen(request_url) as response:
    print(response.read().decode('utf-8'))  # decode the body as UTF-8 and print

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Send a request through a proxy server.

__author__ = 'JustFantasy'

from urllib import request

request_url = 'http://www.lagou.com/jobs/positionAjax.json?'

# Map the URL scheme to the proxy address, build an opener around that
# handler, and install it so that plain urlopen() calls go through it.
proxy = request.ProxyHandler({'http': '61.136.115.147:3128'})
opener = request.build_opener(proxy)
request.install_opener(opener)

# Use the response as a context manager so it is always closed.
with request.urlopen(request_url) as response:
    print(response.read().decode('utf-8'))  # decode the body as UTF-8 and print

异常处理

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Handling urllib errors when a request (here routed through a proxy) fails.

__author__ = 'JustFantasy'

from urllib import request
from urllib import error        # exception classes raised by urllib.request

request_url = 'http://www.lagou.com/jobs/positionAjax.json?'
proxy = request.ProxyHandler({'http': '116.204.1.111:8081'})  # proxy for http:// URLs
opener = request.build_opener(proxy)                          # opener that uses the proxy handler
request.install_opener(opener)                                # make it the default for urlopen()
try:
    response = request.urlopen(request_url)
except error.HTTPError as e:
    # The server answered with an error status.  HTTPError is a subclass of
    # URLError, so it must be caught first.
    print(e.code)                   # HTTP status code
    print(e.reason)                 # reason phrase (same value as e.msg)
except error.URLError as e:
    # No HTTP response at all (proxy unreachable, DNS failure, connection
    # refused, ...).  Without this branch such failures crash the script.
    print(e.reason)
else:
    response.close()                # release the connection on success

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Catching the exceptions urlopen can raise.

__author__ = 'JustFantasy'

from urllib import request
from urllib import error  # exception classes raised by urllib.request

request_url = 'http://www.lagou.com/jobs/positionAjax.json?'

proxy = request.ProxyHandler({'http': '116.204.1.111:8081'})  # proxy for http:// URLs
opener = request.build_opener(proxy)                          # opener that uses the proxy handler
request.install_opener(opener)                                # make it the default for urlopen()

# The try/except bodies must be indented; without indentation the snippet
# is a syntax error.
try:
    response = request.urlopen(request_url)
except error.HTTPError as e:
    # Server returned an error status.  HTTPError subclasses URLError, so
    # this branch has to come before the URLError one.
    print(e.code)    # HTTP status code
    print(e.reason)  # reason phrase (same value as e.msg)
except error.URLError as e:
    # The request never got an HTTP response (e.g. the proxy is down).
    print(e.reason)
else:
    response.close()  # release the connection on success

By sean, 8 years10/08/2017 ago

October 8, 2017

未分类

python3爬虫学习笔记之urllib库的使用