1.网页分析
今天要爬取的网址:https://unsplash.com/,这个网站的图片据我观察还是非常精致的。 经过分析发现该网站使用ajax请求页面(AJAX 最大的优点是在不重新加载整个页面的情况下,可以与服务器交换数据并更新部分网页内容。—-菜鸟教程对ajax部分定义),这样的话直接分析原始页面源码来获取图片下载链接好像并不是一个好的方法。我们打开开发者工具,选择Network,在下面选择只查看XHR,之后我们向下滚动页面可以发现如下结果 我们发现每次出现新的图片时,它的请求地址都是这样的
https://unsplash.com/napi/photos?page=3&per_page=12
https://unsplash.com/napi/photos?page=4&per_page=12
https://unsplash.com/napi/photos?page=5&per_page=12
https://unsplash.com/napi/photos?page=6&per_page=12
网址规律显而易见page是请求页面数,per_page为每个页面有12张图片,真正的请求地址
url = 'https://unsplash.com/napi/photos?page={}&per_page=12'.format(i)
再选择开发者工具中的Preview,发现请求得到的信息是json格式,并且在links下存在图片的下载链接 已经得到了链接下载自然不是难题了
2.代码
本文采取多线程的方式爬取
1.初始准备
import requests
import time
import random
import threading
urls = [] # 网页请求地址列表
img_url = [] # 图片下载地址列表
g_lock = threading.Lock() # 初始化锁
# 随机生成user_agent
def choose_header():
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
UserAgent = random.choice(user_agent_list)
return UserAgent
headers = {
'User-Agent': choose_header(),
'access-control-allow-origin': 'https://unsplash.com',
}
2.生成页面地址
# 拼接十个页面地址
def set_url():
"""生成页面url"""
for i in range(1, 11):
url = 'https://unsplash.com/napi/photos?page={}&per_page=12'.format(i)
urls.append(url)
return urls
3.得到图片下载链接
# 公共资源urls和img_url 都需要锁机制进行保护
class GetUrl(threading.Thread):
"""获取每个图片的id和下载路径"""
def run(self):
print("{} is running ".format(threading.current_thread))
while len(urls) > 0:
g_lock.acquire()
url = urls.pop() # 通过pop方法获取页面url
g_lock.release()
req = requests.get(url, headers=headers)
data = req.json()
for i in range(12): # 根据开头获取的信息一页有12张图片
g_lock.acquire()
# 向img_url中添加字典,id为将要保存图片的名字,url为下载链接
img_url.append({'id': data[i]['id'], 'url': data[i]['links']['download']})
g_lock.release()
time.sleep(0.5)
return img_url
4.下载图片
class Download(threading.Thread):
def run(self):
print("{} is running ".format(threading.current_thread))
path = 'xxxxxxx' # 自己的想要保存图片的地址,方便起请提前创建好该地址
while len(img_url) > 0:
g_lock.acquire()
img = img_url.pop() # 通过pop方法获取图片下载url
g_lock.release()
res = requests.get(img.get('url'), headers=headers)
with open(path + "/" + img.get('id') + ".jpg", "wb") as f:
f.write(res.content)
time.sleep(0.5)
5.main函数
if __name__ == '__main__':
set_url() # 生成页面地址函数
get_list = [] # 获取图片链接线程列表
down_list = [] # 下载图片线程列表
for i in range(5):
g = GetUrl()
g.start()
get_list.append(g)
for d in get_list:
d.join()
print("链接下载完成")
for i in range(5):
g = Download()
g.start()
down_list.append(g)
for d in down_list:
d.join()
print("图片下载完成")
ps:本文代码多线程思路和代码结构都非常感谢这位大大https://blog.csdn.net/hihell/article/category/7888014,这位大大爬虫教程写的非常详细易懂,我受益匪浅。 *** 本博客文章仅供博主学习交流,博主才疏学浅,语言表达能力有限,对知识的理解、编写代码能力都有很多不足,希望各路大神多多包涵,多加指点。