Application Scenarios
1. Multiprocessing: CPU-bound programs
2. Multithreading: crawlers (network I/O), local disk I/O
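As a quick illustration (a minimal sketch, not part of the original notes; the URL is just the Xiaomi site used later), the same driver pattern applies to both, and the choice depends on whether the bottleneck is the CPU or I/O:

from multiprocessing import Process
from threading import Thread
import requests

def cpu_task(n):
    # CPU-bound: pure computation, a better fit for multiple processes
    print('cpu_task done:', sum(i * i for i in range(n)))

def io_task(url):
    # I/O-bound: the thread mostly waits on the network, a good fit for threads
    print(url, requests.get(url).status_code)

if __name__ == '__main__':
    p = Process(target=cpu_task, args=(10_000_000,))
    t = Thread(target=io_task, args=('http://app.mi.com/',))
    p.start(); t.start()
    p.join(); t.join()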
Knowledge Review
Queue
# import
from queue import Queue
# usage
q = Queue()
q.put(url)
q.get()     # blocks when the queue is empty
q.empty()   # check whether the queue is empty, True/False
线程模块
# 导入模块 from threading import Thread  # 使用流程  t = Thread(target=函数名) # 创建线程对象 
t.start()# 创建并启动线程 t.join() # 阻塞等待回收线程 
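Combining the two, the worker pattern used by the spiders below looks roughly like this (a minimal sketch with placeholder URLs):

from queue import Queue
from threading import Thread

q = Queue()
for url in ['url1', 'url2', 'url3']:   # placeholder data
    q.put(url)

def worker():
    # each thread keeps pulling URLs until the queue is empty
    while not q.empty():
        url = q.get()
        print('handling', url)         # real code would request and parse here

t_list = [Thread(target=worker) for _ in range(2)]
for t in t_list:
    t.start()
for t in t_list:
    t.join()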
Xiaomi App Store Scraping (Multithreaded)
Goal
 * URL: search Baidu for "小米应用商店" (Xiaomi app store), open the official site, then Categories - 聊天社交 (Chat & Social)
 * Goal: scrape the application name and application link
Implementation Steps
1. Confirm whether the page is dynamically loaded
1. The page refreshes only partially
2. Right-click to view the page source and search for a keyword; it cannot be found, so the site is dynamically loaded and we need to capture and analyze the network packets (a quick programmatic check follows below)
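The same check can also be done in a couple of lines (a rough sketch; the category page URL and the keyword are assumptions, use whatever you see in the browser):

import requests

page_url = 'http://app.mi.com/category/2'   # assumption: the category page opened in the browser
keyword = '微信'                             # assumption: any app name visible on the rendered page
html = requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0'}).text
print('keyword found in static HTML:', keyword in html)   # False -> the data is loaded dynamically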
2. Capture the network packets with F12
1. Grab the URL that returns the json data (Request URL in the Headers tab)
  http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30
2. Inspect the query string parameters (Query String Parameters in the Headers tab): only page changes, 0 1 2 3 ..., so we can build all the json URLs simply by controlling the value of page (see the sketch below)
  page: 1
  categoryId: 2
  pageSize: 30
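As a quick sanity check of this analysis, one page of the json can be requested directly (a minimal sketch; the 'count', 'data', 'displayName' and 'packageName' keys are the ones the spider below relies on):

import requests

url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30'
headers = {'User-Agent': 'Mozilla/5.0'}
# request page 0 and inspect the returned json
html = requests.get(url.format(0), headers=headers).json()
print(html['count'])                 # total number of apps in this category
for app in html['data'][:3]:         # first few entries
    print(app['displayName'], app['packageName'])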
 
3. Save the scraped data to a csv file
Pay attention to locking when multiple threads write to the same file (a usage sketch follows the snippet below)
from threading import Lock
lock = Lock()      # one lock shared by all threads
lock.acquire()     # acquire the lock before writing
lock.release()     # release it after writing
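A minimal sketch of how the lock protects the shared csv writer (file name and row data are placeholders):

import csv
from threading import Lock

lock = Lock()
f = open('demo.csv', 'a', newline='')
writer = csv.writer(f)

def save_rows(rows):
    # only one thread may write to the shared file object at a time
    lock.acquire()
    writer.writerows(rows)
    lock.release()

save_rows([['name1', 'type1', 'link1']])   # placeholder data
f.close()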
Overall approach
 * Create the file object in __init__(self); all threads write to this one object
 * After each thread scrapes a page it writes the data to the file, and the write must be protected by the lock
 * Close the file once all data has been scraped

import requests
from threading import Thread, Lock
from queue import Queue
import time
from lxml import etree
import csv
from fake_useragent import UserAgent

class XiaomiSpider(object):
    def __init__(self):
        self.url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId={}&pageSize=30'
        self.q = Queue()              # queue holding all the URL addresses
        self.i = 0                    # counter for scraped apps
        self.id_list = []             # list of all category ids
        # open the output file; every thread writes to this one object
        self.f = open('xiaomi.csv', 'a', newline="")
        self.writer = csv.writer(self.f)
        self.lock = Lock()            # create the lock
        self.ua = UserAgent()

    def get_cateid(self):
        # request the home page
        url = 'http://app.mi.com/'
        headers = {'User-Agent': self.ua.random}
        html = requests.get(url=url, headers=headers).text
        # parse out every category and its id
        parse_html = etree.HTML(html)
        li_list = parse_html.xpath('//ul[@class="category-list"]/li')
        for li in li_list:
            typ_name = li.xpath('./a/text()')[0]
            typ_id = li.xpath('./a/@href')[0].split('/')[-1]
            pages = self.get_pages(typ_id)          # number of pages in this category
            self.id_list.append((typ_id, pages))
        self.url_in()                               # put the URLs into the queue

    # read the value of count and compute the number of pages
    def get_pages(self, typ_id):
        # every page of json data contains the key "count"
        url = self.url.format(0, typ_id)
        html = requests.get(url=url, headers={'User-Agent': self.ua.random}).json()
        count = html['count']                       # total number of apps in this category
        pages = int(count) // 30 + 1                # 30 apps per page
        return pages

    # put the URLs into the queue
    def url_in(self):
        for id in self.id_list:
            # id is a tuple: (typ_id, pages) --> ('2', pages)
            for page in range(2):                   # only 2 pages per category here; use range(id[1]) for all pages
                url = self.url.format(page, id[0])
                print(url)
                self.q.put(url)                     # enqueue the URL

    # thread worker: get() - request - parse - handle the data
    def get_data(self):
        while True:
            # keep pulling URLs while the queue is not empty
            if not self.q.empty():
                url = self.q.get()
                headers = {'User-Agent': self.ua.random}
                html = requests.get(url=url, headers=headers).json()
                self.parse_html(html)
            else:
                break

    # parse one page of data and write it to the csv file
    def parse_html(self, html):
        app_list = []                               # rows for this page
        for app in html['data']:
            # app name + link + category
            name = app['displayName']
            link = 'http://app.mi.com/details?id=' + app['packageName']
            typ_name = app['level1CategoryName']
            # collect every row in app_list so we can use writerows()
            app_list.append([name, typ_name, link])
            print(name, typ_name)
            self.i += 1
        # write this page of rows; the shared writer must be locked
        self.lock.acquire()
        self.writer.writerows(app_list)
        self.lock.release()

    # main function
    def main(self):
        self.get_cateid()                           # fill the URL queue
        t_list = []
        # create the worker threads (1 here; increase the range for more threads)
        for i in range(1):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()
        # join all the threads
        for t in t_list:
            t.join()
        # close the file
        self.f.close()
        print('count:', self.i)

if __name__ == '__main__':
    start = time.time()
    spider = XiaomiSpider()
    spider.main()
    end = time.time()
    print('runtime: %.2f' % (end - start))
Tencent Recruitment Data Scraping (Ajax)
Determine the URL addresses and the goal
 * URL: search Baidu for "腾讯招聘" (Tencent careers) and open the job search page: https://careers.tencent.com/search.html
 * Goal: job title, responsibilities, requirements
Requirements and analysis
 * Viewing the page source shows that all of the required data is loaded dynamically via Ajax
 * Capture the network packets with F12 and analyze them
 * First-level page: job title
 * Second-level page: responsibilities and requirements
First-level page json URL (pageIndex changes; the timestamp was not examined)
https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn
Second-level page URL (postId changes; it can be taken from the first-level page, see the sketch below)
https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn
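A quick sketch of how the two levels fit together (the 'Data', 'Posts', 'PostId', 'RecruitPostName' and 'Responsibility' keys are the same ones the spider code below uses):

import requests

one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn'
headers = {'User-Agent': 'Mozilla/5.0'}

# first-level page: the job list, each post carries a PostId
html = requests.get(one_url.format(1), headers=headers).json()
for job in html['Data']['Posts'][:2]:
    print(job['RecruitPostName'])
    # second-level page: duty and requirement for that PostId
    detail = requests.get(two_url.format(job['PostId']), headers=headers).json()
    print(detail['Data']['Responsibility'][:30])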
The useragents.py file
ua_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
]
Let's first recall the original (single-threaded) Tencent recruitment spider code
import time
import json
import random
import requests
from useragents import ua_list

class TencentSpider(object):
    def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn'
        self.f = open('tencent.json', 'a')   # open the output file
        self.item_list = []                  # holds the scraped item dicts

    # fetch the response and return it as Python data
    def get_page(self, url):
        headers = {'User-Agent': random.choice(ua_list)}
        html = requests.get(url=url, headers=headers).text
        html = json.loads(html)              # json string -> Python data type
        return html

    # main routine: scrape all the data from one listing page
    def parse_page(self, one_url):
        html = self.get_page(one_url)
        for job in html['Data']['Posts']:
            item = {}                        # a new dict for every job, so the list does not hold repeated references
            item['name'] = job['RecruitPostName']   # job title
            post_id = job['PostId']                 # postId, used to build the second-level URL
            # build the second-level URL and fetch duty / requirements
            two_url = self.two_url.format(post_id)
            item['duty'], item['require'] = self.parse_two_page(two_url)
            print(item)
            self.item_list.append(item)             # collect into the big list

    # parse the second-level page
    def parse_two_page(self, two_url):
        html = self.get_page(two_url)
        duty = html['Data']['Responsibility']                     # responsibilities
        duty = duty.replace('\r\n', '').replace('\n', '')         # strip newlines
        require = html['Data']['Requirement']                     # requirements
        require = require.replace('\r\n', '').replace('\n', '')   # strip newlines
        return duty, require

    # get the total number of pages
    def get_numbers(self):
        url = self.one_url.format(1)
        html = self.get_page(url)
        numbers = int(html['Data']['Count']) // 10 + 1   # 10 posts per page
        return numbers

    def main(self):
        number = self.get_numbers()
        for page in range(1, 3):   # only 2 pages here; use range(1, number + 1) for all pages
            one_url = self.one_url.format(page)
            self.parse_page(one_url)
        # save to a local json file: json.dump
        json.dump(self.item_list, self.f, ensure_ascii=False)
        self.f.close()

if __name__ == '__main__':
    start = time.time()
    spider = TencentSpider()
    spider.main()
    end = time.time()
    print('runtime: %.2f' % (end - start))
Multithreaded implementation
Multithreading here means putting all of the first-level page URLs into a queue and scraping them with multiple threads
Code
import requests
import json
import time
import random
from useragents import ua_list
from threading import Thread
from queue import Queue

class TencentSpider(object):
    def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn'
        self.q = Queue()
        self.i = 0                                   # counter

    # fetch the response and return it as Python data
    def get_page(self, url):
        headers = {'User-Agent': random.choice(ua_list)}
        html = requests.get(url=url, headers=headers).text
        # json.loads() turns the json string into a Python data type
        html = json.loads(html)
        return html

    # main routine: scrape all the data
    def parse_page(self):
        while True:
            if not self.q.empty():
                one_url = self.q.get()
                html = self.get_page(one_url)
                for job in html['Data']['Posts']:
                    item = {}                                   # a new dict for every job
                    item['name'] = job['RecruitPostName']       # job title
                    post_id = job['PostId']                     # postId, used to build the second-level URL
                    # build the second-level URL and fetch duty / requirements
                    two_url = self.two_url.format(post_id)
                    item['duty'], item['require'] = self.parse_two_page(two_url)
                    print(item)
                    self.i += 1
                # sleep for a random interval after finishing one page
                time.sleep(random.uniform(0, 1))
            else:
                break

    # parse the second-level page
    def parse_two_page(self, two_url):
        html = self.get_page(two_url)
        # strip special characters with replace
        duty = html['Data']['Responsibility']
        duty = duty.replace('\r\n', '').replace('\n', '')
        # requirements
        require = html['Data']['Requirement']
        require = require.replace('\r\n', '').replace('\n', '')
        return duty, require

    # get the total number of pages
    def get_numbers(self):
        url = self.one_url.format(1)
        html = self.get_page(url)
        numbers = int(html['Data']['Count']) // 10 + 1
        return numbers

    def main(self):
        # put the first-level URLs into the queue
        number = self.get_numbers()
        for page in range(1, number + 1):
            one_url = self.one_url.format(page)
            self.q.put(one_url)
        t_list = []
        for i in range(5):
            t = Thread(target=self.parse_page)
            t_list.append(t)
            t.start()
        for t in t_list:
            t.join()
        print('count:', self.i)

if __name__ == '__main__':
    start = time.time()
    spider = TencentSpider()
    spider.main()
    end = time.time()
    print('runtime: %.2f' % (end - start))
Multiprocess implementation
import requests
import json
import time
import random
from useragents import ua_list
# use multiprocessing.Queue so the worker processes share one queue
# (a plain queue.Queue would be copied into each child process)
from multiprocessing import Process, Queue

class TencentSpider(object):
    def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn'
        self.q = Queue()

    # fetch the response and return it as Python data
    def get_page(self, url):
        headers = {'User-Agent': random.choice(ua_list)}
        html = requests.get(url=url, headers=headers).text
        # json string -> Python
        html = json.loads(html)
        return html

    # main routine: scrape all the data
    def parse_page(self):
        while True:
            if not self.q.empty():
                one_url = self.q.get()
                html = self.get_page(one_url)
                for job in html['Data']['Posts']:
                    item = {}                                   # a new dict for every job
                    # job title
                    item['name'] = job['RecruitPostName']
                    # postId
                    post_id = job['PostId']
                    # build the second-level URL and fetch duty / requirements
                    two_url = self.two_url.format(post_id)
                    item['duty'], item['require'] = self.parse_two_page(two_url)
                    print(item)
            else:
                break

    # parse the second-level page
    def parse_two_page(self, two_url):
        html = self.get_page(two_url)
        # strip special characters with replace
        duty = html['Data']['Responsibility']
        duty = duty.replace('\r\n', '').replace('\n', '')
        require = html['Data']['Requirement']
        require = require.replace('\r\n', '').replace('\n', '')
        return duty, require

    # get the total number of pages
    def get_numbers(self):
        url = self.one_url.format(1)
        html = self.get_page(url)
        numbers = int(html['Data']['Count']) // 10 + 1
        return numbers

    def main(self):
        # put the first-level URLs into the queue
        number = self.get_numbers()
        for page in range(1, number + 1):
            one_url = self.one_url.format(page)
            self.q.put(one_url)
        t_list = []
        for i in range(4):
            t = Process(target=self.parse_page)
            t_list.append(t)
            t.start()
        for t in t_list:
            t.join()

if __name__ == '__main__':
    start = time.time()
    spider = TencentSpider()
    spider.main()
    end = time.time()
    print('runtime: %.2f' % (end - start))
 