Python crawler basics (collecting all public works of a Kuaishou user)
- What the code does, as the title says: given a Kuaishou user's id, crawl all of that user's public works, both image albums and videos.
- How it works: use the DevTools bundled with any Chromium-based browser to go through the requests the page makes, find the one that carries the links to the works, reproduce that request in code to get the data, then download each work from its URL. The same trick covers automated registration, login, and other site interactions; anyone who has written a crawler will recognize the workflow. A minimal replay sketch follows this list.
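To make that concrete, here is a minimal sketch of replaying a request captured in the DevTools Network panel. The URL, cookie, and header values are placeholders of mine, not Kuaishou's real ones.

```python
import requests

# Replay a request found in DevTools: open the Network panel, locate the
# request that returns the data, and copy its URL and headers here.
# Everything below is a placeholder, not a real endpoint.
url = "https://example.com/api/feeds"        # paste the real URL from DevTools
headers = {
    "User-Agent": "Mozilla/5.0",             # send what the browser sent
    "Cookie": "did=PLACEHOLDER",             # include session cookies if needed
}
res = requests.get(url, headers=headers, timeout=10)
res.raise_for_status()
print(res.status_code, res.headers.get("Content-Type"))
```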
Core code
- Without further ado, here it is:
```python
# module-level imports assumed: import json, os, re, time and the requests package
def __crawl_user(self, uid):
    if uid.isdigit():
        uid = self.__switch_id(uid)
    payload = {"operationName": "privateFeedsQuery",
               "variables": {"principalId": uid, "pcursor": "", "count": 999},
               "query": "query privateFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\n privateFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\n pcursor\n list {\n id\n thumbnailUrl\n poster\n workType\n type\n useVideoPlayer\n imgUrls\n imgSizes\n magicFace\n musicName\n caption\n location\n liked\n onlyFollowerCanComment\n relativeHeight\n timestamp\n width\n height\n counts {\n displayView\n displayLike\n displayComment\n __typename\n }\n user {\n id\n eid\n name\n avatar\n __typename\n }\n expTag\n __typename\n }\n __typename\n }\n}\n"}
    res = requests.post(self.__data_url, headers=self.__headers, json=payload)
    works = json.loads(res.content.decode(encoding='utf-8', errors='strict'))['data']['privateFeeds']['list']
    if not os.path.exists("../data"):
        os.makedirs("../data")
    # these two lines dump the response to a JSON file for offline analysis
    # with open("data/" + uid + ".json", "w") as fp:
    #     fp.write(json.dumps(works, indent=2))
    # if the user is currently live, the first item is the live stream and
    # its fields are None, so drop it
    if works[0]['id'] is None:
        works.pop(0)
    name = re.sub(r'[\\/:*?"<>|\r\n]+', "", works[0]['user']['name'])
    dir = "data/" + name + "(" + uid + ")/"
    # print(len(works))
    if not os.path.exists(dir):
        os.makedirs(dir)
    # if not os.path.exists(dir + ".list"):
    #     print("")
    print("开始爬取用户 " + name + ",保存在目录 " + dir)
    print(" 共有" + str(len(works)) + "个作品")
    for j in range(len(works)):
        self.__crawl_work(uid, dir, works[j], j + 1)
        time.sleep(1)
    print("用户 " + name + "爬取完成!")
    print()
    time.sleep(1)
```
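One thing the method above sidesteps: it asks for count 999 in a single request instead of paging. The query does request the pcursor field, so a more robust variant would loop on it. A hedged sketch, where the endpoint, headers, and the exact end-of-feed marker are assumptions rather than verified values:

```python
import requests

def fetch_all_works(data_url, headers, uid, query):
    """Page through privateFeeds with the returned pcursor instead of
    relying on count=999. Field names come from the GraphQL query above;
    the "no_more" stop marker is an assumption."""
    works, pcursor = [], ""
    while True:
        payload = {"operationName": "privateFeedsQuery",
                   "variables": {"principalId": uid, "pcursor": pcursor, "count": 100},
                   "query": query}
        res = requests.post(data_url, headers=headers, json=payload)
        res.raise_for_status()
        feeds = res.json()["data"]["privateFeeds"]
        works.extend(feeds["list"])
        pcursor = feeds["pcursor"]
        if not pcursor or pcursor == "no_more":  # assumed end-of-feed marker
            return works
```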
Kuaishou works come in five types, exposed on each work as the workType property (summarized in the sketch after this list, and handled in full by the method that follows):
- Two album types: vertical and multiple, i.e. a stitched long image and a multi-image set; all image links are in imgUrls
- A single image: single; the image link is also in imgUrls
- Karaoke: ksong; image links work the same way, and crawling the audio is not attempted...
- Video: video; the HTML has to be parsed to obtain the video link
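In miniature, the branching that `__crawl_work` performs looks like this; the handler labels are mine, only the workType values come from the API:

```python
# Four image-like types share one download path; video needs a second HTML
# request. Labels are illustrative, not names from the project.
IMAGE_TYPES = {"vertical", "multiple", "single", "ksong"}

def classify(work):
    if work["workType"] in IMAGE_TYPES:
        return "images"   # download every URL in work['imgUrls']
    if work["workType"] == "video":
        return "video"    # fetch the share page, regex out srcNoMark
    return "unknown"
```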
```python
def __crawl_work(self, uid, dir, work, wdx):
    w_type = work['workType']
    w_caption = re.sub(r"\s+", " ", work['caption'])
    w_name = re.sub(r'[\\/:*?"<>|\r\n]+', "", w_caption)[0:24]
    w_time = time.strftime('%Y-%m-%d', time.localtime(work['timestamp'] / 1000))
    if w_type == 'vertical' or w_type == 'multiple' or w_type == "single" or w_type == 'ksong':
        w_urls = work['imgUrls']
        l = len(w_urls)
        print(" " + str(wdx) + ")图集作品:" + w_caption + "," + "共有" + str(l) + "张图片")
        for i in range(l):
            p_name = w_time + "_" + w_name + "_" + str(i + 1) + ".jpg"
            pic = dir + p_name
            if not os.path.exists(pic):
                r = requests.get(w_urls[i])
                r.raise_for_status()
                with open(pic, "wb") as f:
                    f.write(r.content)
                print(" " + str(i + 1) + "/" + str(l) + " 图片 " + p_name + " 下载成功 √")
            else:
                print(" " + str(i + 1) + "/" + str(l) + " 图片 " + p_name + " 已存在 √")
    elif w_type == 'video':
        w_url = self.__work_url + work['id']
        res = requests.get(w_url, headers=self.__headers_mobile,
                           params={"fid": 1841409882, "cc": "share_copylink", "shareId": "143108986354"})
        html = res.text
        waitreplace = work['id'] + '".*?"srcNoMark":"(.*?)"'
        v_url = re.findall(waitreplace, html)
        # earlier BeautifulSoup-based extraction, kept commented for reference:
        # pattern = re.compile(r"playUrl", re.MULTILINE | re.DOTALL)
        # script = soup.find("script", text=pattern)
        # s = pattern.search(script.text).string
        # v_url = s.split('playUrl":"')[1].split('.mp4')[0].encode('utf-8').decode('unicode-escape') + '.mp4'
        try:
            print(" " + str(wdx) + ")视频作品:" + w_caption)
        except Exception:
            # printing the caption can fail (e.g. console encoding); skip it
            print(" 这里似乎有点小错误,已跳过")
        v_name = w_time + "_" + w_name + ".mp4"
        video = dir + v_name
        if v_url:
            if not os.path.exists(video):
                r = requests.get(v_url[0])
                r.raise_for_status()
                with open(video, "wb") as f:
                    f.write(r.content)
                print(" 视频 " + v_name + " 下载成功 √")
            else:
                print(" 视频 " + v_name + " 已存在 √")
        else:
            print("未找到视频")
    else:
        print("错误的类型")
```
Notes:
- A selectable-list batch-download feature is not planned
- Reasonable feature requests can be raised as issues; I will consider them when I see them
- If you have your own requirements, feel free to take the code and adapt it; if you like it, leave a star and a follow
- This code is for learning only; do not crawl videos in violation of the law or repost them without permission, at your own risk
Project source code: https://github.com/oGsLP/kuaishou-crawler