scrapy爬虫的体系架构(scrapy爬虫之模拟登陆)

一、126、163 邮箱模拟登录

scrapy爬虫的体系架构(scrapy爬虫之模拟登陆)(1)

# -*- coding:utf-8 -*-
import time

from selenium import webdriver
from selenium.webdriver.common.by import By


def login126_or_163emall(url):
    """Log in to a NetEase 126/163 mailbox through Chrome and print a cookie.

    The account name and password are read interactively from stdin.

    :param url: login page to open, e.g. "https://mail.163.com/" or
        "https://mail.126.com/".
    :return: the list of cookie dicts reported by the browser after the
        login button has been clicked (may be empty).
    """
    login_name = input("请输入账号:")
    login_password = input("请输入密码:")

    # Raw string: "\c" is not a valid escape sequence in a normal literal.
    # NOTE(review): recent Selenium 4 releases expect a Service object instead
    # of ``executable_path`` -- confirm against the installed version.
    driver = webdriver.Chrome(executable_path=r"D:\chromedriver.exe")
    try:
        driver.maximize_window()
        driver.get(url=url)
        time.sleep(10)

        # Switch the login form to password mode.
        driver.find_element(By.ID, "lbNormal").click()

        # The login form is nested in an iframe, so switch into it before
        # looking up the input fields.
        login_frame = driver.find_element(
            By.CSS_SELECTOR, "iframe[id^='x-URS-iframe']")
        driver.switch_to.frame(login_frame)

        # Account field: the "@126.com" / "@163.com" suffix is not typed.
        driver.find_element(By.NAME, "email").send_keys(login_name)
        driver.find_element(By.NAME, "password").send_keys(login_password)
        time.sleep(3)

        driver.find_element(By.ID, "dologin").click()
        time.sleep(5)

        # get_cookies() returns a list of dicts; only the first one is
        # printed, and an empty list no longer raises IndexError.
        cookies = driver.get_cookies()
        print(cookies[0] if cookies else None)
        time.sleep(10)
        return cookies
    finally:
        # Always shut down the browser and the chromedriver process, even
        # when one of the steps above fails.
        driver.quit()


if __name__ == '__main__':
    # url = "https://mail.126.com/"  # 126 mailbox
    url = "https://mail.163.com/"  # 163 mailbox
    login126_or_163emall(url=url)

输出的cookies如下:

""" {'domain': '.163.com', 'expiry': 4717308714, 'httpOnly': False, 'name': '_ntes_nnid', 'path': '/', 'secure': False, 'value': 'cf36cf83b0562fccb3ab872f3b1dfa4c,1563708714807'} """

二、B站模拟登录

scrapy爬虫的体系架构(scrapy爬虫之模拟登陆)(2)

import random
import time
from io import BytesIO

from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

border = 6  # distance from the slider's left border to the captcha image's left border


class CrackGeetest():
    """Drive Chrome through the bilibili login page and its Geetest slider captcha."""

    def __init__(self):
        self.url = 'https://passport.bilibili.com/login'
        self.browser = webdriver.Chrome(r"D:\chromedriver.exe")
        # Maximize the window before any element positions are read.
        self.browser.maximize_window()
        self.wait = WebDriverWait(self.browser, timeout=5)

    def close(self):
        """Close the browser window and stop the chromedriver process."""
        self.browser.close()
        self.browser.quit()

    def get_geetest_image(self, name='captcha.png'):
        """Screenshot the page and crop it to the captcha element.

        :param name: file name used when saving the cropped captcha.
        :return: the cropped captcha as a PIL image.
        """
        img = self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '/html/body/div[2]/div[2]/div[6]/div/div[1]/div[1]/div/a/div[1]')))
        time.sleep(2)
        location = img.location  # element position
        size = img.size  # element dimensions
        print(location, size)
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        print('验证码位置', top, bottom, left, right)
        # Take a screenshot of the current window (PNG bytes) and open it
        # from memory.
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        screenshot.save(r"D:\photo\image\screenshot.png")
        # Crop the screenshot to the captcha's bounding box and save it.
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(r"D:\photo\image\%s" % name)
        return captcha

    def get_gap(self, img1, img2):
        """Return the x coordinate of the first column where the images differ.

        Columns left of x = 60 are skipped. If no differing pixel is found,
        60 is returned.

        :param img1: first captcha image.
        :param img2: second captcha image, same size as ``img1``.
        """
        left = 60  # starting column of the scan
        for i in range(left, img1.size[0]):
            for j in range(img1.size[1]):
                if not self.is_pixel_equal(img1, img2, i, j):
                    return i
        return left

    def is_pixel_equal(self, img1, img2, x, y):
        """Return True if the pixels at (x, y) of both images are close.

        Two pixels count as equal when each of their R, G and B channels
        differs by less than the threshold of 60.
        """
        pix1 = img1.load()[x, y]
        pix2 = img2.load()[x, y]
        threshold = 60
        pix_r = abs(pix1[0] - pix2[0])
        pix_g = abs(pix1[1] - pix2[1])
        pix_b = abs(pix1[2] - pix2[2])
        return (pix_r < threshold) and (pix_g < threshold) and (pix_b < threshold)

    def get_track(self, distance):
        """Build a list of per-step x offsets covering ``distance`` plus 5.

        The motion accelerates (a = 2) until 3/4 of ``distance`` has been
        covered and decelerates (a = -3) afterwards. The extra 5 units are
        a deliberate overshoot that ``move_to_gap`` walks back.

        :param distance: distance to the gap.
        :return: list of rounded step offsets.
        """
        track = []
        current = 0  # displacement so far
        mid = distance * 3 / 4  # point at which deceleration starts
        t = random.randint(2, 3) / 10  # time interval of one step
        v = 0  # initial velocity
        distance += 5  # overshoot the gap
        while current < distance:
            if current < mid:
                a = 2
            else:
                a = -3
            v0 = v
            v = v0 + a * t  # v = v0 + a*t
            x = v0 * t + 1 / 2 * a * t * t  # x = v0*t + 0.5*a*t^2
            current += x
            track.append(round(x))
        return track

    def move_to_gap(self, slider, tracks):
        """Drag the slider along ``tracks`` and release it.

        :param slider: the slider element.
        :param tracks: list of x offsets; it is no longer modified.
        """
        # The offsets are applied in random order; shuffle a copy so the
        # caller's list is left untouched.
        steps = list(tracks)
        random.shuffle(steps)
        # Press and hold the left mouse button on the slider.
        ActionChains(self.browser).click_and_hold(slider).perform()
        for x in steps:
            # Move the mouse by (x, 0) relative to its current position.
            ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
        # Imitate a human who overshoots the gap and comes back: five
        # one-pixel moves to the left separated by short and random pauses.
        action = ActionChains(self.browser).move_by_offset(xoffset=-1, yoffset=0)
        time.sleep(0.015)
        action.perform()
        time.sleep(random.randint(6, 10) / 10)
        action.perform()
        time.sleep(0.04)
        action.perform()
        time.sleep(random.randint(6, 10) / 10)
        action.perform()
        time.sleep(0.019)
        action.perform()
        time.sleep(random.randint(6, 10) / 10)
        ActionChains(self.browser).move_by_offset(xoffset=1, yoffset=0).perform()
        # Imitate the jitter that happens when the mouse button is released.
        ActionChains(self.browser).move_by_offset(xoffset=-3, yoffset=0).perform()
        ActionChains(self.browser).move_by_offset(xoffset=2, yoffset=0).perform()
        time.sleep(0.5)
        # Release the mouse button.
        ActionChains(self.browser).release().perform()

    def _attempt(self):
        """Run one full login attempt; raises if any step fails or times out."""
        self.browser.get(self.url)
        # User name and password input boxes.
        emall = self.wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@id="login-username"]')))[0]
        password = self.wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@id="login-passwd"]')))[0]
        emall.send_keys("15612345678")
        password.send_keys("1234567890")
        # Click the login button so the captcha is displayed.
        loginbutton = self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="geetest-wrap"]/ul/li[5]/a[1]')))
        loginbutton.click()
        # Wait for the captcha container to be present.
        self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '/html/body/div[2]/div[2]/div[6]/div')))
        # The draggable slider.
        slider = self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, '/html/body/div[2]/div[2]/div[6]/div/div[1]/div[2]/div[2]')))
        # First capture: the captcha as initially displayed.
        image1 = self.get_geetest_image('captcha1.png')
        # ===== Run JavaScript in the current window (the original captcha
        # image is split into many tiles) =====
        # Method 1: make the "fullbg" canvas visible.
        element = self.wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'canvas.geetest_canvas_fullbg.geetest_fade.geetest_absolute')))
        self.browser.execute_script(
            "arguments[0].style=arguments[1]", element, "display: block;")
        # Method 2 (recommended by the author):
        # self.browser.execute_script('document.querySelectorAll("canvas")[2].style=""')  # captcha with the missing piece
        # self.browser.execute_script('document.querySelectorAll("canvas")[3].style=""')  # complete captcha
        # Second capture: taken after the "fullbg" canvas has been shown.
        image2 = self.get_geetest_image('captcha2.png')
        # Locate the gap and subtract the slider's left margin.
        gap = self.get_gap(image1, image2)
        gap -= border
        print('滑块的位置', gap)
        track = self.get_track(gap)
        self.move_to_gap(slider, track)
        time.sleep(1)
        # Wait for the page to report that verification succeeded.
        success = self.wait.until(EC.text_to_be_present_in_element(
            (By.XPATH, '/html/body/div[2]/div[2]/div[3]/div[2]'), '验证成功'))
        print(success)
        time.sleep(5)

    def crack(self, max_retries=3):
        """Try to log in, retrying a bounded number of times.

        The browser is always closed before this method returns, whether
        the login succeeded, every attempt failed, or the run was
        interrupted.

        :param max_retries: maximum number of attempts.
        :return: True if one attempt succeeded, False otherwise.
        """
        try:
            for _ in range(max_retries):
                try:
                    self._attempt()
                except Exception as exc:
                    # Report why the attempt failed instead of hiding it.
                    print('Failed-Retry', repr(exc))
                else:
                    return True
            return False
        finally:
            self.close()


# Backward-compatible alias for the original lower-case class name.
crackGeetest = CrackGeetest


if __name__ == '__main__':
    crack = CrackGeetest()
    crack.crack()

哔哩哔哩使用的是极验(Geetest)滑动验证码,验证码原图被切分成许多小碎片后再拼合显示,因此需要先执行 JavaScript 让完整背景图的 canvas 显示出来,再截图对比。做法如下:

#========= 在当前窗口执行JavaScript语句(由于验证码原图被切分成搞多块)=========# # 组合验证码方法一: element = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'canvas.geetest_canvas_fullbg.geetest_fade.geetest_absolute'))) self.browser.execute_script("arguments[0].style=arguments[1]",element,"display: block;") # 组合验证码方法二:(本人建议使用此方法) # self.browser.execute_script('document.querySelectorAll("canvas")[2].style=""') # 获取缺块儿验证码 # self.browser.execute_script('document.querySelectorAll("canvas")[3].style=""') # 获取完整的验证码

最后就是运行了,运行结果就不展示了。测试了一下,成功率还是特别高的。

,

免责声明:本文仅代表文章作者的个人观点,与本站无关。其原创性、真实性以及文中陈述文字和内容未经本站证实,对本文以及其中全部或者部分内容文字的真实性、完整性和原创性本站不作任何保证或承诺,请读者仅作参考,并自行核实相关内容。文章投诉邮箱:anhduc.ph@yahoo.com

    分享
    投诉
    首页