Simulating a Zhihu Login with Chrome


Creating a chromedriver instance

class ZhihuReqSpider(scrapy.Spider):
    name = 'zhihu_req'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    def parse(self, response):
        pass

    def start_requests(self):
        """
        When subclassing Spider, the entry point is start_requests.
        Crawling Zhihu requires being logged in, so the first step is the
        login itself: override start_requests and drive chromedriver to
        log in to Zhihu.
        """
        # time, base64 and pickle are used by the later steps of this method
        import time
        import base64
        import pickle
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.common.keys import Keys
        chrome_option = Options()
        # --disable-extensions: launch Chrome without extensions
        chrome_option.add_argument('--disable-extensions')
        # debuggerAddress: attach to a Chrome instance that is already running
        # with remote debugging enabled on 127.0.0.1:9222
        chrome_option.add_experimental_option('debuggerAddress', '127.0.0.1:9222')
        # Path to chromedriver.exe; create the browser instance
        browser = webdriver.Chrome(executable_path='E:/Template/chromedriver.exe',
                                   chrome_options=chrome_option)
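
The debuggerAddress option only works when a Chrome instance is already listening on that port. Below is a minimal sketch of starting such an instance before running the spider; the chrome.exe path and the profile directory are placeholders, not paths from the original project.

import subprocess

# Hypothetical paths: point these at your own Chrome install and a
# dedicated user-data directory
CHROME_EXE = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
PROFILE_DIR = r'E:\Template\chrome-debug-profile'

# Start Chrome with remote debugging enabled on port 9222 so that the
# spider's chrome_option (debuggerAddress = 127.0.0.1:9222) can attach to it
subprocess.Popen([
    CHROME_EXE,
    '--remote-debugging-port=9222',
    '--user-data-dir=' + PROFILE_DIR,
])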

Manipulating the HTML elements

try:
    # Maximize the browser window
    browser.maximize_window()
except Exception:
    # If the window is already maximized, calling maximize again may raise
    pass
browser.get('https://www.zhihu.com/signin?next=%2F')
# Switch to the password-login tab, then fill in the account and password
browser.find_element_by_css_selector('div[class="SignFlow-tabs"] div:nth-child(2)').click()
browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys(Keys.CONTROL + 'a')
browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys('17688718015')
browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys(Keys.CONTROL + 'a')
browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys('tanling.')
browser.find_element_by_css_selector('form[class="SignFlow Login-content"] button[type="submit"]').click()
time.sleep(10)

Checking whether the login succeeded

login_success = False
while not login_success:
    try:
        # If the page contains the notification ("bell") icon element,
        # the login has succeeded
        popover = browser.find_element_by_css_selector('.Popover')
        login_success = True
    except Exception:
        # Not logged in yet; in the full spider the captcha-handling steps
        # below presumably run inside this loop
        pass
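
When no captcha handling has to run inside that polling loop, the same check can be expressed with Selenium's explicit waits instead of busy-waiting. A minimal sketch follows; the 120-second timeout is an arbitrary choice, not a value from the original code.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

try:
    # Wait up to 120 seconds for the notification icon to appear
    WebDriverWait(browser, 120).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.Popover'))
    )
    login_success = True
except Exception:
    login_success = False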

Determining the captcha type

try:
    # Check whether an English captcha is present
    english_captcha_element = browser.find_element_by_css_selector('.Captcha-englishImg')
except Exception:
    english_captcha_element = None

try:
    # Check whether a Chinese (inverted-character) captcha is present
    chinese_captcha_element = browser.find_element_by_css_selector('.Captcha-chineseImg')
except Exception:
    chinese_captcha_element = None

Getting the Chinese captcha image's position in the viewport, for clicking (note: the browser zoom must be 100%)

if chinese_captcha_element:
    # selenium's location gives the element's coordinates within the rendered
    # page (relative to the viewport, excluding the browser chrome/address bar)
    ele_position = chinese_captcha_element.location
    x_relative = ele_position['x']
    y_relative = ele_position['y']

Saving the captcha image and recognizing it with a third-party library

# window.outerHeight - window.innerHeight gives the height of the browser
# chrome above the page (including any download bar)
browser_navigation_panel_height = browser.execute_script(
    'return window.outerHeight - window.innerHeight;'
)
# Safer approach: just hard-code 70 pixels
browser_navigation_panel_height = 70

# Read the captcha element's src attribute (a base64 data URL)
base64_text = chinese_captcha_element.get_attribute('src')
# Strip the data-URL prefix; also strip any embedded "%0A", which is not
# valid base64 (later checks suggest it usually isn't there anyway)
code = base64_text.replace('data:image/jpg;base64,', '').replace('%0A', '')
# Open the file in binary mode and write the decoded base64 data
with open('yzm_zh.jpeg', 'wb') as fp:
    fp.write(base64.b64decode(code))

from zheye.zheye import zheye
z = zheye()

# zheye returns the (row, col) positions of the inverted characters
positions = z.Recognize('yzm_zh.jpeg')
# Halve the coordinates to match the size the image is rendered at in the HTML
positions = [[int(key / 2) for key in position] for position in positions]
# Swap the x/y axes, then add the element's page offset and the navigation bar height
positions = [[position[1] + x_relative, position[0] + browser_navigation_panel_height + y_relative]
             for position in positions]
# Click each inverted character; move() and click() are assumed to come from
# the third-party "mouse" package (from mouse import move, click)
for position in positions:
    move(position[0], position[1])
    click()
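
As a concrete check of the coordinate arithmetic above, suppose zheye reports an inverted character at (row 120, column 300) in the full-size image, the captcha element sits at x_relative = 680 and y_relative = 350, and the panel height is 70; all of these numbers are made up for the example.

# Hypothetical values, only to illustrate the transformation above
position = (120, 300)                                     # (row, col) from zheye
x_relative, y_relative = 680, 350
browser_navigation_panel_height = 70

row, col = int(position[0] / 2), int(position[1] / 2)     # 60, 150: halve to on-screen size
screen_x = col + x_relative                               # 150 + 680 = 830
screen_y = row + browser_navigation_panel_height + y_relative   # 60 + 70 + 350 = 480
print(screen_x, screen_y)                                 # 830 480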

Getting and saving the English captcha image

if english_captcha_element:
    # Read the captcha element's src attribute (a base64 data URL)
    base64_text = english_captcha_element.get_attribute('src')
    # Strip the data-URL prefix and any embedded "%0A", which is not valid
    # base64 (later checks suggest it usually isn't there anyway)
    code = base64_text.replace('data:image/jpg;base64,', '').replace('%0A', '')
    # Write the decoded base64 data to a file
    with open('yzm_zh.jpeg', 'wb') as fp:
        fp.write(base64.b64decode(code))

Calling a third-party API to recognize the captcha image and entering the result

from Tools.chaojiying import ChaojiyingClient
# Chaojiying account, password and software ID
identify = ChaojiyingClient('willsmith', 'tanling.', '905454')
# Read the captcha image as bytes
img = open('yzm_zh.jpeg', 'rb').read()
# Retry until the recognition service returns a result
result = ''
while not result:
    result = identify.post_pic(img, 1902)
browser.find_element_by_css_selector('.Input-wrapper input[name="captcha"]').send_keys(Keys.CONTROL + 'a')
browser.find_element_by_css_selector('.Input-wrapper input[name="captcha"]').send_keys(result['pic_str'])

Guarding against the account and password being cleared

# If login has not succeeded yet, the form may have been cleared:
# re-enter the account and password and submit again
if not login_success:
    browser.find_element_by_css_selector('div[class="SignFlow-tabs"] div:nth-child(2)').click()
    browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys(
        Keys.CONTROL + 'a')
    browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys(
        '17688718015')
    browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys(
        Keys.CONTROL + 'a')
    browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys('tanling.')
    browser.find_element_by_css_selector(
        'form[class="SignFlow Login-content"] button[type="submit"]').click()

Saving the cookies and passing them to a Request for subsequent parsing

# Save the cookies
cookies = browser.get_cookies()
pickle.dump(cookies, open(r'E:\WellHome\ArticleSpider\cookie\zhihu.cookie', 'wb'))
# DataConvert.merge_dicts is a project helper that merges the per-cookie
# dicts into a single dict
cookies_dict = DataConvert.merge_dicts([{cookie['name']: cookie['value']} for cookie in cookies])

# Scrapy has built-in duplicate filtering, enabled by default;
# dont_filter=True bypasses it for this request
return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookies_dict)]
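
On subsequent runs the pickled cookies can be loaded back so the Selenium login does not have to be repeated. Below is a minimal sketch of such a start_requests, assuming the same cookie file path; the dict comprehension plays the role of DataConvert.merge_dicts here.

import pickle
import scrapy

def start_requests(self):
    # Reuse the cookies saved by an earlier Selenium login instead of
    # opening the browser again
    with open(r'E:\WellHome\ArticleSpider\cookie\zhihu.cookie', 'rb') as fp:
        cookies = pickle.load(fp)
    cookies_dict = {cookie['name']: cookie['value'] for cookie in cookies}
    return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookies_dict)]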
