Simulating Zhihu Login with Firefox


The spider below overrides Scrapy's start_requests entry point: it drives a real Firefox window through Selenium to log in to Zhihu, solves whichever captcha appears (English text or upside-down Chinese characters) via a DataConvert helper, persists the session cookies with pickle, and finally hands the logged-in session back to Scrapy.

Code example:

# -*- coding: utf-8 -*-
from ArticleSpider.utils.common import DataConvert
import scrapy
import pickle
from mouse import move, click
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
# Join relative urls with the domain; import shim compatible with both py2 and py3
try:
    import urlparse as parse
except ImportError:
    from urllib import parse

import time
import base64


class ZhihuFormalFirefoxSpider(scrapy.Spider):
    name = 'zhihu_formal_firefox'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    def parse(self, response):
        """
        提取HTML中所有的URL,并跟踪这些URL进一步爬取
        如果提取的URL中包含 question/xxx,就下载之后交给解析方法
        """
        # Collect the href attribute of every <a> tag
        all_urls = response.css('a::attr(href)').extract()
        # Join each url with the domain
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
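        # e.g. parse.urljoin('https://www.zhihu.com/', '/question/123')
        #      -> 'https://www.zhihu.com/question/123'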
        for url in all_urls:
            # The original post leaves this loop as a stub; per the docstring,
            # question pages would be requested here and handed back to parse
            if '/question/' in url:
                yield scrapy.Request(url)

    def start_requests(self):
        """
        在继承 Spider 的时候,入口方法是 start_requests
        现在要爬取知乎,而且必须进行登录
        第一步就是完成登录,所以必须要重写 start_request 方法
        调用 firefox_driver,模拟登录知乎
        """
        convert = DataConvert()

        profile = webdriver.FirefoxProfile(
            r'C:\Users\WIN10\AppData\Roaming\Mozilla\Firefox\Profiles\1o72doa9.default-release'
            # 'C:/Users/A17/AppData/Roaming/Mozilla/Firefox/Profiles/x3sshdnj.default-release'
        )
        # Report navigator.webdriver as false so the site cannot detect the driver
        profile.set_preference("dom.webdriver.enabled", False)
        # Disable the automation extension
        profile.set_preference('useAutomationExtension', False)
        profile.update_preferences()
        desired = DesiredCapabilities.FIREFOX
        browser = webdriver.Firefox(executable_path='E:/Template/geckodriver.exe', firefox_profile=profile,
                                    desired_capabilities=desired, service_log_path='Log/geckodriver.log')
        try:
            # Maximize the browser window
            browser.maximize_window()
        # Calling maximize on an already-maximized window raises an exception,
        # which can safely be ignored
        except Exception:
            pass
        browser.get('https://www.zhihu.com/signin?next=%2F')
        browser.find_element_by_css_selector('div[class="SignFlow-tabs"] div:nth-child(2)').click()
        browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys(Keys.CONTROL + 'a')
        browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys('17688718015')
        browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys(Keys.CONTROL + 'a')
        browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys('tanling.')
        browser.find_element_by_css_selector('form[class="SignFlow Login-content"] button[type="submit"]').click()
        time.sleep(10)

        # The login attempt may fail; keep retrying until it succeeds
        login_success = False
        while not login_success:

            # If the page contains the notifications (bell) icon element,
            # the login has succeeded
            try:
                browser.find_element_by_css_selector('.Popover')
                login_success = True
            except NoSuchElementException:
                pass

            # Check whether an English captcha is present
            try:
                english_captcha_element = browser.find_element_by_css_selector('.Captcha-englishImg')
            except NoSuchElementException:
                english_captcha_element = None

            # Check whether a Chinese captcha is present
            try:
                chinese_captcha_element = browser.find_element_by_css_selector('.Captcha-chineseImg')
            except NoSuchElementException:
                chinese_captcha_element = None

            # Chinese captcha: recognize and click the upside-down characters
            if chinese_captcha_element:
                # Selenium's location gives the element's position within the
                # rendered page (the browser chrome above it is not included)
                ele_position = chinese_captcha_element.location
                x_relative = ele_position['x']
                y_relative = ele_position['y']
                # window.outerHeight - window.innerHeight would yield the height
                # of the browser chrome (including any download bar):
                # browser_navigation_panel_height = browser.execute_script(
                #     'return window.outerHeight - window.innerHeight;'
                # )
                # Safer approach: use a fixed offset for the chrome height
                browser_navigation_panel_height = 100

                # Read the captcha image from the element's src attribute (a base64 data URI)
                base64_text = chinese_captcha_element.get_attribute('src')
                positions = convert.check_chinese_captcha(base64_text)
                # The image is rendered at half its natural size in the page,
                # so halve the recognizer's coordinates
                positions = [[int(key / 2) for key in position] for position in positions]
                # Swap the x/y axes (the recognizer reports row/col), then add
                # the element offset on the page plus the browser chrome height
                positions = [[position[1] + x_relative, position[0] + browser_navigation_panel_height + y_relative]
                             for position in positions]
                # Move the real mouse cursor onto each upside-down character and click
                for position in positions:
                    move(position[0], position[1])
                    click()
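                # Worked example with hypothetical numbers: a glyph reported at
                # (row=150, col=60) in the full-size image becomes (75, 30) after
                # halving; with x_relative=400, y_relative=300 and the 100 px
                # chrome offset, the click lands at screen position (430, 475).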

            if english_captcha_element:
                # Read the captcha image from the element's src attribute
                base64_text = english_captcha_element.get_attribute('src')
                result = convert.check_english_captcha(base64_text)
                browser.find_element_by_css_selector('.Input-wrapper input[name="captcha"]').send_keys(Keys.CONTROL + 'a')
                browser.find_element_by_css_selector('.Input-wrapper input[name="captcha"]').send_keys(result['pic_str'])

            # Re-enter the credentials, since a failed attempt clears the form
            if not login_success:
                browser.find_element_by_css_selector('div[class="SignFlow-tabs"] div:nth-child(2)').click()
                browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys(
                    Keys.CONTROL + 'a')
                browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys(
                    '17688718015')
                browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys(
                    Keys.CONTROL + 'a')
                browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys('tanling.')
                browser.find_element_by_css_selector(
                    'form[class="SignFlow Login-content"] button[type="submit"]').click()
            # Persist the cookies so later runs can reuse the session
            cookies = browser.get_cookies()
            with open(r'E:\WellHome\ArticleSpider\cookie\zhihu.cookie', 'wb') as f:
                pickle.dump(cookies, f)
            cookies_dict = DataConvert.merge_dicts([{cookie['name']: cookie['value']} for cookie in cookies])
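            # (merge_dicts flattens [{'a': '1'}, {'b': '2'}] into {'a': '1', 'b': '2'})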

        # Scrapy's duplicate filter is enabled by default; dont_filter=True bypasses it.
        # No callback is given, so the response is routed to the default parse method.
        return [scrapy.Request(url=ZhihuFormalFirefoxSpider.start_urls[0], dont_filter=True, cookies=cookies_dict)]
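The DataConvert helper imported from ArticleSpider.utils.common is not shown in this post. Below is a minimal sketch of the interface the spider relies on, inferred purely from how the methods are called above; the actual recognition (for example a zheye-style model for the upside-down characters, or a paid coding platform for the English captcha) is left as a stub:

# -*- coding: utf-8 -*-
class DataConvert(object):
    """Sketch of the helper interface assumed by the spider above."""

    def check_chinese_captcha(self, base64_text):
        """Given the captcha <img> src (a base64 data URI), return a list of
        [row, col] pixel positions of the upside-down characters in the
        full-size image, e.g. [[150, 60], [148, 230]]. The recognition
        itself is not shown here."""
        raise NotImplementedError

    def check_english_captcha(self, base64_text):
        """Return a dict whose 'pic_str' key holds the recognized text,
        mirroring the response format of coding platforms such as chaojiying."""
        raise NotImplementedError

    @staticmethod
    def merge_dicts(dict_list):
        """Flatten a list of single-entry dicts into one dict."""
        merged = {}
        for item in dict_list:
            merged.update(item)
        return merged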
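On a later run the pickled cookies can be loaded back and attached to the request directly, skipping the browser login entirely. A small sketch of that inverse step (the loader function name is illustrative):

import pickle

def load_zhihu_cookies(path=r'E:\WellHome\ArticleSpider\cookie\zhihu.cookie'):
    # Inverse of the pickle.dump above: read the saved cookie list and
    # flatten it into the {name: value} mapping scrapy.Request expects
    with open(path, 'rb') as f:
        cookies = pickle.load(f)
    return {cookie['name']: cookie['value'] for cookie in cookies}

# usage inside start_requests:
# return [scrapy.Request(url=self.start_urls[0], dont_filter=True,
#                        cookies=load_zhihu_cookies())]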
