Code example:
# -*- coding: utf-8 -*-
from ArticleSpider.utils.common import DataConvert
import scrapy
import pickle
from mouse import move, click
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
# Join the domain with relative URLs; compatible with both Py2 and Py3
try:
    import urlparse as parse
except ImportError:
    from urllib import parse
import time
import base64

class ZhihuFormalFirefoxSpider(scrapy.Spider):
    name = 'zhihu_formal_firefox'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    def parse(self, response):
        """
        Extract every URL from the HTML and follow each one for further
        crawling. If an extracted URL contains question/xxx, download it
        and hand it to the parsing method (see the commented sketch below).
        """
        # Extract the href attribute of every <a> tag
        all_urls = response.css('a::attr(href)').extract()
        # Join each URL with the domain
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        for url in all_urls:
            pass
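            # Sketch (not in the original listing) of the follow logic the
            # docstring describes, using a hypothetical parse_question
            # callback and the re module:
            #     match = re.match(r'.*zhihu\.com/question/(\d+)', url)
            #     if match:
            #         yield scrapy.Request(url, callback=self.parse_question)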
    def start_requests(self):
        """
        When a spider inherits from Spider, the entry point is the
        start_requests method. Crawling Zhihu requires being logged in,
        so the first step is the login itself; that is why start_requests
        is overridden here: it drives the Firefox webdriver to log in
        to Zhihu.
        """
        convert = DataConvert()
        profile = webdriver.FirefoxProfile(
            r'C:\Users\WIN10\AppData\Roaming\Mozilla\Firefox\Profiles\1o72doa9.default-release'
            # 'C:/Users/A17/AppData/Roaming/Mozilla/Firefox/Profiles/x3sshdnj.default-release'
        )
        # Disable the flag that marks the browser as webdriver-driven
        profile.set_preference("dom.webdriver.enabled", False)
        # Disable the automation extension
        profile.set_preference('useAutomationExtension', False)
        profile.update_preferences()
        desired = DesiredCapabilities.FIREFOX
        browser = webdriver.Firefox(executable_path='E:/Template/geckodriver.exe', firefox_profile=profile,
                                    desired_capabilities=desired, service_log_path='Log/geckodriver.log')
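        # Rationale (an assumption; Zhihu's exact checks are not public):
        # with dom.webdriver.enabled off, navigator.webdriver no longer
        # reports true, so front-end bot detection is less likely to flag
        # the scripted browser.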
        try:
            # Maximize the browser window; maximizing an already-maximized
            # window raises an exception, which we ignore
            browser.maximize_window()
        except Exception:
            pass
        browser.get('https://www.zhihu.com/signin?next=%2F')
        # Switch to the password-login tab and fill in the credentials
        browser.find_element_by_css_selector('div[class="SignFlow-tabs"] div:nth-child(2)').click()
        browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys(Keys.CONTROL + 'a')
        browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys('17688718015')
        browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys(Keys.CONTROL + 'a')
        browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys('tanling.')
        browser.find_element_by_css_selector('form[class="SignFlow Login-content"] button[type="submit"]').click()
        time.sleep(10)
        # The login may fail, so keep checking and retrying until it succeeds
        login_success = False
        while not login_success:
            # If the page contains the notification (Popover) icon element,
            # we are logged in
            try:
                browser.find_element_by_css_selector('.Popover')
                login_success = True
            except NoSuchElementException:
                pass
            # Check whether an English captcha is present
            try:
                english_captcha_element = browser.find_element_by_css_selector('.Captcha-englishImg')
            except NoSuchElementException:
                english_captcha_element = None
            # Check whether a Chinese captcha is present
            try:
                chinese_captcha_element = browser.find_element_by_css_selector('.Captcha-chineseImg')
            except NoSuchElementException:
                chinese_captcha_element = None
            # Chinese captcha: click the upside-down characters
            if chinese_captcha_element:
                # Selenium's location gives the element's position within the
                # rendered page (excluding the browser chrome above it)
                ele_position = chinese_captcha_element.location
                x_relative = ele_position['x']
                y_relative = ele_position['y']
                # window.outerHeight - window.innerHeight would give the height
                # of the browser chrome (including any download bar):
                # browser_navigation_panel_height = browser.execute_script(
                #     'return window.outerHeight - window.innerHeight;'
                # )
                # Safer approach here: hard-code the offset
                browser_navigation_panel_height = 100
                # Read the captcha image from the element's src attribute
                base64_text = chinese_captcha_element.get_attribute('src')
                positions = convert.check_chinese_captcha(base64_text)
                # The image is served at 2x, so halve the coordinates to match
                # the size rendered in the HTML
                positions = [[int(key / 2) for key in position] for position in positions]
                # Swap the x/y axes, then add the element's page position and
                # the chrome height to get screen coordinates
                positions = [[position[1] + x_relative, position[0] + browser_navigation_panel_height + y_relative]
                             for position in positions]
                # Click each upside-down character
                for position in positions:
                    move(position[0], position[1])
                    click()
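                # Worked example (illustrative numbers, not from the source):
                # a character reported at (row=60, col=300) in the 2x image
                # becomes (30, 150) after halving and is clicked at screen
                # point (150 + x_relative, 30 + 100 + y_relative).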
            # English captcha: recognize it and type in the answer
            if english_captcha_element:
                # Read the captcha image from the element's src attribute
                base64_text = english_captcha_element.get_attribute('src')
                result = convert.check_english_captcha(base64_text)
                browser.find_element_by_css_selector('.Input-wrapper input[name="captcha"]').send_keys(Keys.CONTROL + 'a')
                browser.find_element_by_css_selector('.Input-wrapper input[name="captcha"]').send_keys(result['pic_str'])
            # Re-enter the credentials in case the form was cleared, then
            # submit again
            if not login_success:
                browser.find_element_by_css_selector('div[class="SignFlow-tabs"] div:nth-child(2)').click()
                browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys(
                    Keys.CONTROL + 'a')
                browser.find_element_by_css_selector('.SignFlow-account input[name="username"]').send_keys(
                    '17688718015')
                browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys(
                    Keys.CONTROL + 'a')
                browser.find_element_by_css_selector('.SignFlow-password input[name="password"]').send_keys('tanling.')
                browser.find_element_by_css_selector(
                    'form[class="SignFlow Login-content"] button[type="submit"]').click()
                # Pause briefly so the page can react before the next check
                time.sleep(3)
        # Save the cookies for later requests
        cookies = browser.get_cookies()
        with open(r'E:\WellHome\ArticleSpider\cookie\zhihu.cookie', 'wb') as cookie_file:
            pickle.dump(cookies, cookie_file)
        cookies_dict = DataConvert.merge_dicts([{cookie['name']: cookie['value']} for cookie in cookies])
        # Scrapy filters duplicate requests by default; dont_filter=True
        # bypasses that filter. No callback is given, so the response is
        # handled by the default parse method
        return [scrapy.Request(url=ZhihuFormalFirefoxSpider.start_urls[0], dont_filter=True, cookies=cookies_dict)]
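
The listing imports DataConvert from ArticleSpider.utils.common but never shows it. Below is a minimal sketch of the interface the spider relies on; the recognition methods are stubs, not the project's actual code. The result['pic_str'] access above matches the response format of the Chaojiying (超级鹰) captcha-recognition service, but the real backend is not shown in the source.

# A minimal sketch of the DataConvert interface assumed by the spider;
# the recognition methods are placeholders, not the original code.
import base64


class DataConvert(object):
    @staticmethod
    def merge_dicts(dict_list):
        # Merge a list of single-entry dicts into one cookie dict
        merged = {}
        for item in dict_list:
            merged.update(item)
        return merged

    @staticmethod
    def decode_image(base64_text):
        # The src attribute looks like 'data:image/jpg;base64,...';
        # strip the prefix and decode the raw image bytes
        return base64.b64decode(base64_text.split(',', 1)[1])

    def check_english_captcha(self, base64_text):
        # Assumed to submit decode_image(base64_text) to a coding service
        # and return its response dict (containing 'pic_str')
        raise NotImplementedError

    def check_chinese_captcha(self, base64_text):
        # Assumed to return a list of [row, col] pixel positions (in the
        # 2x image) of the upside-down characters
        raise NotImplementedError

On later runs the pickled cookie file can be reloaded with pickle.load and passed to scrapy.Request in the same way, skipping the browser login entirely.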