# Create the chromedriver instance (the spider below drives Chrome to log in).
class ZhihuReqSpider(scrapy.Spider):
    """Scrapy spider that logs in to Zhihu through Chrome before crawling.

    ``start_requests`` drives Chrome via chromedriver to submit the sign-in
    form, handles the English or Chinese captcha if one is shown, saves the
    browser cookies to disk and passes them to the first ``scrapy.Request``.
    """

    name = 'zhihu_req'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    # SECURITY NOTE(review): the account credentials are hard-coded in source.
    # They are kept here (values unchanged) so behaviour is preserved, but they
    # should be moved to project settings or environment variables.
    login_account = '17688718015'
    login_password = 'tanling.'

    # Every captcha image is written to (and read back from) this file.
    captcha_image_path = 'yzm_zh.jpeg'

    def parse(self, response):
        """Default response callback; intentionally left empty."""
        pass

    @staticmethod
    def _find_or_none(browser, css_selector):
        """Return the first element matching ``css_selector``, or ``None`` if absent."""
        from selenium.common.exceptions import NoSuchElementException

        try:
            return browser.find_element_by_css_selector(css_selector)
        except NoSuchElementException:
            return None

    @staticmethod
    def _overwrite_input(browser, css_selector, text):
        """Select the current content of an input field and type ``text`` over it."""
        from selenium.webdriver.common.keys import Keys

        # Ctrl+A selects whatever is already in the field so the next
        # keystrokes replace it instead of being appended.
        browser.find_element_by_css_selector(css_selector).send_keys(Keys.CONTROL + 'a')
        # The element is looked up again before typing, as the original code did.
        browser.find_element_by_css_selector(css_selector).send_keys(text)

    def _submit_login_form(self, browser):
        """Fill in account and password on the sign-in page and submit the form."""
        # Second tab of the sign-in widget -- presumably the password-login
        # tab; verify against the current page markup.
        browser.find_element_by_css_selector('div[class="SignFlow-tabs"] div:nth-child(2)').click()
        self._overwrite_input(browser, '.SignFlow-account input[name="username"]', self.login_account)
        self._overwrite_input(browser, '.SignFlow-password input[name="password"]', self.login_password)
        browser.find_element_by_css_selector('form[class="SignFlow Login-content"] button[type="submit"]').click()

    def _save_captcha_image(self, captcha_element):
        """Decode the captcha's base64 ``src`` attribute and write it to disk."""
        base64_text = captcha_element.get_attribute('src')
        # Strip the data-URI prefix and the URL-encoded newlines ('%0A').
        code = base64_text.replace('data:image/jpg;base64,', '').replace('%0A', '')
        with open(self.captcha_image_path, 'wb') as fp:
            fp.write(base64.b64decode(code))

    def _solve_chinese_captcha(self, captcha_element):
        """Recognise the Chinese captcha with zheye and click each returned position."""
        from zheye.zheye import zheye

        # Position of the captcha image inside the rendered page, used to turn
        # image-relative coordinates into click coordinates.
        # NOTE: the original author's comment says this requires the browser
        # zoom level to be 100%.
        element_position = captcha_element.location
        x_relative = element_position['x']
        y_relative = element_position['y']

        # Vertical offset added to every click. The original code first
        # measured `window.outerHeight - window.innerHeight` via JavaScript and
        # then immediately overwrote the result with this constant, so only the
        # constant ever took effect.
        browser_navigation_panel_height = 70

        self._save_captcha_image(captcha_element)
        recognizer = zheye()
        for position in recognizer.Recognize(self.captcha_image_path):
            # Each coordinate is halved -- presumably because the image is
            # displayed at half its saved size; TODO confirm.
            halved = [int(key / 2) for key in position]
            # Index 1 is used as the horizontal offset and index 0 as the
            # vertical offset.
            # `move` / `click` are not defined in this class; presumably
            # imported at file level.
            move(halved[1] + x_relative,
                 halved[0] + browser_navigation_panel_height + y_relative)
            click()

    def _solve_english_captcha(self, browser, captcha_element):
        """Send the English captcha to the Chaojiying service and type the answer."""
        from Tools.chaojiying import ChaojiyingClient

        self._save_captcha_image(captcha_element)
        # SECURITY NOTE(review): service credentials are hard-coded; move them
        # to configuration.
        identify = ChaojiyingClient('willsmith', 'tanling.', '905454')
        with open(self.captcha_image_path, 'rb') as fp:
            img = fp.read()

        # Keep asking until the service returns a non-empty result.
        # 1902 is passed through unchanged as the second argument of post_pic
        # (presumably the captcha type code -- verify against the client).
        result = ''
        while not result:
            result = identify.post_pic(img, 1902)
        self._overwrite_input(browser, '.Input-wrapper input[name="captcha"]', result['pic_str'])

    def start_requests(self):
        """Log in to Zhihu with Chrome and return the first authenticated request.

        ``start_requests`` is the entry point of a ``Spider`` subclass. Crawling
        Zhihu requires a logged-in session, so this override performs the login
        through chromedriver first and only then issues the initial request.

        Returns:
            A one-element list containing a ``scrapy.Request`` that carries the
            cookies collected from the browser session.
        """
        from selenium import webdriver
        from selenium.common.exceptions import WebDriverException
        from selenium.webdriver.chrome.options import Options

        # Create the chromedriver instance, pointed at the Chrome debugger
        # address 127.0.0.1:9222.
        chrome_option = Options()
        chrome_option.add_argument('--disable-extensions')
        chrome_option.add_experimental_option('debuggerAddress', '127.0.0.1:9222')
        browser = webdriver.Chrome(executable_path='E:/Template/chromedriver.exe',
                                   chrome_options=chrome_option)

        # Drive the HTML elements of the sign-in page.
        try:
            browser.maximize_window()
        except WebDriverException:
            # Maximizing is best-effort; the login can proceed without it.
            pass
        browser.get('https://www.zhihu.com/signin?next=%2F')
        self._submit_login_form(browser)
        time.sleep(10)

        # Check whether the login succeeded: the '.Popover' element is treated
        # as the marker of a logged-in page. Repeat until it shows up.
        # NOTE(review): there is no wait between iterations (the original had
        # none either), so this polls the page as fast as the driver responds.
        while self._find_or_none(browser, '.Popover') is None:
            # Determine which kind of captcha, if any, is being shown.
            english_captcha_element = self._find_or_none(browser, '.Captcha-englishImg')
            chinese_captcha_element = self._find_or_none(browser, '.Captcha-chineseImg')

            if chinese_captcha_element is not None:
                self._solve_chinese_captcha(chinese_captcha_element)
            if english_captcha_element is not None:
                self._solve_english_captcha(browser, english_captcha_element)

            # Precaution: re-enter account and password in case the form was
            # cleared, then submit again.
            self._submit_login_form(browser)

        # Save the cookies and pass them to the Request so that later parsing
        # runs in the logged-in session.
        cookies = browser.get_cookies()
        with open(r'E:\WellHome\ArticleSpider\cookie\zhihu.cookie', 'wb') as cookie_file:
            pickle.dump(cookies, cookie_file)
        cookies_dict = DataConvert.merge_dicts([{cookie['name']: cookie['value']} for cookie in cookies])
        # NOTE(review): the URL is taken from ZhihuFormalFirefoxSpider rather
        # than from this spider's own start_urls -- confirm this is intended.
        return [scrapy.Request(url=ZhihuFormalFirefoxSpider.start_urls[0], dont_filter=True, cookies=cookies_dict)]