This repository has been archived by the owner on Sep 28, 2022. It is now read-only.

Fixed unfiltered duplicates bug, removed dont_filter #16

Open: wants to merge 1 commit into master
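
Background on the bug: Scrapy's scheduler consults its duplicate filter only for requests that do not set dont_filter, so re-yielding queued requests with dont_filter=True (as the old code did) lets the same URL be scheduled more than once. A minimal sketch of that behaviour, using Scrapy's stock RFPDupeFilter (illustrative only, not code from this repository; module path as in recent Scrapy releases):

from scrapy import Request
from scrapy.dupefilters import RFPDupeFilter

dupefilter = RFPDupeFilter()
req = Request('http://example.com/page')

print(dupefilter.request_seen(req))   # False: fingerprint recorded on first sight
print(dupefilter.request_seen(req))   # True: a plain duplicate would be dropped

# A request built with dont_filter=True never reaches request_seen() in the
# scheduler, so the same URL is crawled again despite the matching fingerprint.
repeat = Request('http://example.com/page', dont_filter=True)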
15 changes: 12 additions & 3 deletions scrapy_webdriver/middlewares.py
@@ -1,5 +1,5 @@
-from scrapy.exceptions import IgnoreRequest, NotConfigured
-
+from scrapy.exceptions import IgnoreRequest, NotConfigured, DontCloseSpider
+from scrapy.signals import spider_idle
 from .http import WebdriverActionRequest, WebdriverRequest
 from .manager import WebdriverManager
 
@@ -8,6 +8,7 @@ class WebdriverSpiderMiddleware(object):
     """This middleware coordinates concurrent webdriver access attempts."""
     def __init__(self, crawler):
         self.manager = WebdriverManager(crawler)
+        crawler.signals.connect(self._next_request, signal=spider_idle)
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -44,7 +45,7 @@ def process_spider_output(self, response, result, spider):
             self.manager.release(response.request.url)
             next_request = self.manager.acquire_next()
             if next_request is not WebdriverRequest.WAITING:
-                yield next_request.replace(dont_filter=True)
+                yield next_request
 
     def _process_requests(self, items_or_requests, start=False):
         """Acquire the webdriver manager when it's available for requests."""
@@ -57,3 +58,11 @@ def _process_requests(self, items_or_requests, start=False):
                 if request is WebdriverRequest.WAITING:
                     continue  # Request has been enqueued, so drop it.
             yield request
+
+    def _next_request(self, spider):
+        '''If spider has no requests to process, try to queue request from manager\'s queue'''
+        self.manager.release('')
+        next_request = self.manager.acquire_next()
+        if next_request is not WebdriverRequest.WAITING:
+            spider.crawler.engine.crawl(next_request, spider)
+            raise DontCloseSpider()
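
For reference, the spider_idle / DontCloseSpider pattern that the new _next_request handler builds on can be sketched in isolation roughly as follows. KeepAliveExtension and its pending list are hypothetical stand-ins for WebdriverManager's internal queue, and engine.crawl(request, spider) is the older two-argument call used in this PR (recent Scrapy versions take only the request):

from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class KeepAliveExtension(object):
    """Keeps the spider alive while requests are still waiting in a local queue."""

    def __init__(self, crawler):
        self.crawler = crawler
        self.pending = []  # hypothetical stand-in for a manager-held request queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_idle(self, spider):
        # Called when the scheduler has nothing left; hand it one queued request.
        if self.pending:
            self.crawler.engine.crawl(self.pending.pop(0), spider)
            # Without this the engine would close the spider even though
            # requests remain in self.pending.
            raise DontCloseSpider()

Such an extension would be enabled through the EXTENSIONS setting, e.g. EXTENSIONS = {'myproject.extensions.KeepAliveExtension': 500} (module path hypothetical); the middleware in this PR instead wires the handler up directly in its own __init__.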