1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-25 14:03:41 +00:00

LinkExtractor: don’t check all regexes if one of them matches

This commit is contained in:
Mikhail Korobov 2017-02-07 02:16:06 +05:00
parent 3b8e6d4d82
commit f73eb715ac

View File

@@ -40,7 +40,7 @@ IGNORED_EXTENSIONS = [
_re_type = type(re.compile("", 0))
-_matches = lambda url, regexs: any((r.search(url) for r in regexs))
+_matches = lambda url, regexs: any(r.search(url) for r in regexs)
_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file'}
@@ -93,8 +93,8 @@ class FilteringLinkExtractor(object):
if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
return False
-allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
-denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
+allowed = (regex.search(url) for regex in self.allow_res) if self.allow_res else [True]
+denied = (regex.search(url) for regex in self.deny_res) if self.deny_res else []
return any(allowed) and not any(denied)
def _process_links(self, links):