mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-22 20:44:29 +00:00
parent
a38a99e0e2
commit
3787fec460
@ -2,7 +2,7 @@ Twisted>=10.0.0
|
||||
lxml
|
||||
pyOpenSSL
|
||||
cssselect>=0.9
|
||||
w3lib>=1.8.0
|
||||
w3lib>=1.13.0
|
||||
queuelib
|
||||
six>=1.5.2
|
||||
PyDispatcher>=2.0.5
|
||||
|
@ -8,7 +8,6 @@ from w3lib import html
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.response import _noscript_re, _script_re
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -89,8 +88,7 @@ def _has_ajaxcrawlable_meta(text):
|
||||
if 'content' not in text:
|
||||
return False
|
||||
|
||||
text = _script_re.sub(u'', text)
|
||||
text = _noscript_re.sub(u'', text)
|
||||
text = html.remove_comments(html.replace_entities(text))
|
||||
text = html.remove_tags_with_content(text, ('script', 'noscript'))
|
||||
text = html.replace_entities(text)
|
||||
text = html.remove_comments(text)
|
||||
return _ajax_crawlable_re.search(text) is not None
|
||||
|
||||
|
@ -3,7 +3,6 @@ This module provides some useful functions for working with
|
||||
scrapy.http.Response objects
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import weakref
|
||||
import webbrowser
|
||||
import tempfile
|
||||
@ -31,17 +30,13 @@ def get_base_url(response):
|
||||
return _baseurl_cache[response]
|
||||
|
||||
|
||||
_noscript_re = re.compile(u'<noscript>.*?</noscript>', re.IGNORECASE | re.DOTALL)
|
||||
_script_re = re.compile(u'<script.*?>.*?</script>', re.IGNORECASE | re.DOTALL)
|
||||
_metaref_cache = weakref.WeakKeyDictionary()
|
||||
def get_meta_refresh(response):
    """Parse the http-equiv refresh parameter from the given response.

    Only the first 4096 characters of ``response.text`` are inspected.
    The lookup is delegated to ``html.get_meta_refresh``, which is told to
    ignore ``<script>`` and ``<noscript>`` content through ``ignore_tags``
    rather than having those tags stripped with regular expressions first.

    The result is memoised per response object in ``_metaref_cache`` (a
    ``weakref.WeakKeyDictionary``), so repeated calls for the same response
    do not re-parse the body.

    :param response: response exposing ``text``, ``url`` and ``encoding``.
    :returns: whatever ``html.get_meta_refresh`` returns for this response
        (NOTE(review): presumably w3lib's ``(interval, url)`` tuple --
        confirm against the w3lib version pinned in requirements).
    """
    if response not in _metaref_cache:
        # A meta refresh tag is only looked for near the top of the document.
        text = response.text[0:4096]
        _metaref_cache[response] = html.get_meta_refresh(
            text, response.url, response.encoding,
            ignore_tags=('script', 'noscript'))
    return _metaref_cache[response]
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user