mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-22 01:53:14 +00:00
Added support to the `replay update` command for re-crawling all the pages recorded in the replay file (new `--pages` option).
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40169
This commit is contained in:
parent
9b0dd66ec1
commit
801b804a4d
@ -42,14 +42,14 @@ def custom_commands_dict():
|
||||
print 'WARNING: Custom command module %s exists but Command class not found' % modname
|
||||
return d
|
||||
|
||||
def getcmdname():
|
||||
for arg in sys.argv[1:]:
|
||||
def getcmdname(argv):
|
||||
for arg in argv[1:]:
|
||||
if not arg.startswith('-'):
|
||||
return arg
|
||||
|
||||
def usage():
|
||||
s = "usage: %s <subcommand> [options] [args]\n" % sys.argv[0]
|
||||
s += " %s <subcommand> -h\n\n" % sys.argv[0]
|
||||
def usage(argv):
|
||||
s = "usage: %s <subcommand> [options] [args]\n" % argv[0]
|
||||
s += " %s <subcommand> -h\n\n" % argv[0]
|
||||
s += "Built-in subcommands:\n"
|
||||
|
||||
builtin_cmds = builtin_commands_dict()
|
||||
@ -96,16 +96,19 @@ def command_settings(cmdname):
|
||||
command_executed = {}
|
||||
|
||||
def execute():
|
||||
execute_with_args(sys.argv)
|
||||
|
||||
def execute_with_args(argv):
|
||||
spiders.load()
|
||||
cmds = builtin_commands_dict()
|
||||
cmds.update(custom_commands_dict())
|
||||
|
||||
cmdname = getcmdname()
|
||||
cmdname = getcmdname(argv)
|
||||
command_settings(cmdname)
|
||||
|
||||
if not cmdname:
|
||||
print "Scrapy %s\n" % scrapy.__version__
|
||||
print usage()
|
||||
print usage(argv)
|
||||
sys.exit()
|
||||
|
||||
parser = optparse.OptionParser()
|
||||
@ -118,10 +121,10 @@ def execute():
|
||||
else:
|
||||
print "Scrapy %s\n" % scrapy.__version__
|
||||
print "Unknown command: %s\n" % cmdname
|
||||
print 'Type "%s -h" for help' % sys.argv[0]
|
||||
print 'Type "%s -h" for help' % argv[0]
|
||||
sys.exit()
|
||||
|
||||
(opts, args) = parser.parse_args()
|
||||
(opts, args) = parser.parse_args(args=argv[1:])
|
||||
del args[0] # args[0] is cmdname
|
||||
|
||||
# storing command executed info for later reference
|
||||
|
@ -4,6 +4,7 @@ from scrapy.command import ScrapyCommand
|
||||
from scrapy.replay import Replay
|
||||
from scrapy.utils import display
|
||||
from scrapy.conf import settings
|
||||
from scrapy.command import cmdline
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
def syntax(self):
|
||||
@ -31,6 +32,8 @@ class Command(ScrapyCommand):
|
||||
parser.add_option("--nocolour", dest="nocolour", action="store_true", help="disable colorized output (for console only)")
|
||||
parser.add_option("-i", "--ignore", dest="ignores", action="append", help="item attribute to ignore. can be passed multiple times", metavar="ATTR")
|
||||
parser.add_option("--target", dest="targets", action="append", help="crawl TARGET instead of recorded urls/domains. can be passed multiple times")
|
||||
# adding option to update
|
||||
parser.add_option("--pages", dest="pages", action="store_true", help="update all the pages in the replay file, recording it again.")
|
||||
|
||||
def process_options(self, args, opts):
|
||||
ScrapyCommand.process_options(self, args, opts)
|
||||
@ -75,7 +78,16 @@ class Command(ScrapyCommand):
|
||||
self.replay.play(args=opts.targets)
|
||||
|
||||
def action_update(self, opts):
|
||||
self.action_crawl(opts)
|
||||
self.replay.update(args=opts.targets, opts=opts.__dict__)
|
||||
if (opts.pages):
|
||||
args = ['decobot-ctl.py', 'crawl']
|
||||
args.extend(self.replay.options['args'])
|
||||
for k in self.replay.options['opts']:
|
||||
if self.replay.options['opts'][k]:
|
||||
args.append("--%s" % k)
|
||||
if self.replay.options['opts'][k] != True:
|
||||
args.append(self.replay.options['opts'][k])
|
||||
cmdline.execute_with_args(args)
|
||||
|
||||
def action_showitems(self, opts):
|
||||
s = ""
|
||||
|
@ -68,6 +68,7 @@ class Replay(object):
|
||||
scrapymanager.runonce(*args, **opts)
|
||||
|
||||
def record(self, args=None, opts=None):
|
||||
self.recording = True
|
||||
self.options.clear()
|
||||
self.options['args'] = args or []
|
||||
self.options['opts'] = opts or {}
|
||||
@ -79,8 +80,13 @@ class Replay(object):
|
||||
|
||||
def update(self, args=None, opts=None):
|
||||
self.updating = True
|
||||
|
||||
self.play(args, opts)
|
||||
if (opts['pages']):
|
||||
args = self.options['args']
|
||||
opts = self.options['opts']
|
||||
settings.overrides['CACHE2_EXPIRATION_SECS'] = 0
|
||||
settings.overrides['CACHE2_IGNORE_MISSING'] = False
|
||||
else:
|
||||
self.play(args, opts)
|
||||
|
||||
def engine_started(self):
|
||||
log.msg("Replay: recording session in %s" % self.repfile)
|
||||
|
Loading…
x
Reference in New Issue
Block a user