
Added support to replay update to re-crawl all the pages downloaded in the replay file.

Andres Moreira 2008-08-15 14:59:48 +00:00
parent 9b0dd66ec1
commit 801b804a4d
3 changed files with 33 additions and 12 deletions
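
Taken together, the three diffs below let replay update re-download every page stored in a replay file: the command-line entry point learns to take an explicit argv, the replay command grows a --pages flag, and Replay.update() disables page-cache expiration. A hypothetical end-to-end invocation, expressed through the new programmatic entry point (the replay subcommand's argument syntax is not shown in this commit, so the replay-file and action arguments are assumptions):

# A sketch, not taken from the commit: drive the new entry point with an
# explicit argv. 'pages.replay' and the 'update' action are placeholder
# values; only execute_with_args() itself is introduced by this commit.
from scrapy.command import cmdline

cmdline.execute_with_args(['scrapy-ctl.py', 'replay', 'pages.replay', 'update', '--pages'])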


@@ -42,14 +42,14 @@ def custom_commands_dict():
             print 'WARNING: Custom command module %s exists but Command class not found' % modname
     return d
 
-def getcmdname():
-    for arg in sys.argv[1:]:
+def getcmdname(argv):
+    for arg in argv[1:]:
         if not arg.startswith('-'):
             return arg
 
-def usage():
-    s = "usage: %s <subcommand> [options] [args]\n" % sys.argv[0]
-    s += "       %s <subcommand> -h\n\n" % sys.argv[0]
+def usage(argv):
+    s = "usage: %s <subcommand> [options] [args]\n" % argv[0]
+    s += "       %s <subcommand> -h\n\n" % argv[0]
     s += "Built-in subcommands:\n"
     builtin_cmds = builtin_commands_dict()
@@ -96,16 +96,19 @@ def command_settings(cmdname):
 command_executed = {}
 
 def execute():
+    execute_with_args(sys.argv)
+
+def execute_with_args(argv):
     spiders.load()
     cmds = builtin_commands_dict()
     cmds.update(custom_commands_dict())
-    cmdname = getcmdname()
+    cmdname = getcmdname(argv)
     command_settings(cmdname)
     if not cmdname:
         print "Scrapy %s\n" % scrapy.__version__
-        print usage()
+        print usage(argv)
         sys.exit()
 
     parser = optparse.OptionParser()
@@ -118,10 +121,10 @@ def execute():
     else:
         print "Scrapy %s\n" % scrapy.__version__
         print "Unknown command: %s\n" % cmdname
-        print 'Type "%s -h" for help' % sys.argv[0]
+        print 'Type "%s -h" for help' % argv[0]
         sys.exit()
 
-    (opts, args) = parser.parse_args()
+    (opts, args) = parser.parse_args(args=argv[1:])
     del args[0] # args[0] is cmdname
 
     # storing command executed info for later reference
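
This first file (likely scrapy/command/cmdline.py, judging by its contents) decouples command parsing from the global sys.argv: execute() is now a thin wrapper, so callers such as the replay command later in this commit can feed a synthetic argv. A minimal sketch of both call styles; the crawl subcommand and domain argument are placeholder values:

from scrapy.command import cmdline

# interactive use: unchanged behaviour, reads the real command line
# cmdline.execute()   # equivalent to execute_with_args(sys.argv)

# programmatic use: argv[0] only appears in usage/help text, the rest is
# parsed by optparse exactly as if it came from a shell
cmdline.execute_with_args(['scrapy-ctl.py', 'crawl', 'example.com'])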


@@ -4,6 +4,7 @@ from scrapy.command import ScrapyCommand
 from scrapy.replay import Replay
 from scrapy.utils import display
 from scrapy.conf import settings
+from scrapy.command import cmdline
 
 class Command(ScrapyCommand):
     def syntax(self):
@@ -31,6 +32,8 @@ class Command(ScrapyCommand):
         parser.add_option("--nocolour", dest="nocolour", action="store_true", help="disable colorized output (for console only)")
         parser.add_option("-i", "--ignore", dest="ignores", action="append", help="item attribute to ignore. can be passed multiple times", metavar="ATTR")
         parser.add_option("--target", dest="targets", action="append", help="crawl TARGET instead of recorded urls/domains. can be passed multiple times")
+        # adding option to update
+        parser.add_option("--pages", dest="pages", action="store_true", help="update all the pages in the replay file, recording it again.")
 
     def process_options(self, args, opts):
         ScrapyCommand.process_options(self, args, opts)
@@ -75,7 +78,16 @@ class Command(ScrapyCommand):
         self.replay.play(args=opts.targets)
 
     def action_update(self, opts):
-        self.action_crawl(opts)
+        self.replay.update(args=opts.targets, opts=opts.__dict__)
+        if (opts.pages):
+            args = ['decobot-ctl.py', 'crawl']
+            args.extend(self.replay.options['args'])
+            for k in self.replay.options['opts']:
+                if self.replay.options['opts'][k]:
+                    args.append("--%s" % k)
+                    if self.replay.options['opts'][k] != True:
+                        args.append(self.replay.options['opts'][k])
+            cmdline.execute_with_args(args)
 
     def action_showitems(self, opts):
         s = ""


@@ -68,6 +68,7 @@ class Replay(object):
         scrapymanager.runonce(*args, **opts)
 
     def record(self, args=None, opts=None):
         self.recording = True
+        self.options.clear()
         self.options['args'] = args or []
         self.options['opts'] = opts or {}
@@ -79,7 +80,12 @@
     def update(self, args=None, opts=None):
         self.updating = True
-        self.play(args, opts)
+        if (opts['pages']):
+            args = self.options['args']
+            opts = self.options['opts']
+            settings.overrides['CACHE2_EXPIRATION_SECS'] = 0
+            settings.overrides['CACHE2_IGNORE_MISSING'] = False
+        else:
+            self.play(args, opts)
 
     def engine_started(self):
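
With the pages option set, update() swaps in the recorded args/opts and forces a real re-download: CACHE2_EXPIRATION_SECS=0 marks every cached page as stale, and CACHE2_IGNORE_MISSING=False permits fetching pages the cache lacks. One sharp edge: opts['pages'] is indexed directly, so calling update() with its default opts=None raises a TypeError. A purely illustrative, more defensive variant of the method body (not part of the commit):

def update(self, args=None, opts=None):
    # Illustrative rewrite only: .get() tolerates a missing 'pages' key and
    # the opts=None default that the method signature advertises.
    self.updating = True
    if opts and opts.get('pages'):
        args = self.options['args']
        opts = self.options['opts']
        settings.overrides['CACHE2_EXPIRATION_SECS'] = 0    # every cached page counts as expired
        settings.overrides['CACHE2_IGNORE_MISSING'] = False # allow fetching uncached pages
    else:
        self.play(args, opts)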