mirror of
https://github.com/scrapy/scrapy.git
synced 2025-03-01 06:59:29 +00:00
--HG-- rename : scrapy/trunk/AUTHORS => AUTHORS rename : scrapy/trunk/INSTALL => INSTALL rename : scrapy/trunk/LICENSE => LICENSE rename : scrapy/trunk/README => README rename : scrapy/trunk/bin/runtests.sh => bin/runtests.sh rename : scrapy/trunk/docs/Makefile => docs/Makefile rename : scrapy/trunk/docs/README => docs/README rename : scrapy/trunk/docs/_ext/scrapydocs.py => docs/_ext/scrapydocs.py rename : scrapy/trunk/docs/_static/items_adaptors-sample1.html => docs/_static/items_adaptors-sample1.html rename : scrapy/trunk/docs/_static/scrapydoc.css => docs/_static/scrapydoc.css rename : scrapy/trunk/docs/_static/selectors-sample1.html => docs/_static/selectors-sample1.html rename : scrapy/trunk/docs/conf.py => docs/conf.py rename : scrapy/trunk/docs/faq.rst => docs/faq.rst rename : scrapy/trunk/docs/index.rst => docs/index.rst rename : scrapy/trunk/docs/intro/index.rst => docs/intro/index.rst rename : scrapy/trunk/docs/intro/install.rst => docs/intro/install.rst rename : scrapy/trunk/docs/intro/overview.rst => docs/intro/overview.rst rename : scrapy/trunk/docs/intro/tutorial.rst => docs/intro/tutorial.rst rename : scrapy/trunk/docs/media/scrapy-architecture.dia => docs/media/scrapy-architecture.dia rename : scrapy/trunk/docs/misc/api-stability.rst => docs/misc/api-stability.rst rename : scrapy/trunk/docs/misc/index.rst => docs/misc/index.rst rename : scrapy/trunk/docs/proposed/_images/scrapy_architecture.odg => docs/proposed/_images/scrapy_architecture.odg rename : scrapy/trunk/docs/proposed/_images/scrapy_architecture.png => docs/proposed/_images/scrapy_architecture.png rename : scrapy/trunk/docs/proposed/index.rst => docs/proposed/index.rst rename : scrapy/trunk/docs/proposed/introduction.rst => docs/proposed/introduction.rst rename : scrapy/trunk/docs/proposed/newitem.rst => docs/proposed/newitem.rst rename : scrapy/trunk/docs/proposed/spiders.rst => docs/proposed/spiders.rst rename : scrapy/trunk/docs/ref/downloader-middleware.rst => 
docs/ref/downloader-middleware.rst rename : scrapy/trunk/docs/ref/email.rst => docs/ref/email.rst rename : scrapy/trunk/docs/ref/exceptions.rst => docs/ref/exceptions.rst rename : scrapy/trunk/docs/ref/extension-manager.rst => docs/ref/extension-manager.rst rename : scrapy/trunk/docs/ref/extensions.rst => docs/ref/extensions.rst rename : scrapy/trunk/docs/ref/index.rst => docs/ref/index.rst rename : scrapy/trunk/docs/ref/link-extractors.rst => docs/ref/link-extractors.rst rename : scrapy/trunk/docs/ref/logging.rst => docs/ref/logging.rst rename : scrapy/trunk/docs/ref/request-response.rst => docs/ref/request-response.rst rename : scrapy/trunk/docs/ref/selectors.rst => docs/ref/selectors.rst rename : scrapy/trunk/docs/ref/settings.rst => docs/ref/settings.rst rename : scrapy/trunk/docs/ref/signals.rst => docs/ref/signals.rst rename : scrapy/trunk/docs/ref/spiders.rst => docs/ref/spiders.rst rename : scrapy/trunk/docs/topics/_images/adaptors_diagram.png => docs/topics/_images/adaptors_diagram.png rename : scrapy/trunk/docs/topics/_images/adaptors_diagram.svg => docs/topics/_images/adaptors_diagram.svg rename : scrapy/trunk/docs/topics/_images/firebug1.png => docs/topics/_images/firebug1.png rename : scrapy/trunk/docs/topics/_images/firebug2.png => docs/topics/_images/firebug2.png rename : scrapy/trunk/docs/topics/_images/firebug3.png => docs/topics/_images/firebug3.png rename : scrapy/trunk/docs/topics/_images/scrapy_architecture.odg => docs/topics/_images/scrapy_architecture.odg rename : scrapy/trunk/docs/topics/_images/scrapy_architecture.png => docs/topics/_images/scrapy_architecture.png rename : scrapy/trunk/docs/topics/adaptors.rst => docs/topics/adaptors.rst rename : scrapy/trunk/docs/topics/architecture.rst => docs/topics/architecture.rst rename : scrapy/trunk/docs/topics/downloader-middleware.rst => docs/topics/downloader-middleware.rst rename : scrapy/trunk/docs/topics/extensions.rst => docs/topics/extensions.rst rename : scrapy/trunk/docs/topics/firebug.rst 
=> docs/topics/firebug.rst rename : scrapy/trunk/docs/topics/firefox.rst => docs/topics/firefox.rst rename : scrapy/trunk/docs/topics/index.rst => docs/topics/index.rst rename : scrapy/trunk/docs/topics/item-pipeline.rst => docs/topics/item-pipeline.rst rename : scrapy/trunk/docs/topics/items.rst => docs/topics/items.rst rename : scrapy/trunk/docs/topics/link-extractors.rst => docs/topics/link-extractors.rst rename : scrapy/trunk/docs/topics/robotstxt.rst => docs/topics/robotstxt.rst rename : scrapy/trunk/docs/topics/selectors.rst => docs/topics/selectors.rst rename : scrapy/trunk/docs/topics/settings.rst => docs/topics/settings.rst rename : scrapy/trunk/docs/topics/shell.rst => docs/topics/shell.rst rename : scrapy/trunk/docs/topics/spider-middleware.rst => docs/topics/spider-middleware.rst rename : scrapy/trunk/docs/topics/spiders.rst => docs/topics/spiders.rst rename : scrapy/trunk/docs/topics/stats.rst => docs/topics/stats.rst rename : scrapy/trunk/docs/topics/webconsole.rst => docs/topics/webconsole.rst rename : scrapy/trunk/examples/experimental/googledir/googledir/__init__.py => examples/experimental/googledir/googledir/__init__.py rename : scrapy/trunk/examples/experimental/googledir/googledir/items.py => examples/experimental/googledir/googledir/items.py rename : scrapy/trunk/examples/experimental/googledir/googledir/pipelines.py => examples/experimental/googledir/googledir/pipelines.py rename : scrapy/trunk/examples/experimental/googledir/googledir/settings.py => examples/experimental/googledir/googledir/settings.py rename : scrapy/trunk/examples/experimental/googledir/googledir/spiders/__init__.py => examples/experimental/googledir/googledir/spiders/__init__.py rename : scrapy/trunk/examples/experimental/googledir/googledir/spiders/google_directory.py => examples/experimental/googledir/googledir/spiders/google_directory.py rename : scrapy/trunk/examples/experimental/googledir/googledir/templates/spider_basic.tmpl => 
examples/experimental/googledir/googledir/templates/spider_basic.tmpl rename : scrapy/trunk/examples/experimental/googledir/googledir/templates/spider_crawl.tmpl => examples/experimental/googledir/googledir/templates/spider_crawl.tmpl rename : scrapy/trunk/examples/experimental/googledir/googledir/templates/spider_csvfeed.tmpl => examples/experimental/googledir/googledir/templates/spider_csvfeed.tmpl rename : scrapy/trunk/examples/experimental/googledir/googledir/templates/spider_xmlfeed.tmpl => examples/experimental/googledir/googledir/templates/spider_xmlfeed.tmpl rename : scrapy/trunk/examples/experimental/googledir/scrapy-ctl.py => examples/experimental/googledir/scrapy-ctl.py rename : scrapy/trunk/examples/googledir/googledir/__init__.py => examples/googledir/googledir/__init__.py rename : scrapy/trunk/examples/googledir/googledir/items.py => examples/googledir/googledir/items.py rename : scrapy/trunk/examples/googledir/googledir/pipelines.py => examples/googledir/googledir/pipelines.py rename : scrapy/trunk/examples/googledir/googledir/settings.py => examples/googledir/googledir/settings.py rename : scrapy/trunk/examples/googledir/googledir/spiders/__init__.py => examples/googledir/googledir/spiders/__init__.py rename : scrapy/trunk/examples/googledir/googledir/spiders/google_directory.py => examples/googledir/googledir/spiders/google_directory.py rename : scrapy/trunk/examples/googledir/scrapy-ctl.py => examples/googledir/scrapy-ctl.py rename : scrapy/trunk/extras/sql/scraping.sql => extras/sql/scraping.sql rename : scrapy/trunk/profiling/priorityqueue/pq_classes.py => profiling/priorityqueue/pq_classes.py rename : scrapy/trunk/profiling/priorityqueue/run.py => profiling/priorityqueue/run.py rename : scrapy/trunk/profiling/priorityqueue/test_cases.py => profiling/priorityqueue/test_cases.py rename : scrapy/trunk/scrapy/__init__.py => scrapy/__init__.py rename : scrapy/trunk/scrapy/bin/scrapy-admin.py => scrapy/bin/scrapy-admin.py rename : 
scrapy/trunk/scrapy/command/__init__.py => scrapy/command/__init__.py rename : scrapy/trunk/scrapy/command/cmdline.py => scrapy/command/cmdline.py rename : scrapy/trunk/scrapy/command/commands/__init__.py => scrapy/command/commands/__init__.py rename : scrapy/trunk/scrapy/command/commands/crawl.py => scrapy/command/commands/crawl.py rename : scrapy/trunk/scrapy/command/commands/download.py => scrapy/command/commands/download.py rename : scrapy/trunk/scrapy/command/commands/genspider.py => scrapy/command/commands/genspider.py rename : scrapy/trunk/scrapy/command/commands/help.py => scrapy/command/commands/help.py rename : scrapy/trunk/scrapy/command/commands/list.py => scrapy/command/commands/list.py rename : scrapy/trunk/scrapy/command/commands/log.py => scrapy/command/commands/log.py rename : scrapy/trunk/scrapy/command/commands/parse.py => scrapy/command/commands/parse.py rename : scrapy/trunk/scrapy/command/commands/shell.py => scrapy/command/commands/shell.py rename : scrapy/trunk/scrapy/command/commands/start.py => scrapy/command/commands/start.py rename : scrapy/trunk/scrapy/command/commands/stats.py => scrapy/command/commands/stats.py rename : scrapy/trunk/scrapy/command/models.py => scrapy/command/models.py rename : scrapy/trunk/scrapy/conf/__init__.py => scrapy/conf/__init__.py rename : scrapy/trunk/scrapy/conf/commands/__init__.py => scrapy/conf/commands/__init__.py rename : scrapy/trunk/scrapy/conf/commands/crawl.py => scrapy/conf/commands/crawl.py rename : scrapy/trunk/scrapy/conf/commands/help.py => scrapy/conf/commands/help.py rename : scrapy/trunk/scrapy/conf/commands/list.py => scrapy/conf/commands/list.py rename : scrapy/trunk/scrapy/conf/commands/log.py => scrapy/conf/commands/log.py rename : scrapy/trunk/scrapy/conf/commands/scrape.py => scrapy/conf/commands/scrape.py rename : scrapy/trunk/scrapy/conf/commands/shell.py => scrapy/conf/commands/shell.py rename : scrapy/trunk/scrapy/conf/commands/stats.py => scrapy/conf/commands/stats.py rename : 
scrapy/trunk/scrapy/conf/commands/test.py => scrapy/conf/commands/test.py rename : scrapy/trunk/scrapy/conf/default_settings.py => scrapy/conf/default_settings.py rename : scrapy/trunk/scrapy/contrib/__init__.py => scrapy/contrib/__init__.py rename : scrapy/trunk/scrapy/contrib/aws.py => scrapy/contrib/aws.py rename : scrapy/trunk/scrapy/contrib/closedomain.py => scrapy/contrib/closedomain.py rename : scrapy/trunk/scrapy/contrib/cluster/__init__.py => scrapy/contrib/cluster/__init__.py rename : scrapy/trunk/scrapy/contrib/cluster/crawler/__init__.py => scrapy/contrib/cluster/crawler/__init__.py rename : scrapy/trunk/scrapy/contrib/cluster/crawler/manager.py => scrapy/contrib/cluster/crawler/manager.py rename : scrapy/trunk/scrapy/contrib/cluster/hooks/__init__.py => scrapy/contrib/cluster/hooks/__init__.py rename : scrapy/trunk/scrapy/contrib/cluster/hooks/svn.py => scrapy/contrib/cluster/hooks/svn.py rename : scrapy/trunk/scrapy/contrib/cluster/master/__init__.py => scrapy/contrib/cluster/master/__init__.py rename : scrapy/trunk/scrapy/contrib/cluster/master/manager.py => scrapy/contrib/cluster/master/manager.py rename : scrapy/trunk/scrapy/contrib/cluster/master/web.py => scrapy/contrib/cluster/master/web.py rename : scrapy/trunk/scrapy/contrib/cluster/master/ws_api.txt => scrapy/contrib/cluster/master/ws_api.txt rename : scrapy/trunk/scrapy/contrib/cluster/tools/scrapy-cluster-ctl.py => scrapy/contrib/cluster/tools/scrapy-cluster-ctl.py rename : scrapy/trunk/scrapy/contrib/cluster/tools/test-worker.py => scrapy/contrib/cluster/tools/test-worker.py rename : scrapy/trunk/scrapy/contrib/cluster/worker/__init__.py => scrapy/contrib/cluster/worker/__init__.py rename : scrapy/trunk/scrapy/contrib/cluster/worker/manager.py => scrapy/contrib/cluster/worker/manager.py rename : scrapy/trunk/scrapy/contrib/codecs/__init__.py => scrapy/contrib/codecs/__init__.py rename : scrapy/trunk/scrapy/contrib/codecs/x_mac_roman.py => scrapy/contrib/codecs/x_mac_roman.py rename : 
scrapy/trunk/scrapy/contrib/debug.py => scrapy/contrib/debug.py rename : scrapy/trunk/scrapy/contrib/delayedclosedomain.py => scrapy/contrib/delayedclosedomain.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/__init__.py => scrapy/contrib/downloadermiddleware/__init__.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/cache.py => scrapy/contrib/downloadermiddleware/cache.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/common.py => scrapy/contrib/downloadermiddleware/common.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/cookies.py => scrapy/contrib/downloadermiddleware/cookies.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/debug.py => scrapy/contrib/downloadermiddleware/debug.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/errorpages.py => scrapy/contrib/downloadermiddleware/errorpages.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/httpauth.py => scrapy/contrib/downloadermiddleware/httpauth.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/httpcompression.py => scrapy/contrib/downloadermiddleware/httpcompression.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/redirect.py => scrapy/contrib/downloadermiddleware/redirect.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/retry.py => scrapy/contrib/downloadermiddleware/retry.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/robotstxt.py => scrapy/contrib/downloadermiddleware/robotstxt.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/stats.py => scrapy/contrib/downloadermiddleware/stats.py rename : scrapy/trunk/scrapy/contrib/downloadermiddleware/useragent.py => scrapy/contrib/downloadermiddleware/useragent.py rename : scrapy/trunk/scrapy/contrib/groupsettings.py => scrapy/contrib/groupsettings.py rename : scrapy/trunk/scrapy/contrib/item/__init__.py => scrapy/contrib/item/__init__.py rename : scrapy/trunk/scrapy/contrib/item/models.py => scrapy/contrib/item/models.py rename : 
scrapy/trunk/scrapy/contrib/itemsampler.py => scrapy/contrib/itemsampler.py rename : scrapy/trunk/scrapy/contrib/link_extractors.py => scrapy/contrib/link_extractors.py rename : scrapy/trunk/scrapy/contrib/memdebug.py => scrapy/contrib/memdebug.py rename : scrapy/trunk/scrapy/contrib/memusage.py => scrapy/contrib/memusage.py rename : scrapy/trunk/scrapy/contrib/pipeline/__init__.py => scrapy/contrib/pipeline/__init__.py rename : scrapy/trunk/scrapy/contrib/pipeline/images.py => scrapy/contrib/pipeline/images.py rename : scrapy/trunk/scrapy/contrib/pipeline/media.py => scrapy/contrib/pipeline/media.py rename : scrapy/trunk/scrapy/contrib/pipeline/s3images.py => scrapy/contrib/pipeline/s3images.py rename : scrapy/trunk/scrapy/contrib/pipeline/show.py => scrapy/contrib/pipeline/show.py rename : scrapy/trunk/scrapy/contrib/prioritizers.py => scrapy/contrib/prioritizers.py rename : scrapy/trunk/scrapy/contrib/response/__init__.py => scrapy/contrib/response/__init__.py rename : scrapy/trunk/scrapy/contrib/response/soup.py => scrapy/contrib/response/soup.py rename : scrapy/trunk/scrapy/contrib/schedulermiddleware/__init__.py => scrapy/contrib/schedulermiddleware/__init__.py rename : scrapy/trunk/scrapy/contrib/schedulermiddleware/duplicatesfilter.py => scrapy/contrib/schedulermiddleware/duplicatesfilter.py rename : scrapy/trunk/scrapy/contrib/spider/__init__.py => scrapy/contrib/spider/__init__.py rename : scrapy/trunk/scrapy/contrib/spider/profiler.py => scrapy/contrib/spider/profiler.py rename : scrapy/trunk/scrapy/contrib/spider/reloader.py => scrapy/contrib/spider/reloader.py rename : scrapy/trunk/scrapy/contrib/spidermiddleware/__init__.py => scrapy/contrib/spidermiddleware/__init__.py rename : scrapy/trunk/scrapy/contrib/spidermiddleware/depth.py => scrapy/contrib/spidermiddleware/depth.py rename : scrapy/trunk/scrapy/contrib/spidermiddleware/duplicatesfilter.py => scrapy/contrib/spidermiddleware/duplicatesfilter.py rename : 
scrapy/trunk/scrapy/contrib/spidermiddleware/limit.py => scrapy/contrib/spidermiddleware/limit.py rename : scrapy/trunk/scrapy/contrib/spidermiddleware/offsite.py => scrapy/contrib/spidermiddleware/offsite.py rename : scrapy/trunk/scrapy/contrib/spidermiddleware/referer.py => scrapy/contrib/spidermiddleware/referer.py rename : scrapy/trunk/scrapy/contrib/spidermiddleware/restrict.py => scrapy/contrib/spidermiddleware/restrict.py rename : scrapy/trunk/scrapy/contrib/spidermiddleware/urlfilter.py => scrapy/contrib/spidermiddleware/urlfilter.py rename : scrapy/trunk/scrapy/contrib/spidermiddleware/urllength.py => scrapy/contrib/spidermiddleware/urllength.py rename : scrapy/trunk/scrapy/contrib/spiders/__init__.py => scrapy/contrib/spiders/__init__.py rename : scrapy/trunk/scrapy/contrib/spiders/crawl.py => scrapy/contrib/spiders/crawl.py rename : scrapy/trunk/scrapy/contrib/spiders/feed.py => scrapy/contrib/spiders/feed.py rename : scrapy/trunk/scrapy/contrib/spiders/generic.py => scrapy/contrib/spiders/generic.py rename : scrapy/trunk/scrapy/contrib/web/__init__.py => scrapy/contrib/web/__init__.py rename : scrapy/trunk/scrapy/contrib/web/http.py => scrapy/contrib/web/http.py rename : scrapy/trunk/scrapy/contrib/web/json.py => scrapy/contrib/web/json.py rename : scrapy/trunk/scrapy/contrib/web/service.py => scrapy/contrib/web/service.py rename : scrapy/trunk/scrapy/contrib/web/site.py => scrapy/contrib/web/site.py rename : scrapy/trunk/scrapy/contrib/web/stats.py => scrapy/contrib/web/stats.py rename : scrapy/trunk/scrapy/contrib/webconsole/__init__.py => scrapy/contrib/webconsole/__init__.py rename : scrapy/trunk/scrapy/contrib/webconsole/enginestatus.py => scrapy/contrib/webconsole/enginestatus.py rename : scrapy/trunk/scrapy/contrib/webconsole/livestats.py => scrapy/contrib/webconsole/livestats.py rename : scrapy/trunk/scrapy/contrib/webconsole/scheduler.py => scrapy/contrib/webconsole/scheduler.py rename : scrapy/trunk/scrapy/contrib/webconsole/spiderctl.py => 
scrapy/contrib/webconsole/spiderctl.py rename : scrapy/trunk/scrapy/contrib/webconsole/spiderstats.py => scrapy/contrib/webconsole/spiderstats.py rename : scrapy/trunk/scrapy/contrib/webconsole/stats.py => scrapy/contrib/webconsole/stats.py rename : scrapy/trunk/scrapy/contrib_exp/__init__.py => scrapy/contrib_exp/__init__.py rename : scrapy/trunk/scrapy/contrib_exp/adaptors/__init__.py => scrapy/contrib_exp/adaptors/__init__.py rename : scrapy/trunk/scrapy/contrib_exp/adaptors/date.py => scrapy/contrib_exp/adaptors/date.py rename : scrapy/trunk/scrapy/contrib_exp/adaptors/extraction.py => scrapy/contrib_exp/adaptors/extraction.py rename : scrapy/trunk/scrapy/contrib_exp/adaptors/markup.py => scrapy/contrib_exp/adaptors/markup.py rename : scrapy/trunk/scrapy/contrib_exp/adaptors/misc.py => scrapy/contrib_exp/adaptors/misc.py rename : scrapy/trunk/scrapy/contrib_exp/downloadermiddleware/__init__.py => scrapy/contrib_exp/downloadermiddleware/__init__.py rename : scrapy/trunk/scrapy/contrib_exp/downloadermiddleware/decompression.py => scrapy/contrib_exp/downloadermiddleware/decompression.py rename : scrapy/trunk/scrapy/contrib_exp/history/__init__.py => scrapy/contrib_exp/history/__init__.py rename : scrapy/trunk/scrapy/contrib_exp/history/history.py => scrapy/contrib_exp/history/history.py rename : scrapy/trunk/scrapy/contrib_exp/history/middleware.py => scrapy/contrib_exp/history/middleware.py rename : scrapy/trunk/scrapy/contrib_exp/history/scheduler.py => scrapy/contrib_exp/history/scheduler.py rename : scrapy/trunk/scrapy/contrib_exp/history/store.py => scrapy/contrib_exp/history/store.py rename : scrapy/trunk/scrapy/contrib_exp/link/__init__.py => scrapy/contrib_exp/link/__init__.py rename : scrapy/trunk/scrapy/contrib_exp/newitem/__init__.py => scrapy/contrib_exp/newitem/__init__.py rename : scrapy/trunk/scrapy/contrib_exp/newitem/adaptors.py => scrapy/contrib_exp/newitem/adaptors.py rename : scrapy/trunk/scrapy/contrib_exp/newitem/fields.py => 
scrapy/contrib_exp/newitem/fields.py rename : scrapy/trunk/scrapy/contrib_exp/newitem/models.py => scrapy/contrib_exp/newitem/models.py rename : scrapy/trunk/scrapy/contrib_exp/pipeline/shoveitem.py => scrapy/contrib_exp/pipeline/shoveitem.py rename : scrapy/trunk/scrapy/core/__init__.py => scrapy/core/__init__.py rename : scrapy/trunk/scrapy/core/downloader/__init__.py => scrapy/core/downloader/__init__.py rename : scrapy/trunk/scrapy/core/downloader/dnscache.py => scrapy/core/downloader/dnscache.py rename : scrapy/trunk/scrapy/core/downloader/handlers.py => scrapy/core/downloader/handlers.py rename : scrapy/trunk/scrapy/core/downloader/manager.py => scrapy/core/downloader/manager.py rename : scrapy/trunk/scrapy/core/downloader/middleware.py => scrapy/core/downloader/middleware.py rename : scrapy/trunk/scrapy/core/downloader/responsetypes/__init__.py => scrapy/core/downloader/responsetypes/__init__.py rename : scrapy/trunk/scrapy/core/downloader/responsetypes/mime.types => scrapy/core/downloader/responsetypes/mime.types rename : scrapy/trunk/scrapy/core/downloader/webclient.py => scrapy/core/downloader/webclient.py rename : scrapy/trunk/scrapy/core/engine.py => scrapy/core/engine.py rename : scrapy/trunk/scrapy/core/exceptions.py => scrapy/core/exceptions.py rename : scrapy/trunk/scrapy/core/manager.py => scrapy/core/manager.py rename : scrapy/trunk/scrapy/core/prioritizers.py => scrapy/core/prioritizers.py rename : scrapy/trunk/scrapy/core/scheduler/__init__.py => scrapy/core/scheduler/__init__.py rename : scrapy/trunk/scrapy/core/scheduler/middleware.py => scrapy/core/scheduler/middleware.py rename : scrapy/trunk/scrapy/core/scheduler/schedulers.py => scrapy/core/scheduler/schedulers.py rename : scrapy/trunk/scrapy/core/scheduler/store.py => scrapy/core/scheduler/store.py rename : scrapy/trunk/scrapy/core/signals.py => scrapy/core/signals.py rename : scrapy/trunk/scrapy/dupefilter/__init__.py => scrapy/dupefilter/__init__.py rename : 
scrapy/trunk/scrapy/extension/__init__.py => scrapy/extension/__init__.py rename : scrapy/trunk/scrapy/fetcher/__init__.py => scrapy/fetcher/__init__.py rename : scrapy/trunk/scrapy/http/__init__.py => scrapy/http/__init__.py rename : scrapy/trunk/scrapy/http/cookies.py => scrapy/http/cookies.py rename : scrapy/trunk/scrapy/http/headers.py => scrapy/http/headers.py rename : scrapy/trunk/scrapy/http/request/__init__.py => scrapy/http/request/__init__.py rename : scrapy/trunk/scrapy/http/request/form.py => scrapy/http/request/form.py rename : scrapy/trunk/scrapy/http/request/rpc.py => scrapy/http/request/rpc.py rename : scrapy/trunk/scrapy/http/response/__init__.py => scrapy/http/response/__init__.py rename : scrapy/trunk/scrapy/http/response/html.py => scrapy/http/response/html.py rename : scrapy/trunk/scrapy/http/response/text.py => scrapy/http/response/text.py rename : scrapy/trunk/scrapy/http/response/xml.py => scrapy/http/response/xml.py rename : scrapy/trunk/scrapy/http/url.py => scrapy/http/url.py rename : scrapy/trunk/scrapy/item/__init__.py => scrapy/item/__init__.py rename : scrapy/trunk/scrapy/item/adaptors.py => scrapy/item/adaptors.py rename : scrapy/trunk/scrapy/item/models.py => scrapy/item/models.py rename : scrapy/trunk/scrapy/item/pipeline.py => scrapy/item/pipeline.py rename : scrapy/trunk/scrapy/link/__init__.py => scrapy/link/__init__.py rename : scrapy/trunk/scrapy/link/extractors.py => scrapy/link/extractors.py rename : scrapy/trunk/scrapy/log/__init__.py => scrapy/log/__init__.py rename : scrapy/trunk/scrapy/mail/__init__.py => scrapy/mail/__init__.py rename : scrapy/trunk/scrapy/management/__init__.py => scrapy/management/__init__.py rename : scrapy/trunk/scrapy/management/telnet.py => scrapy/management/telnet.py rename : scrapy/trunk/scrapy/management/web.py => scrapy/management/web.py rename : scrapy/trunk/scrapy/patches/__init__.py => scrapy/patches/__init__.py rename : scrapy/trunk/scrapy/patches/monkeypatches.py => 
scrapy/patches/monkeypatches.py rename : scrapy/trunk/scrapy/spider/__init__.py => scrapy/spider/__init__.py rename : scrapy/trunk/scrapy/spider/manager.py => scrapy/spider/manager.py rename : scrapy/trunk/scrapy/spider/middleware.py => scrapy/spider/middleware.py rename : scrapy/trunk/scrapy/spider/models.py => scrapy/spider/models.py rename : scrapy/trunk/scrapy/stats/__init__.py => scrapy/stats/__init__.py rename : scrapy/trunk/scrapy/stats/corestats.py => scrapy/stats/corestats.py rename : scrapy/trunk/scrapy/stats/statscollector.py => scrapy/stats/statscollector.py rename : scrapy/trunk/scrapy/store/__init__.py => scrapy/store/__init__.py rename : scrapy/trunk/scrapy/store/db.py => scrapy/store/db.py rename : scrapy/trunk/scrapy/templates/project/module/__init__.py => scrapy/templates/project/module/__init__.py rename : scrapy/trunk/scrapy/templates/project/module/items.py.tmpl => scrapy/templates/project/module/items.py.tmpl rename : scrapy/trunk/scrapy/templates/project/module/pipelines.py.tmpl => scrapy/templates/project/module/pipelines.py.tmpl rename : scrapy/trunk/scrapy/templates/project/module/settings.py.tmpl => scrapy/templates/project/module/settings.py.tmpl rename : scrapy/trunk/scrapy/templates/project/module/spiders/__init__.py => scrapy/templates/project/module/spiders/__init__.py rename : scrapy/trunk/scrapy/templates/project/module/templates/spider_basic.tmpl => scrapy/templates/project/module/templates/spider_basic.tmpl rename : scrapy/trunk/scrapy/templates/project/module/templates/spider_crawl.tmpl => scrapy/templates/project/module/templates/spider_crawl.tmpl rename : scrapy/trunk/scrapy/templates/project/module/templates/spider_csvfeed.tmpl => scrapy/templates/project/module/templates/spider_csvfeed.tmpl rename : scrapy/trunk/scrapy/templates/project/module/templates/spider_xmlfeed.tmpl => scrapy/templates/project/module/templates/spider_xmlfeed.tmpl rename : scrapy/trunk/scrapy/templates/project/root/scrapy-ctl.py => 
scrapy/templates/project/root/scrapy-ctl.py rename : scrapy/trunk/scrapy/tests/__init__.py => scrapy/tests/__init__.py rename : scrapy/trunk/scrapy/tests/run.py => scrapy/tests/run.py rename : scrapy/trunk/scrapy/tests/sample_data/adaptors/enc-ascii.html => scrapy/tests/sample_data/adaptors/enc-ascii.html rename : scrapy/trunk/scrapy/tests/sample_data/adaptors/enc-cp1252.html => scrapy/tests/sample_data/adaptors/enc-cp1252.html rename : scrapy/trunk/scrapy/tests/sample_data/adaptors/enc-latin1.html => scrapy/tests/sample_data/adaptors/enc-latin1.html rename : scrapy/trunk/scrapy/tests/sample_data/adaptors/enc-utf8-meta-latin1.html => scrapy/tests/sample_data/adaptors/enc-utf8-meta-latin1.html rename : scrapy/trunk/scrapy/tests/sample_data/adaptors/enc-utf8.html => scrapy/tests/sample_data/adaptors/enc-utf8.html rename : scrapy/trunk/scrapy/tests/sample_data/adaptors/extr_unquoted.xml => scrapy/tests/sample_data/adaptors/extr_unquoted.xml rename : scrapy/trunk/scrapy/tests/sample_data/compressed/feed-sample1.tar => scrapy/tests/sample_data/compressed/feed-sample1.tar rename : scrapy/trunk/scrapy/tests/sample_data/compressed/feed-sample1.xml => scrapy/tests/sample_data/compressed/feed-sample1.xml rename : scrapy/trunk/scrapy/tests/sample_data/compressed/feed-sample1.xml.bz2 => scrapy/tests/sample_data/compressed/feed-sample1.xml.bz2 rename : scrapy/trunk/scrapy/tests/sample_data/compressed/feed-sample1.xml.gz => scrapy/tests/sample_data/compressed/feed-sample1.xml.gz rename : scrapy/trunk/scrapy/tests/sample_data/compressed/feed-sample1.zip => scrapy/tests/sample_data/compressed/feed-sample1.zip rename : scrapy/trunk/scrapy/tests/sample_data/compressed/html-gzip.bin => scrapy/tests/sample_data/compressed/html-gzip.bin rename : scrapy/trunk/scrapy/tests/sample_data/compressed/html-rawdeflate.bin => scrapy/tests/sample_data/compressed/html-rawdeflate.bin rename : scrapy/trunk/scrapy/tests/sample_data/compressed/html-zlibdeflate.bin => 
scrapy/tests/sample_data/compressed/html-zlibdeflate.bin rename : scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample1.xml => scrapy/tests/sample_data/feeds/feed-sample1.xml rename : scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample2.xml => scrapy/tests/sample_data/feeds/feed-sample2.xml rename : scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample3.csv => scrapy/tests/sample_data/feeds/feed-sample3.csv rename : scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample4.csv => scrapy/tests/sample_data/feeds/feed-sample4.csv rename : scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample5.csv => scrapy/tests/sample_data/feeds/feed-sample5.csv rename : scrapy/trunk/scrapy/tests/sample_data/link_extractor/image_linkextractor.html => scrapy/tests/sample_data/link_extractor/image_linkextractor.html rename : scrapy/trunk/scrapy/tests/sample_data/link_extractor/linkextractor_latin1.html => scrapy/tests/sample_data/link_extractor/linkextractor_latin1.html rename : scrapy/trunk/scrapy/tests/sample_data/link_extractor/linkextractor_noenc.html => scrapy/tests/sample_data/link_extractor/linkextractor_noenc.html rename : scrapy/trunk/scrapy/tests/sample_data/link_extractor/regex_linkextractor.html => scrapy/tests/sample_data/link_extractor/regex_linkextractor.html rename : scrapy/trunk/scrapy/tests/sample_data/test_site/index.html => scrapy/tests/sample_data/test_site/index.html rename : scrapy/trunk/scrapy/tests/sample_data/test_site/item1.html => scrapy/tests/sample_data/test_site/item1.html rename : scrapy/trunk/scrapy/tests/sample_data/test_site/item2.html => scrapy/tests/sample_data/test_site/item2.html rename : scrapy/trunk/scrapy/tests/test_adaptors.py => scrapy/tests/test_adaptors.py rename : scrapy/trunk/scrapy/tests/test_aws.py => scrapy/tests/test_aws.py rename : scrapy/trunk/scrapy/tests/test_c14nurls.py => scrapy/tests/test_c14nurls.py rename : scrapy/trunk/scrapy/tests/test_contrib_response_soup.py => scrapy/tests/test_contrib_response_soup.py rename : 
scrapy/trunk/scrapy/tests/test_dependencies.py => scrapy/tests/test_dependencies.py rename : scrapy/trunk/scrapy/tests/test_downloadermiddleware_cookies.py => scrapy/tests/test_downloadermiddleware_cookies.py rename : scrapy/trunk/scrapy/tests/test_downloadermiddleware_decompression.py => scrapy/tests/test_downloadermiddleware_decompression.py rename : scrapy/trunk/scrapy/tests/test_downloadermiddleware_httpcompression.py => scrapy/tests/test_downloadermiddleware_httpcompression.py rename : scrapy/trunk/scrapy/tests/test_downloadermiddleware_redirect.py => scrapy/tests/test_downloadermiddleware_redirect.py rename : scrapy/trunk/scrapy/tests/test_downloadermiddleware_retry.py => scrapy/tests/test_downloadermiddleware_retry.py rename : scrapy/trunk/scrapy/tests/test_downloadermiddleware_useragent.py => scrapy/tests/test_downloadermiddleware_useragent.py rename : scrapy/trunk/scrapy/tests/test_dupefilter.py => scrapy/tests/test_dupefilter.py rename : scrapy/trunk/scrapy/tests/test_engine.py => scrapy/tests/test_engine.py rename : scrapy/trunk/scrapy/tests/test_http_cookies.py => scrapy/tests/test_http_cookies.py rename : scrapy/trunk/scrapy/tests/test_http_headers.py => scrapy/tests/test_http_headers.py rename : scrapy/trunk/scrapy/tests/test_http_request.py => scrapy/tests/test_http_request.py rename : scrapy/trunk/scrapy/tests/test_http_response.py => scrapy/tests/test_http_response.py rename : scrapy/trunk/scrapy/tests/test_http_url.py => scrapy/tests/test_http_url.py rename : scrapy/trunk/scrapy/tests/test_item.py => scrapy/tests/test_item.py rename : scrapy/trunk/scrapy/tests/test_itemadaptor.py => scrapy/tests/test_itemadaptor.py rename : scrapy/trunk/scrapy/tests/test_libxml2.py => scrapy/tests/test_libxml2.py rename : scrapy/trunk/scrapy/tests/test_link.py => scrapy/tests/test_link.py rename : scrapy/trunk/scrapy/tests/test_newitem.py => scrapy/tests/test_newitem.py rename : scrapy/trunk/scrapy/tests/test_pipeline_images.py => 
scrapy/tests/test_pipeline_images.py rename : scrapy/trunk/scrapy/tests/test_responsetypes.py => scrapy/tests/test_responsetypes.py rename : scrapy/trunk/scrapy/tests/test_robustscrapeditem.py => scrapy/tests/test_robustscrapeditem.py rename : scrapy/trunk/scrapy/tests/test_schedulermiddleware_duplicatesfilter.py => scrapy/tests/test_schedulermiddleware_duplicatesfilter.py rename : scrapy/trunk/scrapy/tests/test_serialization.py => scrapy/tests/test_serialization.py rename : scrapy/trunk/scrapy/tests/test_spidermiddleware_duplicatesfilter.py => scrapy/tests/test_spidermiddleware_duplicatesfilter.py rename : scrapy/trunk/scrapy/tests/test_spidermonkey.py => scrapy/tests/test_spidermonkey.py rename : scrapy/trunk/scrapy/tests/test_spiders/__init__.py => scrapy/tests/test_spiders/__init__.py rename : scrapy/trunk/scrapy/tests/test_spiders/testspider.py => scrapy/tests/test_spiders/testspider.py rename : scrapy/trunk/scrapy/tests/test_stats.py => scrapy/tests/test_stats.py rename : scrapy/trunk/scrapy/tests/test_storedb.py => scrapy/tests/test_storedb.py rename : scrapy/trunk/scrapy/tests/test_utils_datatypes.py => scrapy/tests/test_utils_datatypes.py rename : scrapy/trunk/scrapy/tests/test_utils_defer.py => scrapy/tests/test_utils_defer.py rename : scrapy/trunk/scrapy/tests/test_utils_iterators.py => scrapy/tests/test_utils_iterators.py rename : scrapy/trunk/scrapy/tests/test_utils_markup.py => scrapy/tests/test_utils_markup.py rename : scrapy/trunk/scrapy/tests/test_utils_misc.py => scrapy/tests/test_utils_misc.py rename : scrapy/trunk/scrapy/tests/test_utils_python.py => scrapy/tests/test_utils_python.py rename : scrapy/trunk/scrapy/tests/test_utils_request.py => scrapy/tests/test_utils_request.py rename : scrapy/trunk/scrapy/tests/test_utils_response.py => scrapy/tests/test_utils_response.py rename : scrapy/trunk/scrapy/tests/test_utils_url.py => scrapy/tests/test_utils_url.py rename : scrapy/trunk/scrapy/tests/test_webclient.py => scrapy/tests/test_webclient.py 
rename : scrapy/trunk/scrapy/tests/test_xpath.py => scrapy/tests/test_xpath.py rename : scrapy/trunk/scrapy/tests/test_xpath_extension.py => scrapy/tests/test_xpath_extension.py rename : scrapy/trunk/scrapy/utils/__init__.py => scrapy/utils/__init__.py rename : scrapy/trunk/scrapy/utils/c14n.py => scrapy/utils/c14n.py rename : scrapy/trunk/scrapy/utils/datatypes.py => scrapy/utils/datatypes.py rename : scrapy/trunk/scrapy/utils/db.py => scrapy/utils/db.py rename : scrapy/trunk/scrapy/utils/defer.py => scrapy/utils/defer.py rename : scrapy/trunk/scrapy/utils/display.py => scrapy/utils/display.py rename : scrapy/trunk/scrapy/utils/http.py => scrapy/utils/http.py rename : scrapy/trunk/scrapy/utils/iterators.py => scrapy/utils/iterators.py rename : scrapy/trunk/scrapy/utils/markup.py => scrapy/utils/markup.py rename : scrapy/trunk/scrapy/utils/misc.py => scrapy/utils/misc.py rename : scrapy/trunk/scrapy/utils/python.py => scrapy/utils/python.py rename : scrapy/trunk/scrapy/utils/request.py => scrapy/utils/request.py rename : scrapy/trunk/scrapy/utils/response.py => scrapy/utils/response.py rename : scrapy/trunk/scrapy/utils/serialization.py => scrapy/utils/serialization.py rename : scrapy/trunk/scrapy/utils/test.py => scrapy/utils/test.py rename : scrapy/trunk/scrapy/utils/url.py => scrapy/utils/url.py rename : scrapy/trunk/scrapy/xlib/BeautifulSoup.py => scrapy/xlib/BeautifulSoup.py rename : scrapy/trunk/scrapy/xlib/ClientForm.py => scrapy/xlib/ClientForm.py rename : scrapy/trunk/scrapy/xlib/__init__.py => scrapy/xlib/__init__.py rename : scrapy/trunk/scrapy/xlib/lrucache.py => scrapy/xlib/lrucache.py rename : scrapy/trunk/scrapy/xlib/lsprofcalltree.py => scrapy/xlib/lsprofcalltree.py rename : scrapy/trunk/scrapy/xlib/pydispatch/__init__.py => scrapy/xlib/pydispatch/__init__.py rename : scrapy/trunk/scrapy/xlib/pydispatch/dispatcher.py => scrapy/xlib/pydispatch/dispatcher.py rename : scrapy/trunk/scrapy/xlib/pydispatch/errors.py => scrapy/xlib/pydispatch/errors.py 
rename : scrapy/trunk/scrapy/xlib/pydispatch/license.txt => scrapy/xlib/pydispatch/license.txt rename : scrapy/trunk/scrapy/xlib/pydispatch/robust.py => scrapy/xlib/pydispatch/robust.py rename : scrapy/trunk/scrapy/xlib/pydispatch/robustapply.py => scrapy/xlib/pydispatch/robustapply.py rename : scrapy/trunk/scrapy/xlib/pydispatch/saferef.py => scrapy/xlib/pydispatch/saferef.py rename : scrapy/trunk/scrapy/xlib/spidermonkey/INSTALL.scrapy => scrapy/xlib/spidermonkey/INSTALL.scrapy rename : scrapy/trunk/scrapy/xlib/spidermonkey/__init__.py => scrapy/xlib/spidermonkey/__init__.py rename : scrapy/trunk/scrapy/xlib/spidermonkey/sm_settings.py => scrapy/xlib/spidermonkey/sm_settings.py rename : scrapy/trunk/scrapy/xlib/spidermonkey/spidermonkey.py => scrapy/xlib/spidermonkey/spidermonkey.py rename : scrapy/trunk/scrapy/xpath/__init__.py => scrapy/xpath/__init__.py rename : scrapy/trunk/scrapy/xpath/constructors.py => scrapy/xpath/constructors.py rename : scrapy/trunk/scrapy/xpath/document.py => scrapy/xpath/document.py rename : scrapy/trunk/scrapy/xpath/extension.py => scrapy/xpath/extension.py rename : scrapy/trunk/scrapy/xpath/selector.py => scrapy/xpath/selector.py rename : scrapy/trunk/scrapy/xpath/types.py => scrapy/xpath/types.py rename : scrapy/trunk/scripts/rpm-install.sh => scripts/rpm-install.sh rename : scrapy/trunk/setup.cfg => setup.cfg rename : scrapy/trunk/setup.py => setup.py
115 lines
4.5 KiB
ReStructuredText
115 lines
4.5 KiB
ReStructuredText
.. _ref-link-extractors:
|
|
|
|
=========================
|
|
Available Link Extractors
|
|
=========================
|
|
|
|
.. module:: scrapy.link
|
|
:synopsis: Link extractor classes
|
|
|
|
LinkExtractor
|
|
=============
|
|
|
|
.. class:: LinkExtractor(tag="a", attr="href", unique=False, process_value=None)
|
|
|
|
This is the most basic Link Extractor which extracts links from a response
|
|
by looking at the given attributes inside the given tags.
|
|
|
|
The constructor arguments are:
|
|
|
|
:param tag: either a string (with the name of a tag) or a function that
|
|
receives a tag name and returns ``True`` if links should be extracted
|
|
from that tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
|
|
request (once its downloaded) as its first parameter. For more
|
|
information see :ref:`ref-request-callback-arguments` below.
|
|
:type tag: str or callable
|
|
|
|
:param attr: either a string (with the name of a tag attribute), or a
|
|
function that receives an attribute name and returns ``True`` if
|
|
links should be extracted from it, or ``False`` if they shouldn't.
|
|
Defaults to ``href``.
|
|
:type attr: str or callable
|
|
|
|
:param unique: a boolean that specifies whether duplicate filtering should
|
|
be applied to links extracted.
|
|
:type unique: boolean
|
|
|
|
:param process_value: a function which receives each value extracted from
|
|
the tag and attributes scanned and can modify the value and return a
|
|
new one, or return ``None`` to ignore the link altogether. If not
|
|
given, ``process_value`` defaults to ``lambda x: x``.
|
|
|
|
.. highlight:: html
|
|
|
|
For example, to extract links from this code::
|
|
|
|
<a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
|
|
|
|
.. highlight:: python
|
|
|
|
You can use the following function in ``process_value``::
|
|
|
|
def process_value(value):
|
|
m = re.search("javascript:goToPage\('(.*?)'", value)
|
|
if m:
|
|
return m.group(1)
|
|
|
|
:type process_value: callable
|
|
|
|
RegexLinkExtractor
|
|
==================
|
|
|
|
.. class:: RegexLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)
|
|
|
|
The RegexLinkExtractor extends the base :class:`LinkExtractor` by providing
|
|
additional filters that you can specify to extract links, including regular
|
|
expression patterns that the links must match to be extracted. All those
|
|
filters are configured through these constructor parameters:
|
|
|
|
:param allow: a single regular expression (or list of regular expressions)
|
|
that the (absolute) urls must match in order to be extracted. If not
|
|
given (or empty), it will match all links.
|
|
:type allow: a regular expression (or list of)
|
|
|
|
:param deny: a single regular expression (or list of regular expressions)
|
|
that the (absolute) urls must match in order to be excluded (ie. not
|
|
extracted). It has precedence over the ``allow`` parameter. If not
|
|
given (or empty) it won't exclude any links.
|
|
:type deny: a regular expression (or list of)
|
|
|
|
:param allow_domains: a single value or a list of strings containing
|
|
domains which will be considered for extracting the links
|
|
:type allow_domains: str or list
|
|
|
|
:param deny_domains: a single value or a list of strings containing
|
|
domains which won't be considered for extracting the links
|
|
:type deny_domains: str or list
|
|
|
|
:param restrict_xpaths: an XPath (or list of XPaths) which defines
|
|
regions inside the response where links should be extracted from.
|
|
If given, only the text selected by those XPaths will be scanned for
|
|
links. See examples below.
|
|
:type restrict_xpaths: str or list
|
|
|
|
:param tags: a tag or a list of tags to consider when extracting links.
|
|
Defaults to ``('a', 'area')``.
|
|
:type tags: str or list
|
|
|
|
:param attrs: list of attributes which should be considered when looking
|
|
for links to extract (only for those tags specified in the ``tags``
|
|
parameter). Defaults to ``('href',)``
|
|
:type attrs: list
|
|
|
|
:param canonicalize: canonicalize each extracted url (using
|
|
scrapy.utils.url.canonicalize_url). Defaults to ``True``.
|
|
:type canonicalize: boolean
|
|
|
|
:param unique: whether duplicate filtering should be applied to extracted
|
|
links.
|
|
:type unique: boolean
|
|
|
|
:param process_value: see ``process_value`` argument of
|
|
:class:`LinkExtractor` class constructor
|
|
:type process_value: callable
|
|
|