mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 09:04:10 +00:00
Remove djangoitem since we moved it to scrapy/scrapy-djangoitem
This commit is contained in:
parent
bb4c8c33cc
commit
ffc60910aa
@ -2,11 +2,7 @@ import six
|
|||||||
import pytest
|
import pytest
|
||||||
from twisted.python import log
|
from twisted.python import log
|
||||||
|
|
||||||
from scrapy import optional_features
|
|
||||||
|
|
||||||
collect_ignore = ["scrapy/stats.py", "scrapy/project.py"]
|
collect_ignore = ["scrapy/stats.py", "scrapy/project.py"]
|
||||||
if 'django' not in optional_features:
|
|
||||||
collect_ignore.append("tests/test_djangoitem/models.py")
|
|
||||||
|
|
||||||
if six.PY3:
|
if six.PY3:
|
||||||
for line in open('tests/py3-ignores.txt'):
|
for line in open('tests/py3-ignores.txt'):
|
||||||
|
@ -4,146 +4,8 @@
|
|||||||
DjangoItem
|
DjangoItem
|
||||||
==========
|
==========
|
||||||
|
|
||||||
:class:`DjangoItem` is a class of item that gets its fields definition from a
|
DjangoItem has been moved into a separate project.
|
||||||
Django model, you simply create a :class:`DjangoItem` and specify what Django
|
|
||||||
model it relates to.
|
|
||||||
|
|
||||||
Besides getting the model fields defined on your item, :class:`DjangoItem`
|
It is hosted at:
|
||||||
provides a method to create and populate a Django model instance with the item
|
|
||||||
data.
|
|
||||||
|
|
||||||
Using DjangoItem
|
https://github.com/scrapy/scrapy-djangoitem
|
||||||
================
|
|
||||||
|
|
||||||
:class:`DjangoItem` works much like ModelForms in Django, you create a subclass
|
|
||||||
and define its ``django_model`` attribute to be a valid Django model. With this
|
|
||||||
you will get an item with a field for each Django model field.
|
|
||||||
|
|
||||||
In addition, you can define fields that aren't present in the model and even
|
|
||||||
override fields that are present in the model defining them in the item.
|
|
||||||
|
|
||||||
Let's see some examples:
|
|
||||||
|
|
||||||
Creating a Django model for the examples::
|
|
||||||
|
|
||||||
from django.db import models
|
|
||||||
|
|
||||||
class Person(models.Model):
|
|
||||||
name = models.CharField(max_length=255)
|
|
||||||
age = models.IntegerField()
|
|
||||||
|
|
||||||
Defining a basic :class:`DjangoItem`::
|
|
||||||
|
|
||||||
from scrapy.contrib.djangoitem import DjangoItem
|
|
||||||
|
|
||||||
class PersonItem(DjangoItem):
|
|
||||||
django_model = Person
|
|
||||||
|
|
||||||
:class:`DjangoItem` works just like :class:`~scrapy.item.Item`::
|
|
||||||
|
|
||||||
>>> p = PersonItem()
|
|
||||||
>>> p['name'] = 'John'
|
|
||||||
>>> p['age'] = '22'
|
|
||||||
|
|
||||||
To obtain the Django model from the item, we call the extra method
|
|
||||||
:meth:`~DjangoItem.save` of the :class:`DjangoItem`::
|
|
||||||
|
|
||||||
>>> person = p.save()
|
|
||||||
>>> person.name
|
|
||||||
'John'
|
|
||||||
>>> person.age
|
|
||||||
'22'
|
|
||||||
>>> person.id
|
|
||||||
1
|
|
||||||
|
|
||||||
The model is already saved when we call :meth:`~DjangoItem.save`, we
|
|
||||||
can prevent this by calling it with ``commit=False``. We can use
|
|
||||||
``commit=False`` in :meth:`~DjangoItem.save` method to obtain an unsaved model::
|
|
||||||
|
|
||||||
>>> person = p.save(commit=False)
|
|
||||||
>>> person.name
|
|
||||||
'John'
|
|
||||||
>>> person.age
|
|
||||||
'22'
|
|
||||||
>>> person.id
|
|
||||||
None
|
|
||||||
|
|
||||||
As said before, we can add other fields to the item::
|
|
||||||
|
|
||||||
import scrapy
|
|
||||||
from scrapy.contrib.djangoitem import DjangoItem
|
|
||||||
|
|
||||||
class PersonItem(DjangoItem):
|
|
||||||
django_model = Person
|
|
||||||
sex = scrapy.Field()
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
>>> p = PersonItem()
|
|
||||||
>>> p['name'] = 'John'
|
|
||||||
>>> p['age'] = '22'
|
|
||||||
>>> p['sex'] = 'M'
|
|
||||||
|
|
||||||
.. note:: fields added to the item won't be taken into account when doing a :meth:`~DjangoItem.save`
|
|
||||||
|
|
||||||
And we can override the fields of the model with your own::
|
|
||||||
|
|
||||||
class PersonItem(DjangoItem):
|
|
||||||
django_model = Person
|
|
||||||
name = scrapy.Field(default='No Name')
|
|
||||||
|
|
||||||
This is useful to provide properties to the field, like a default or any other
|
|
||||||
property that your project uses.
|
|
||||||
|
|
||||||
DjangoItem caveats
|
|
||||||
==================
|
|
||||||
|
|
||||||
DjangoItem is a rather convenient way to integrate Scrapy projects with Django
|
|
||||||
models, but bear in mind that Django ORM may not scale well if you scrape a lot
|
|
||||||
of items (i.e. millions) with Scrapy. This is because a relational backend is
|
|
||||||
often not a good choice for a write intensive application (such as a web
|
|
||||||
crawler), especially if the database is highly normalized and with many indices.
|
|
||||||
|
|
||||||
Django settings set up
|
|
||||||
======================
|
|
||||||
|
|
||||||
To use the Django models outside the Django application you need to set up the
|
|
||||||
``DJANGO_SETTINGS_MODULE`` environment variable and --in most cases-- modify
|
|
||||||
the ``PYTHONPATH`` environment variable to be able to import the settings
|
|
||||||
module.
|
|
||||||
|
|
||||||
There are many ways to do this depending on your use case and preferences.
|
|
||||||
One of the simplest ways to do it is detailed below.
|
|
||||||
|
|
||||||
Suppose your Django project is named ``mysite``, is located in the path
|
|
||||||
``/home/projects/mysite`` and you have created an app ``myapp`` with the model
|
|
||||||
``Person``. That means your directory structure is something like this::
|
|
||||||
|
|
||||||
/home/projects/mysite
|
|
||||||
├── manage.py
|
|
||||||
├── myapp
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── models.py
|
|
||||||
│ ├── tests.py
|
|
||||||
│ └── views.py
|
|
||||||
└── mysite
|
|
||||||
├── __init__.py
|
|
||||||
├── settings.py
|
|
||||||
├── urls.py
|
|
||||||
└── wsgi.py
|
|
||||||
|
|
||||||
Then you need to add ``/home/projects/mysite`` to the ``PYTHONPATH``
|
|
||||||
environment variable and set up the environment variable
|
|
||||||
``DJANGO_SETTINGS_MODULE`` to ``mysite.settings``. That can be done in your
|
|
||||||
Scrapy's settings file by adding the lines below::
|
|
||||||
|
|
||||||
import sys
|
|
||||||
sys.path.append('/home/projects/mysite')
|
|
||||||
|
|
||||||
import os
|
|
||||||
os.environ['DJANGO_SETTINGS_MODULE'] = 'mysite.settings'
|
|
||||||
|
|
||||||
Notice that we modify the ``sys.path`` variable instead of the ``PYTHONPATH``
|
|
||||||
environment variable as we are already within the python runtime. If everything
|
|
||||||
is right, you should be able to start the ``scrapy shell`` command and import
|
|
||||||
the model ``Person`` (i.e. ``from myapp.models import Person``).
|
|
||||||
|
@ -38,13 +38,6 @@ except ImportError:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
optional_features.add('boto')
|
optional_features.add('boto')
|
||||||
try:
|
|
||||||
import django
|
|
||||||
del django
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
optional_features.add('django')
|
|
||||||
|
|
||||||
from twisted import version as _txv
|
from twisted import version as _txv
|
||||||
twisted_version = (_txv.major, _txv.minor, _txv.micro)
|
twisted_version = (_txv.major, _txv.minor, _txv.micro)
|
||||||
|
@ -1,75 +0,0 @@
|
|||||||
from scrapy.item import Field, Item, ItemMeta
|
|
||||||
from scrapy import optional_features
|
|
||||||
if 'django' in optional_features:
|
|
||||||
from django.core.exceptions import ValidationError
|
|
||||||
|
|
||||||
|
|
||||||
class DjangoItemMeta(ItemMeta):
    """Metaclass that mirrors Django model fields onto the item class.

    When the class being created has a truthy ``django_model`` attribute,
    every entry of ``django_model._meta.fields`` that is not flagged
    ``auto_created`` is recorded by name in ``_model_fields`` and, unless the
    item already declares a field with that name, registered in ``fields``
    as a plain ``Field()``.
    """

    def __new__(mcs, class_name, bases, attrs):
        cls = super(DjangoItemMeta, mcs).__new__(mcs, class_name, bases, attrs)
        # Rebind ``fields`` to a copy before adding to it (presumably so the
        # additions below cannot touch a mapping shared with another class).
        cls.fields = cls.fields.copy()

        if not cls.django_model:
            # No model bound to this class: nothing to mirror.
            return cls

        cls._model_fields = []
        cls._model_meta = cls.django_model._meta
        for model_field in cls._model_meta.fields:
            if model_field.auto_created:
                continue
            field_name = model_field.name
            # Fields declared explicitly on the item take precedence.
            if field_name not in cls.fields:
                cls.fields[field_name] = Field()
            cls._model_fields.append(field_name)
        return cls
|
|
||||||
|
|
||||||
|
|
||||||
class DjangoItem(Item):
    """Item whose fields come from the Django model set in ``django_model``.

    The model instance is built lazily from the item's values (see
    ``instance``) and the validation result is computed once and then cached
    (see ``_get_errors``).
    """

    __metaclass__ = DjangoItemMeta

    django_model = None

    def __init__(self, *args, **kwargs):
        super(DjangoItem, self).__init__(*args, **kwargs)
        self._instance = None
        self._errors = None

    def save(self, commit=True):
        """Return the model instance, calling its ``save()`` first if *commit*."""
        model = self.instance
        if commit:
            model.save()
        return model

    def is_valid(self, exclude=None):
        """Return ``True`` when validation produced no errors."""
        return not self._get_errors(exclude)

    def _get_errors(self, exclude=None):
        """Return the validation errors dict, computing it on first use.

        The result is cached: once computed, later calls return the stored
        dict and *exclude* is no longer consulted.
        """
        if self._errors is None:
            self._errors = {}
            excluded = [] if exclude is None else exclude

            # Run field-level validation first, then model-level validation,
            # merging whatever each one reports into the same dict.
            validation_steps = (
                ('clean_fields', {'exclude': excluded}),
                ('clean', {}),
            )
            for method_name, call_kwargs in validation_steps:
                try:
                    getattr(self.instance, method_name)(**call_kwargs)
                except ValidationError as exc:
                    self._errors = exc.update_error_dict(self._errors)

            # uniqueness is not checked, because it is faster to check it when
            # saving object to database. Just beware, that failed save()
            # raises IntegrityError instead of ValidationError.

        return self._errors

    errors = property(_get_errors)

    @property
    def instance(self):
        """Model instance built from the item values that map to model fields."""
        if self._instance is None:
            model_kwargs = {}
            for key in self._values:
                if key in self._model_fields:
                    model_kwargs[key] = self.get(key)
            self._instance = self.django_model(**model_kwargs)
        return self._instance
|
|
@ -9,7 +9,6 @@ tests/test_contrib_linkextractors.py
|
|||||||
tests/test_contrib_loader.py
|
tests/test_contrib_loader.py
|
||||||
tests/test_crawl.py
|
tests/test_crawl.py
|
||||||
tests/test_crawler.py
|
tests/test_crawler.py
|
||||||
tests/test_djangoitem/__init__.py
|
|
||||||
tests/test_downloader_handlers.py
|
tests/test_downloader_handlers.py
|
||||||
tests/test_downloadermiddleware_ajaxcrawlable.py
|
tests/test_downloadermiddleware_ajaxcrawlable.py
|
||||||
tests/test_downloadermiddleware_cookies.py
|
tests/test_downloadermiddleware_cookies.py
|
||||||
|
@ -1,103 +0,0 @@
|
|||||||
import os
|
|
||||||
from twisted.trial import unittest
|
|
||||||
|
|
||||||
from scrapy.contrib.djangoitem import DjangoItem, Field
|
|
||||||
from scrapy import optional_features
|
|
||||||
|
|
||||||
os.environ['DJANGO_SETTINGS_MODULE'] = 'tests.test_djangoitem.settings'
|
|
||||||
|
|
||||||
if 'django' in optional_features:
    # The models module imports Django, so the models and the items bound to
    # them are only defined when Django is available.
    from .models import Person, IdentifiedPerson

    class BasePersonItem(DjangoItem):
        # Item bound to Person with no extra declarations.
        django_model = Person

    class NewFieldPersonItem(BasePersonItem):
        # Adds a field that does not exist on the model.
        other = Field()

    class OverrideFieldPersonItem(BasePersonItem):
        # Redeclares a field that the model already provides.
        age = Field()

    class IdentifiedPersonItem(DjangoItem):
        # Item bound to the model that declares its own primary key.
        django_model = IdentifiedPerson
|
|
||||||
|
|
||||||
|
|
||||||
class DjangoItemTest(unittest.TestCase):
    """Checks DjangoItem field discovery, saving and validation."""

    def assertSortedEqual(self, first, second, msg=None):
        """Assert that both iterables are equal once sorted."""
        return self.assertEqual(sorted(first), sorted(second), msg)

    def setUp(self):
        # Every test below needs Django; skip them all when it is missing.
        if 'django' not in optional_features:
            raise unittest.SkipTest("Django is not available")

    def test_base(self):
        item = BasePersonItem()
        self.assertSortedEqual(item.fields.keys(), ['age', 'name'])

    def test_new_fields(self):
        item = NewFieldPersonItem()
        self.assertSortedEqual(item.fields.keys(), ['age', 'other', 'name'])

    def test_override_field(self):
        item = OverrideFieldPersonItem()
        self.assertSortedEqual(item.fields.keys(), ['age', 'name'])

    def test_custom_primary_key_field(self):
        """
        Test that if a custom primary key exists, it is
        in the field list.
        """
        item = IdentifiedPersonItem()
        self.assertSortedEqual(item.fields.keys(),
                               ['age', 'identifier', 'name'])

    def test_save(self):
        item = BasePersonItem()
        self.assertSortedEqual(item.fields.keys(), ['age', 'name'])

        item['name'] = 'John'
        item['age'] = '22'
        person = item.save(commit=False)

        self.assertEqual(person.name, 'John')
        self.assertEqual(person.age, '22')

    def test_override_save(self):
        item = OverrideFieldPersonItem()

        item['name'] = 'John'
        # it is not obvious that "age" should be saved also, since it was
        # redefined in child class
        item['age'] = '22'
        person = item.save(commit=False)

        self.assertEqual(person.name, 'John')
        self.assertEqual(person.age, '22')

    def test_validation(self):
        long_name = 'z' * 300
        item = BasePersonItem(name=long_name)
        self.assertFalse(item.is_valid())
        self.assertEqual(set(item.errors), {'age', 'name'})

        item = BasePersonItem(name='John')
        self.assertTrue(item.is_valid(exclude=['age']))
        self.assertEqual({}, item.errors)

        # once the item is validated, it does not validate again
        item['name'] = long_name
        self.assertTrue(item.is_valid())

    def test_override_validation(self):
        item = OverrideFieldPersonItem()
        item['name'] = 'John'
        self.assertFalse(item.is_valid())

        item = OverrideFieldPersonItem()
        item['name'] = 'John'
        item['age'] = '22'
        self.assertTrue(item.is_valid())

    def test_default_field_values(self):
        item = BasePersonItem()
        person = item.save(commit=False)
        self.assertEqual(person.name, 'Robot')
|
|
@ -1,17 +0,0 @@
|
|||||||
from django.db import models
|
|
||||||
|
|
||||||
|
|
||||||
class Person(models.Model):
    """Test model: a name that defaults to 'Robot' and an integer age."""

    name = models.CharField(default='Robot', max_length=255)
    age = models.IntegerField()

    class Meta:
        # Explicit label for the app this test model belongs to.
        app_label = 'test_djangoitem'
|
|
||||||
|
|
||||||
class IdentifiedPerson(models.Model):
    """Test model that declares its own primary key field, ``identifier``."""

    identifier = models.PositiveIntegerField(primary_key=True)
    name = models.CharField(max_length=255)
    age = models.IntegerField()

    class Meta:
        # Explicit label for the app this test model belongs to.
        app_label = 'test_djangoitem'
|
|
@ -1,8 +0,0 @@
|
|||||||
# Django settings for the DjangoItem tests: a single SQLite database named
# ':memory:' and a fixed secret key.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': ':memory:',
    },
}

SECRET_KEY = 'top-secret'
|
|
2
tox.ini
2
tox.ini
@ -12,7 +12,6 @@ deps =
|
|||||||
# Extras
|
# Extras
|
||||||
boto
|
boto
|
||||||
Pillow
|
Pillow
|
||||||
django
|
|
||||||
leveldb
|
leveldb
|
||||||
-rtests/requirements.txt
|
-rtests/requirements.txt
|
||||||
commands =
|
commands =
|
||||||
@ -26,7 +25,6 @@ deps =
|
|||||||
Twisted==11.1.0
|
Twisted==11.1.0
|
||||||
boto==2.2.2
|
boto==2.2.2
|
||||||
Pillow<2.0
|
Pillow<2.0
|
||||||
django==1.3.1
|
|
||||||
cssselect==0.9.1
|
cssselect==0.9.1
|
||||||
zope.interface==3.6.1
|
zope.interface==3.6.1
|
||||||
-rtests/requirements.txt
|
-rtests/requirements.txt
|
||||||
|
Loading…
x
Reference in New Issue
Block a user