1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-25 12:03:58 +00:00

removed nltk dependency from IBL code

This commit is contained in:
Pablo Hoffman 2010-06-22 13:38:32 -03:00
parent 8815de94ff
commit baa523055f
4 changed files with 6 additions and 23 deletions

View File

@ -8,9 +8,5 @@ Based Learning (IBL) algorithm, as described in the following papers:
Extracting web data using instance based learning
http://portal.acm.org/citation.cfm?id=1265174
This code has some additional dependencies too:
* numpy
* nltk
This code requires the numpy library.
"""

View File

@ -3,13 +3,13 @@ Region Extract
Custom extraction for regions in a document
"""
import re
import operator
import copy
import pprint
import cStringIO
from itertools import groupby
import nltk
from numpy import array
from scrapy.contrib.ibl.descriptor import FieldDescriptor
@ -572,7 +572,8 @@ class TemplatePageExtractor(object):
def __str__(self):
return str(self.extractor)
_tokenize = nltk.tokenize.WordPunctTokenizer().tokenize
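# Regex replacement based on nltk's WordPunctTokenizer: each token is either a
# run of word characters (\w+) or a run of non-word, non-whitespace characters
# such as punctuation ([^\w\s]+); whitespace is discarded.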
_tokenize = re.compile(r'\w+|[^\w\s]+', re.UNICODE | re.MULTILINE | re.DOTALL).findall
class TextRegionDataExtractor(object):
"""Data Extractor for extracting text fragments from within a larger

View File

@ -11,17 +11,12 @@ from scrapy.contrib.ibl.descriptor import (FieldDescriptor as A,
from scrapy.contrib.ibl.extractors import (contains_any_numbers,
image_url)
try:
import nltk
except ImportError:
nltk = None
try:
import numpy
except ImportError:
numpy = None
if nltk and numpy:
if numpy:
from scrapy.contrib.ibl.extraction import InstanceBasedLearningExtractor
# simple page with all features
@ -791,8 +786,6 @@ TEST_DATA = [
class TestExtraction(TestCase):
def setUp(self):
if not nltk:
raise SkipTest("nltk not available")
if not numpy:
raise SkipTest("numpy not available")

View File

@ -12,17 +12,12 @@ from scrapy.utils.py26 import json
from scrapy.contrib.ibl.htmlpage import HtmlPage
from scrapy.tests.test_contrib_ibl import path
try:
import nltk
except ImportError:
nltk = None
try:
import numpy
except ImportError:
numpy = None
if nltk and numpy:
if numpy:
from scrapy.contrib.ibl.extraction.pageparsing import (
InstanceLearningParser, TemplatePageParser, ExtractionPageParser)
from scrapy.contrib.ibl.extraction.pageobjects import TokenDict, TokenType
@ -174,8 +169,6 @@ def _tags(pp, predicate):
class TestPageParsing(TestCase):
def setUp(self):
if not nltk:
raise SkipTest("nltk not available")
if not numpy:
raise SkipTest("numpy not available")