mirror of https://github.com/scrapy/scrapy.git
synced 2025-02-25 12:03:58 +00:00

removed nltk dependency from IBL code

parent 8815de94ff
commit baa523055f
@@ -8,9 +8,5 @@ Based Learning (IBL) algorithm, as described in the following papers:
 Extracting web data using instance based learning
 http://portal.acm.org/citation.cfm?id=1265174
 
-This code has some additional dependencies too:
-
-* numpy
-* nltk
-
+This code requires the numpy library.
 """
@@ -3,13 +3,13 @@ Region Extract
 
 Custom extraction for regions in a document
 """
+import re
 import operator
 import copy
 import pprint
 import cStringIO
 from itertools import groupby
 
-import nltk
 from numpy import array
 
 from scrapy.contrib.ibl.descriptor import FieldDescriptor
@@ -572,7 +572,8 @@ class TemplatePageExtractor(object):
     def __str__(self):
         return str(self.extractor)
 
-_tokenize = nltk.tokenize.WordPunctTokenizer().tokenize
+# Based on nltk's WordPunctTokenizer
+_tokenize = re.compile(r'\w+|[^\w\s]+', re.UNICODE | re.MULTILINE | re.DOTALL).findall
 
 class TextRegionDataExtractor(object):
     """Data Extractor for extracting text fragments from within a larger
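Note: nltk's WordPunctTokenizer is itself a RegexpTokenizer built on the same
pattern, \w+|[^\w\s]+, so the compiled regex above is a drop-in replacement
that does not change tokenization. A quick sanity check (not part of the
commit):

    import re

    # same pattern as the new _tokenize in the Region Extract module above
    _tokenize = re.compile(r'\w+|[^\w\s]+', re.UNICODE | re.MULTILINE | re.DOTALL).findall

    print(_tokenize(u"Price: $29.99 (incl. tax)"))
    # -> ['Price', ':', '$', '29', '.', '99', '(', 'incl', '.', 'tax', ')']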
@@ -11,17 +11,12 @@ from scrapy.contrib.ibl.descriptor import (FieldDescriptor as A,
 from scrapy.contrib.ibl.extractors import (contains_any_numbers,
     image_url)
 
-try:
-    import nltk
-except ImportError:
-    nltk = None
-
 try:
     import numpy
 except ImportError:
     numpy = None
 
-if nltk and numpy:
+if numpy:
     from scrapy.contrib.ibl.extraction import InstanceBasedLearningExtractor
 
 # simple page with all features
@@ -791,8 +786,6 @@ TEST_DATA = [
 class TestExtraction(TestCase):
 
     def setUp(self):
-        if not nltk:
-            raise SkipTest("nltk not available")
         if not numpy:
             raise SkipTest("numpy not available")
 
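Note: the try/except import guard plus a setUp-time SkipTest is the standard
pattern for letting a test suite degrade gracefully when an optional
dependency is missing. A minimal self-contained sketch (the test class and
method names here are illustrative, not part of the commit):

    from unittest import TestCase, SkipTest

    try:
        import numpy
    except ImportError:
        numpy = None  # guarded tests below get skipped instead of erroring

    class NumpyDependentTest(TestCase):

        def setUp(self):
            if not numpy:
                raise SkipTest("numpy not available")

        def test_array_roundtrip(self):
            # only runs when numpy is importable
            self.assertEqual(list(numpy.array([1, 2, 3])), [1, 2, 3])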
@@ -12,17 +12,12 @@ from scrapy.utils.py26 import json
 from scrapy.contrib.ibl.htmlpage import HtmlPage
 from scrapy.tests.test_contrib_ibl import path
 
-try:
-    import nltk
-except ImportError:
-    nltk = None
-
 try:
     import numpy
 except ImportError:
     numpy = None
 
-if nltk and numpy:
+if numpy:
     from scrapy.contrib.ibl.extraction.pageparsing import (
         InstanceLearningParser, TemplatePageParser, ExtractionPageParser)
     from scrapy.contrib.ibl.extraction.pageobjects import TokenDict, TokenType
@@ -174,8 +169,6 @@ def _tags(pp, predicate):
 class TestPageParsing(TestCase):
 
     def setUp(self):
-        if not nltk:
-            raise SkipTest("nltk not available")
         if not numpy:
             raise SkipTest("numpy not available")
 