From baa523055f77087a2add4a8725f2ac4553e72bae Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Tue, 22 Jun 2010 13:38:32 -0300 Subject: [PATCH] removed nltk dependency from IBL code --- scrapy/contrib/ibl/__init__.py | 6 +----- scrapy/contrib/ibl/extraction/regionextract.py | 5 +++-- scrapy/tests/test_contrib_ibl/test_extraction.py | 9 +-------- scrapy/tests/test_contrib_ibl/test_pageparsing.py | 9 +-------- 4 files changed, 6 insertions(+), 23 deletions(-) diff --git a/scrapy/contrib/ibl/__init__.py b/scrapy/contrib/ibl/__init__.py index 52c6e08b2..fbb1c895e 100644 --- a/scrapy/contrib/ibl/__init__.py +++ b/scrapy/contrib/ibl/__init__.py @@ -8,9 +8,5 @@ Based Learning (IBL) algorithm, as described in the following papers: Extracting web data using instance based learning http://portal.acm.org/citation.cfm?id=1265174 -This code has some additional dependencies too: - -* numpy -* nltk - +This code requires the numpy library. """ diff --git a/scrapy/contrib/ibl/extraction/regionextract.py b/scrapy/contrib/ibl/extraction/regionextract.py index 9b5aab166..dd4bc6256 100644 --- a/scrapy/contrib/ibl/extraction/regionextract.py +++ b/scrapy/contrib/ibl/extraction/regionextract.py @@ -3,13 +3,13 @@ Region Extract Custom extraction for regions in a document """ +import re import operator import copy import pprint import cStringIO from itertools import groupby -import nltk from numpy import array from scrapy.contrib.ibl.descriptor import FieldDescriptor @@ -572,7 +572,8 @@ class TemplatePageExtractor(object): def __str__(self): return str(self.extractor) -_tokenize = nltk.tokenize.WordPunctTokenizer().tokenize +# Based on nltk's WordPunctTokenizer +_tokenize = re.compile(r'\w+|[^\w\s]+', re.UNICODE | re.MULTILINE | re.DOTALL).findall class TextRegionDataExtractor(object): """Data Extractor for extracting text fragments from within a larger diff --git a/scrapy/tests/test_contrib_ibl/test_extraction.py b/scrapy/tests/test_contrib_ibl/test_extraction.py index 65d48cf55..d4acbf5d5 100644 --- a/scrapy/tests/test_contrib_ibl/test_extraction.py +++ b/scrapy/tests/test_contrib_ibl/test_extraction.py @@ -11,17 +11,12 @@ from scrapy.contrib.ibl.descriptor import (FieldDescriptor as A, from scrapy.contrib.ibl.extractors import (contains_any_numbers, image_url) -try: - import nltk -except ImportError: - nltk = None - try: import numpy except ImportError: numpy = None -if nltk and numpy: +if numpy: from scrapy.contrib.ibl.extraction import InstanceBasedLearningExtractor # simple page with all features @@ -791,8 +786,6 @@ TEST_DATA = [ class TestExtraction(TestCase): def setUp(self): - if not nltk: - raise SkipTest("nltk not available") if not numpy: raise SkipTest("numpy not available") diff --git a/scrapy/tests/test_contrib_ibl/test_pageparsing.py b/scrapy/tests/test_contrib_ibl/test_pageparsing.py index 264e1644f..12a6d346e 100644 --- a/scrapy/tests/test_contrib_ibl/test_pageparsing.py +++ b/scrapy/tests/test_contrib_ibl/test_pageparsing.py @@ -12,17 +12,12 @@ from scrapy.utils.py26 import json from scrapy.contrib.ibl.htmlpage import HtmlPage from scrapy.tests.test_contrib_ibl import path -try: - import nltk -except ImportError: - nltk = None - try: import numpy except ImportError: numpy = None -if nltk and numpy: +if numpy: from scrapy.contrib.ibl.extraction.pageparsing import ( InstanceLearningParser, TemplatePageParser, ExtractionPageParser) from scrapy.contrib.ibl.extraction.pageobjects import TokenDict, TokenType @@ -174,8 +169,6 @@ def _tags(pp, predicate): class TestPageParsing(TestCase): def setUp(self): - if not nltk: - raise SkipTest("nltk not available") if not numpy: raise SkipTest("numpy not available")