Skip to content

Tutorial results in obscure errors. #40

@Granitosaurus

Description

@Granitosaurus

I've been following the webstruct tutorial and I'm getting a few peculiar errors.
From the tutorial I end up with code along the lines of this:

from itertools import islice
import pkg_resources
import webstruct


def token_identity(html_token):
    """Feature function: the raw token text itself."""
    return dict(token=html_token.token)


def token_isupper(html_token):
    """Feature function: whether the token text is entirely upper-case."""
    text = html_token.token
    return dict(isupper=text.isupper())


def parent_tag(html_token):
    """Feature function: tag name of the token's parent HTML element."""
    parent = html_token.parent
    return dict(parent_tag=parent.tag)


def border_at_left(html_token):
    """Feature function: True when the token is the first one in its element."""
    is_first = (html_token.index == 0)
    return dict(border_at_left=is_first)


DATA_DIR = pkg_resources.resource_filename('project', 'data/business_annotated')


def get_training():
    """Load the annotated training trees (temporarily capped at 10)."""
    pattern = "{}/*.html".format(DATA_DIR)
    all_trees = webstruct.load_trees(pattern, webstruct.WebAnnotatorLoader())
    # islice(it, 10) is the same as islice(it, 0, 10)
    return islice(all_trees, 10)  # todo: remove the cap


def tokenize_training(trees):
    """Turn annotated trees into (tokens, labels) sequences for training."""
    # HtmlTokenizer.tokenize already returns the (tokens, labels) pair.
    return webstruct.HtmlTokenizer().tokenize(trees)


def main():
    """Train a wapiti-based NER pipeline and run it against a live page."""
    print('creating model...')
    pipeline = webstruct.create_wapiti_pipeline(
        'company.wapiti',
        token_features=[token_identity, token_isupper, parent_tag, border_at_left],
        train_args='--algo l-bfgs --maxiter 50 --compact',
    )
    print('getting training data...')
    X, y = tokenize_training(get_training())
    print('fitting training data...')
    pipeline.fit(X, y)
    print('starting extract...')
    extractor = webstruct.NER(pipeline)
    print(extractor.extract_from_url('http://scrapinghub.com/contact'))


if __name__ == '__main__':
    main()

The first error I get is a TypeError when trying to extract something with ner:

Traceback (most recent call last):
  File "/home/dex/projects/project/project/spiders/test.py", line 54, in <module>
    main()
  File "/home/dex/projects/project/project/spiders/test.py", line 51, in main
    print(ner.extract_from_url('http://scrapinghub.com/contact'))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 58, in extract_from_url
    return self.extract(data)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 46, in extract
    groups = IobEncoder.group(zip(html_tokens, tags))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/sequence_encoding.py", line 128, in group
    return list(cls.iter_group(data, strict))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/sequence_encoding.py", line 136, in iter_group
    if iob_tag.startswith('I-') and tag != iob_tag[2:]:
TypeError: startswith first arg must be bytes or a tuple of bytes, not str

It seems like a Python 3 support issue, as it expects bytes but gets a string?

The second error happens when trying to build a NER straight from the model without fitting it first:

def main():
    # Variant that reproduces the second error: the training steps below are
    # commented out, so NER wraps a pipeline that was never fitted.
    print('creating model...')
    model = webstruct.create_wapiti_pipeline(
        'company.wapiti',
        token_features=[token_identity, token_isupper, parent_tag, border_at_left],
        train_args='--algo l-bfgs --maxiter 50 --compact',
    )
    # print('getting training data...')
    # tokens, labels = tokenize_training(get_training())
    # print('fitting training data...')
    # model.fit(tokens, labels)
    # print('starting extract...')
    # Calling extract on the unfitted pipeline raises the TypeError shown below.
    ner = webstruct.NER(model)
    print(ner.extract_from_url('http://scrapinghub.com/contact'))

Results in:

Traceback (most recent call last):
  File "/home/dex/projects/project/project/spiders/test.py", line 53, in <module>
    main()
  File "/home/dex/projects/project/project/spiders/test.py", line 50, in main
    print(ner.extract_from_url('http://scrapinghub.com/contact'))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 58, in extract_from_url
    return self.extract(data)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 45, in extract
    html_tokens, tags = self.extract_raw(bytes_data)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 67, in extract_raw
    tags = self.model.predict([html_tokens])[0]
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/sklearn/utils/metaestimators.py", line 54, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/sklearn/pipeline.py", line 327, in predict
    return self.steps[-1][-1].predict(Xt)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 211, in predict
    sequences = self._to_wapiti_sequences(X)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 230, in _to_wapiti_sequences
    X = self.feature_encoder.transform(X)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 313, in transform
    return [self.transform_single(feature_dicts) for feature_dicts in X]
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 313, in <listcomp>
    return [self.transform_single(feature_dicts) for feature_dicts in X]
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 308, in transform_single
    line = ' '.join(_tostr(dct.get(key)) for key in self.feature_names_)
TypeError: 'NoneType' object is not iterable

The errors seem to be very vague and I don't even know where to start debugging this. Am I missing something?

I'm running:
webstruct - 0.5
scikit-learn - 0.18.2
scipy - 0.19
libwapiti - 0.2.1

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions