Skip to content

to_webannotator may fail if an attribute value of some HTML element contains a control character #17

@kmike

Description

@kmike

Traceback (raised when calling NER.annotate() on the page https://github.com/scrapinghub/webstruct/blob/master/webstruct_data/corpus/business_pages/source/301.html):

ValueError                                Traceback (most recent call last)
<ipython-input-8-45ad24ffcda1> in <module>()
      9     try:
     10         with open(fn, 'rb') as f:
---> 11             annotated = ner.annotate(f.read())
     12 
     13         path, filename = os.path.split(fn)

/Users/kmike/svn/webstruct/webstruct/model.pyc in annotate(self, bytes_data, pretty_print)
    105         html_tokens, tags = self.extract_raw(bytes_data)
    106         tree = self.html_tokenizer.detokenize_single(html_tokens, tags)
--> 107         tree = to_webannotator(tree, self.entity_colors)
    108         return tostring(tree, pretty_print=pretty_print)
    109 

/Users/kmike/svn/webstruct/webstruct/webannotator.py in to_webannotator(tree, entity_colors)
    258     """
    259     handler = _WaContentHandler(entity_colors)
--> 260     lxml.sax.saxify(tree, handler)
    261     tree = handler.out.etree
    262     _copy_title(tree)

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in saxify(element_or_tree, content_handler)
    245     them against a SAX ContentHandler.
    246     """
--> 247     return ElementTreeProducer(element_or_tree, content_handler).saxify()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in saxify(self)
    178                 self._recursive_saxify(sibling, {})
    179 
--> 180         self._recursive_saxify(element, {})
    181 
    182         if hasattr(element, 'getnext'):

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    220             content_handler.startPrefixMapping(prefix, uri)
    221         content_handler.startElementNS((ns_uri, local_name),
--> 222                                        qname, sax_attributes)
    223         if element.text:
    224             content_handler.characters(element.text)

/Users/kmike/svn/webstruct/webstruct/webannotator.py in startElementNS(self, name, qname, attributes)
    122         self._closeSpan()
    123         # print('start %s' % qname)
--> 124         self.out.startElementNS(name, qname, attributes)
    125         self._openSpan()
    126 

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in startElementNS(self, ns_name, qname, attributes)
    110         else:
    111             element = SubElement(element_stack[-1], el_name,
--> 112                                  attrs, self._new_mappings)
    113         element_stack.append(element)
    114 

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree.SubElement (src/lxml/lxml.etree.c:67070)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._makeSubElement (src/lxml/lxml.etree.c:15492)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._makeSubElement (src/lxml/lxml.etree.c:15423)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._initNodeAttributes (src/lxml/lxml.etree.c:16529)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._addAttributeToNode (src/lxml/lxml.etree.c:16701)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._utf8 (src/lxml/lxml.etree.c:26485)()

ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions