-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdomextensions.py
executable file
·3390 lines (2893 loc) · 127 KB
/
domextensions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
#
# DomExtensions: A bunch of (hopefully) useful additions to the DOM API.
# 2010-01-10: Written by Steven J. DeRose.
#
#pylint: disable=W0613, W0212, E1101
#
import sys
import re
import codecs
from enum import Enum
from typing import List, IO, Callable, Any, Union, Iterable
from collections import namedtuple
import logging
import xml.dom
import xml.dom.minidom
from xml.dom.minidom import Node, NamedNodeMap, Element, Document
#from html.entities import codepoint2name, name2codepoint
from xmlstrings import XmlStrings
from domgetitem import NodeArgs
from domgetitem import __domgetitem__ as DEgetitem
# Module-level logger for this package. Note that its level is set to INFO
# right here, i.e. as a side effect of importing the module.
lg = logging.getLogger("DomExtensions.py")
lg.setLevel(logging.INFO)
def cmp(a, b) -> int:
    """Three-way comparison, like the Python 2 builtin of the same name.

    Returns 1 if a > b, -1 if a < b, and 0 otherwise. The operands only
    need to support the `>` and `<` operators.
    """
    isGreater = a > b
    isLess = a < b
    # bool is a subclass of int, so subtracting the two flags gives -1, 0, or 1.
    return isGreater - isLess
# Descriptive metadata for this module, kept as a plain dict of strings.
__metadata__ = {
"title" : "DomExtensions",
"description" : "A bunch of (hopefully) useful additions to the DOM API.",
"rightsHolder" : "Steven J. DeRose",
"creator" : "http://viaf.org/viaf/50334488",
"type" : "http://purl.org/dc/dcmitype/Software",
"language" : "Python 3.7",
"created" : "2010-01-10",
"modified" : "2021-07-08",
"publisher" : "http://github.com/sderose",
"license" : "https://creativecommons.org/licenses/by-sa/3.0/"
}
# The module version is simply the "modified" date from the metadata above.
# NOTE(review): the History section of the help text lists changes dated
# later than this "modified" value -- confirm whether it should be bumped.
__version__ = __metadata__['modified']
descr = """
=Description=
An XML-manipulation package that sits on top of a generic DOM implementation
and provides a lot of higher-level methods, and more Pythonic syntax.
Or you may want to just take __getitem__, which lets you navigate DOM
trees very much like nested Python dicts and lists (see below for
details): myRoot[4][2][3:"p"]...
This package also provides access along all of the XPath axes, so you can
easily get the n-th ancestor, descendant, preceding nodes (which are different
from preceding *sibling* nodes, also available), etc. Or iterate along an axis
in either direction.
It provides many "geometric" operations in the tree, such as
locating the left or right "branch", determining the sequence of nodeNames down to
a node, determining the child number of a node among its siblings (counting
text nodes or not),
comparing node order, containment, and some support for ranges (which also
can be in semi-precedence and other relationships). And it can map back and
forth a node plus optional local offset, to and from global text offsets.
It provides "bulk" operations such as removing all white-space-only nodes,
removing or renaming all instances of attributes or elements, and much more.
It also provides operations inspired by other work. It can generate and
interpret XPointers; do what I see as the more useful BS4 "find" operations
and a variety of CSS selectors;
generate equivalent SAX event sequences from any subtree; etc.
This is intended to achieve a few basic things:
* provide more Pythonic bindings of DOM functionality
* save people from repeatedly implementing commonly-needed XML operations
* let people use operations they may be familiar with when coming to XML and DOM
from a variety of related milieux.
Comments are welcome, as are bug reports.
There is also a Perl version of this, though sometimes it falls behind this one.
==Usage==
You can use ''patchDom''() to monkey-patch these routines into a class
(default: ''xml.dom.minidom.Node''):
from domextensions import DomExtensions
import xml.dom
DomExtensions.patchDom(toPatch=xml.dom.minidom.Node)
or to just enable support for using Python's
subscript brackets: `myNode[...]`, do:
DomExtensions.enableBrackets(toPatch=xml.dom.minidom.Node)
See [##Using \\[\\] notation], below, for the details. In short, you can say things
like myNode[0], myNode["p":3:5], myNode["@id"], and so on.
In either case, just use the methods as if they had been on Node and its many
subclasses already:
dd = xml.dom.minidom.Document()
===Main features===
See L[https://doi.org/10.4242/BalisageVol13.DeRose02] for discussion of the
approach used here.
* Use of Python `[]` notation to get child nodes or attributes of a node.
* Methods for searching along the XPath axes, such as ancestors, children,
descendants, siblings, predecessors, etc.
* Generating and interpreting XPointers
* more flexible local restructuring such as wrapping, unwrapping, splitting,
and combining nodes.
* node iterators so you don't have to recurse and filter
* SAX event generator
DomExtensions also provides support for:
* attributes whose values inherit (similar to xml:lang) or accumulate
* quick testing of containment and order relationships
* creation of formatted XML for tags, subtrees, etc., including indentation
* escaping for content, attributes, cdata, pis, and comments
* managing whitespace-only text nodes and Unicode whitespace normalization
* finding left and right extremes of a subtree, lowest common ancestor, etc.
* change or case-fold element type names.
==Using [] notation==
The most obvious (though optional) feature that DomExtensions adds to
Node and its subclasses, is support for using Python's
subscript brackets: `myNode[...]`.
To enable ''just'' this feature, do:
DomExtensions.enableBrackets(toPatch=xml.dom.minidom.Node)
For example, instead of
myNode.childNodes[3].childNodes[27].childNodes[0]
you can just write:
myNode[3][27][0]
You can also do normal Python slices such as `myNode[0:5]` or `myNode[0:-1]`,
and even use Python's third argument to skip through the range several
steps at a time, like `myNode[0:5:2]`.
Warning to Xpath users: This package is being Pythonic, so child numbers count
from 0, like the rest of Python (and Javascript), and the second index is
the index of the first child ''past'' what you want (so x[1:2] gets you a sublist
that contains just one item, the second one).
Warning to Python users: This really does include ''all'' the children, not
just elements but also types that can only be leaves: text, comment, pi, and
cdata (text is by far the most common). Keep reading if you
want to count among just child nodes that are elements.
With DomExtensions you also have the option to specify an element type name,
to get all child elements of that type:
myNode["p"]
and you can test whether there is a child of a given kind with the usual Python
`in` construct:
if ("p" in myNode)...
Several values have special meanings:
* "*" means any element child, but no text, comment, pi, or cdata nodes.
* "#text", "#comment", "#cdata", and "#pi" mean just children of that node type.
* a name beginning with `@`, such as "@id", gets the attribute
if ("*" in myNode)...
if ("#text" in myNode)...
To avoid ambiguity, the values including element type name as well as these
special names, are here called "kinds" of nodes.
As usual in Python, this only checks direct children; if a child
happens to have children of its own, `in` doesn't know anything about them.
Remember that in XML, the element type name is not a key, like in a Python `dict`,
but a type name. There can be any number of `p` elements in section, for example.
So a kind in brackets leads to a list coming back (which might have only one
item, or might have many).
===Fancier cases===
The author recommends that you get used to the simpler cases first.
You can combine integer indexes and range, with element type constraints
(except for attributes -- you can't say ['@id':0:-1]).
If you want the fourth paragraph child element, use:
myNode['p':3]
This is not quite the same as doing the opposite order! `myNode[3:'p']` instead
finds the fourth child (of whatever type), and returns it if it is a `p`.
In other words, the indexes are interpreted in order from left to right. This
is familiar to XPath users, but there isn't really a comparable case with
Python lists.
If the fourth child is not a `p` element, `IndexError` is raised, just as for
any other "not found" case. Unfortunately, "in" has no way to test more than
the simplest condition, just as in Python you can't ask `[1:2] in myList`.
It may be more Pythonic the following way, which works fine:
try:
myStuff = myNode['p':3]
except IndexError:
myStuff = []
The same two cases can be used with a range instead of a singleton index:
myNode["#text", 3:-1]
gets all text nodes from the fourth one to the last, while
myNode[0:20:"*"]
gets all elements (but not other kinds of nodes) which are among the first 20 children of `myNode`.
==Choosing nodes along XPath axes==
A second major feature of DomExtensions, is called "Axis Selection".
It provides support for
walking all the XPath 'axes' in a DOM tree, such as
ancestors, children, descendants, siblings, and so on. Each axis is available
via a method called 'select' plus the axis name.
Because it's often useful to consider only element nodes,
or only text nodes, or only element nodes of a certain element type, or
with a certain attribute or attribute-value pair, optional arguments can be
used to filter in those ways:
The 'select...' methods return node [n] (counting from 0)
of a given type and/or attributes along a given axis.
For example:
myNode.selectChild(3, nodeSel='li', attrs={'class':'major'})
This will return the fourth child of `myNode` which:
* is of the given element type name ('li'), and
* has the attribute ''class'' with value ''major''.
All the arguments are optional. If none are provided, the first item
along the axis (that is, #0) is returned.
Constraints:
* ''n'' counts from 0 and defaults to 0.
Most axes support negative indexes; but not yet all of them.
* ''nodeSel'' supports "*", "#text", "#comment", "#cdata", and "#pi", just like
the bracket notation described earlier. It defaults to "*".
The `attrs` argument cannot be used except with an element type name or "*".
qnames (names with a namespace) are not yet supported for ''nodeSel''.
* `attrs`, if provided, must be None or a dict of attribute name:value pairs,
which all must match for an element to be selected.
The kind of matching depends on the type of value passed in the dict:
* int or float: match numerically. Attribute values that are not
strings of Latin decimal digits will not match,
but will not raise exceptions either. Hexadecimal and other forms should
be added. [TODO]
* a compiled regex: the regex is matched against the attribute value.
* a string: matched literally. [TODO support case-ignoring]
The `attribute`, and `namespace`
axes are not yet supported for ''select...''.
This is far less power than XPath provides, but it facilitates many programming tasks,
such as scanning for all elements of a given type (which I find very common).
==Node-selection methods==
===Tree information and navigation methods===
* '''getInheritedAttribute'''(node, name)
Return the value of attribute ''name'', from the first node along the
''ancestor-or-self'' axis of ''node'' that specifies it.
Thus, the attribute's value is inherited down the tree, similar to ''xml:lang''.
* '''getCompoundAttribute'''(self, name, sep='#', keepMissing=False)
Return the concatenation of the values of the named attribute,
from all ancestors (and self). Separate them with 'sep'.
This facilitates a notion of compound keys (or hierarchical IDs).
* '''leftBranch'''(node)
Return the leftmost descendant of ''node''.
If ''node'''s first child has no children of its own, then it is
also ''node'''s first descendant; otherwise, it will not be.
* '''rightBranch'''(node)
Return the rightmost (last) descendant of ''node''.
* '''getNChildNodes'''(node)
Return the number of (direct) child nodes. This is preferable to asking
for `len(node.childNodes)`, because some implementations may not keep
all childNodes as a physical array underneath, and would have to collect it,
only to then measure it and discard the collection again (hopefully they'll
cache it, but still....).
* '''getDepth'''(node)
Return how deeply nested ''node'' is (the document element is ''1'').
* '''isWithin'''(''node'', ''type'')
Return 1 if ''node'' is, or is within, an element of the given ''type'',
otherwise return 0.
* '''getContentType(node)'''
Returns whether the node has element children, text children, both ("mixed"),
or nothing ("empty"). This is currently not graceful about other possibilities,
such as an element that contains only PIs, COMMENTs, etc. ("odd").
The values returned are drawn from:
CONTENT_EMPTY does not make any distinction between (for example) <b></b> and <b/>.
**CONTENT_EMPTY = 0
**CONTENT_TEXT = 1
**CONTENT_ELEMENT = 2
**CONTENT_MIXED = 3
**CONTENT_ODD = -1
* '''getFQGI'''(node)
Return the list of element types of ''node'''s
ancestors, from the root down, separated by '/'.
For example, "html/body/div/ul/li/p/b".
'''Note''': An FQGI does ''not'' identify a specific element instance or location
in a document; for that, see ''getXPointer''().
* '''compareDocumentPosition'''(n1, n2)
Compare two nodes for document order, returning -1, 0, or 1.
* '''getXPointer'''(node, textOffset=None)
Return the XPointer child sequence that leads to ''node''.
That is, the list of child-numbers for all the ancestors of the node, from
the root down, separated by '/'. For example, "1/1/5/2/1".
This is a fine unique name for the node's location in the document.
If `textOffset` is given, the XPointer will point to that character of the
concatenated text content of `node` (which will typically be down in some
descendant node).
This does not yet handle ranges, but will when I get around to it. In the
meantime, you can fetch XPointers for the two ends and combine them in
the caller.
* '''compareXPointer'''(x1, x2)
Compare two XPointer child-sequences (see ''getXPointer'')
for relative document order, returning -1, 0, or 1.
This does not require actually looking at a document, so no document or
node is passed.
* '''interpretXPointer'''(document, x)
Interpret the XPointer child sequence in the string
''x'', in the context of the given ''document'',
and return the node it identifies (or None if there is no such node).
* '''getEscapedAttributeList'''(node, sortAttributes=False, quoteChar='"')
Return the entire attribute list for
''node'', as needed to go within a XML start-tag. The attributes are
quoted using `quoteChar`, and any <, &, or `quoteChar` in them
are replaced by the appropriate XML predefined character reference.
If `sortAttributes` is set, the attributes will appear in alphabetical order;
otherwise, the order is unspecified.
==Large-scale tree operations==
* '''removeWhiteSpaceNodes'''(node)
Delete all white-space-only text nodes that are descendants of ''node''.
* '''normalizeAllSpace'''(node)
Do the equivalent of XSLT normalize-space()
on all text nodes in the subtree headed at ''node''.
'''Note''': This is ''not'' the same as the XML::DOM::normalize() method, which
instead combines adjacent text nodes.
* '''normalize'''(node)
Coalesce adjacent text nodes, and delete empty ones.
* '''insertPrecedingSibling'''(''node, newNode'')
* '''insertFollowingSibling'''(''node, newNode'')
* '''insertParent'''(''node, type'')
* '''mergeWithFollowingSibling'''(''node'')
The following sibling of ''node'' is deleted, but its text content
is appended to ''node''.
This drops any sub-structure of the current and sibling.
New. See also the `diffCorefs` command.
* '''mergeWithPrecedingSibling'''(node)
* '''groupSiblings'''(node1, node2, typeForNewParent)
Group all the nodes from ''node1'' through ''node2'' together, under a
new node of type ''typeForNewParent''. Fails if the specified nodes are
not siblings or are not defined.
* '''promoteChildren'''(node)
Remove ''node'' but keep all its children, which become siblings at the same
place where ''node'' was.
* '''splitNode'''(node, childNode, offset)
Breaks ''node'' into 2 new sibling nodes of the same type.
The children preceding and including ''childNode'' end up
under the first resulting sibling node; the rest under the second.
However, if ''childNode'' is a text node and ''offset'' is provided,
then ''childNode'' will also be split, with the characters preceding and
including ''offset'' (counting from 1) becoming a final text node under
the first new sibling node, and the rest becoming an initial text node
under the second.
(not yet supported)
* '''eachNodeCB'''(node, preCallback, postCallback)
Traverse the subtree headed at ''node'',
calling the callbacks before and after traversing each node's subtree.
* '''eachNode'''(node)
Traverse the subtree headed at ''node'',
yielding it and each descendant in document order.
* '''eachTextNode'''(node)
A generator that yields each text node descendant.
* '''eachElement'''(node, etype=None)
A generator that yields each element node descendant. If `etype` is specified,
skip any that don't match.
* '''generateSaxEvents'''(self, handlers=None)
Generate the same SAX events that would be encountered if the node passed
were parsed as a document. ''handlers'' is a dict, with some or all of the
following keys, whose values are the handlers to be called when that event
type occurs (or None):
'XmlDeclHandler',
'StartElementHandler',
'EndElementHandler',
'CharacterDataHandler',
'ProcessingInstructionHandler',
'CommentHandler'.
Events are not generated (yet) for Initial or Final, and CDATA marked sections
generate a regular CharacterDataHandler event.
* '''addArgsForCollectorOptions'''(parser, prefix="")
Assuming ''parser'' is an instance of ''argparse'', add the layout options
known for ''collectAllXml2'' and its kin, to that parser. If specified, insert
''prefix'' between the '--' and the plain option name (this lets you avoid
conflicts with other program options).
* '''collectAllText'''(node, delimiter)
Concatenate together the content of
all the text nodes in the subtree headed at ''node'',
putting ''delimiter'' (default: space) in between.
* '''collectAllXml2'''(node,
'breakComments', # True
'breakEnds', # False
'breakPIs', # True
'breakStarts', # True
'canonical', # False
'delim', # " "
'emitter', # None
'emptyForm', # "HTML" or XML or SGML
'indentString', # ' '
'indentText', # False
'lineBreak', # "\n"
'quoteChar', # '"'
'schemaInfo', # None
'sortAttributes', # False
'strip' # False
)
(newer version of collectAllXml, in testing).
** `breakComments`: Break and indent before comments
** `breakEnds`: Break and indent before end tags
** `breakPIs`: Break and indent before processing instructions
** `breakStarts`: Break and indent before start tags
** `canonical`: Generate ''Canonical XML'' per [https://www.w3.org/TR/xml-c14n].
** `delim`: Insert this string between adjacent text nodes (if any)
** `emitter`: Use this Emitter instance for output
** `emptyForm`: Whether to generate `HTML`, `XML`, or `SGML` style empty element
** `indentString`: Repeat this string to create indentation
** `indentText`: Break and indent at start of text nodes
** `lineBreak`: String to use for line-breaks
** `quoteChar`: Char to use for quoting attributes etc.
** `schemaInfo`: (reserved)
** `sortAttributes`: Put attributes of each element in alphabetical order
** `strip`: Whether to strip leading and trailing whitespace from text nodes.
`emitter` can be an instance of class Emitter (also defined here), which
will get passed the generated result strings in document order.
The provided class has options to write them to a path or file handle,
collect them in a string buffer, or call some other callback for each.
By default, the result is collected in a string and passed back whole.
* '''collectAllXml''' (node, delim=" ", indentString=' ',
emitter=None, schemaInfo=None)
Generate the XML representation for the subtree headed at ''node''.
It knows about elements, attributes, pis, comments, and appropriate escaping.
However, it won't do anything for CDATA sections (other than escape as
needed), XML Declaration, DOCTYPE, or any DTD nodes.
This version only supports a few of the layout options known to '''collectAllXml2'''().
* '''export'''(element, fileHandle, includeXmlDecl, includeDoctype)
Save the subtree headed at ''element'' to the ''fileHandle''.
If ''includeXmlDecl'' is present and True, start with an XML declaration.
If ''includeDoctype'' is present and True, include a DOCTYPE declaration,
using ''element'''s type as the document-element name, and include any
DTD information that was available in the DOM (unfinished).
See also XML::DOM::Node::toString(),
XML::DOM::Node::printToFile(), XML::DOM::Node::printToFileHandle().
This is effectively shorthand for ''collectAllXml''(), but isn't implemented
that way yet.
==Index (internal package)==
(not yet implemented)
* '''buildIndex'''(attributeName)
Return a hash table in which each
entry has the value of the specified ''attributeName'' as key, and the element
on which the attribute occurred as value.
This is similar to the XSLT 'key' feature.
* '''find'''(value)
==Character stuff==
The various escapers (escapeText, etc.) and isa tests (isXmlName, etc.)
have been moved to a separate package, `XmlStrings`.
=Known bugs and limitations=
* Axis selects should:
** uniformly support negative indexes (currently only for the self, child,
and descendant axes).
** uniformly allow begin:end ranges.
** permit selecting elements and
attributes via regexes, and support multiple attribute constraints
(probably just allow passing a dict).
* `Node` is not changed to actually inherit from `list`, so there are surely some
list behaviors that I haven't included (yet). Like sorting. Let me know.
* qnames.
* The __getitem__() implementation for []-notation, like normal Python lists,
actually checks for `isinstance(arg, int)`, so `myNode["1"]` will not work.
=Model=
This makes accessing a DOM a lot more like using XPath, but stays in the
"walk around the structure" approach, rather than the "construct a query" one.
So, there are methods to retrieve items from along each XPath axis.
There is not a full filtering language like XPath, but when retrieving nodes
you can specify constraints such as:
* whether you want elements, attributes, text nodes, PIs, comments, etc.
* restrictions on the element types wanted (by string or regex)
* restriction to elements with certain attribute values
* whether whitespace-only text nodes "count" (this avoids a lot of testing)
* Going a distance along the axis, not just the "next" item as in DOM (other
than indexing into childNodes).
This also tries to be more Pythonic. For example, you can access child
nodes and attributes with the usual Python [] notation. Since Python []
can handle both strings and integers, you can say any of these:
myNode[3] gets the 4th child node
myNode["li"] gets all the child elements of type "li"
myNode["@class"] gets the class attribute.
There are a few special values: "#text", "#pi", and "#comment" get just
the child nodes of those types, and
myNode["*"] gets just element children.
You can also combine them and/or slice:
myNode["p", 3] gets the 4th child element of type "p"
myNode[1, 4] gets child nodes 1 through 3, just like Python lists
myNode["p", 1, 4] extracts all the "p" children, then gets [1,4] among those
You can do some other things you'd expect, like copy() instead of cloneNode(), and
using operators instead of compareDocumentPosition.
* insertBefore and appendChild are just list splicing
* cloneNode becomes copy
* compareDocumentPosition becomes <, >, <=, >=, in, and contains.
<< and >> for psib/fsib?
* countOF for # of a in b
* Could overload elem @ "attrname"
* indexof
** TODO: should in/contains be direct or indirect? cf compareDocumentPosition().
* isEqualNode and isSameNode become == and is
* attributes can be referenced by ["@class"] etc.
=To do=
*** In patchDom(), don't overwrite anything already there.
Test that patched nodes end up working ok.
*** Profile
Allow callable for first arg to __getitem__, which is a filter: takes
a node, returns a bit.
*** Rename all nodeplus args to nodeSel. Check other naming consistency.
*** Rename all "Attribute" to "Attr"?
Add "tagToRuby".
Reduce redundancy in traversers and generators.
For canonical XML, make getEscapedAttributeList() put namespace attrs first,
and escape CR and all > in content.
* Update to use/provide whatwg DOM features like
NodeIterator and TreeWalker [https://dom.spec.whatwg.org/#nodeiterator]
and NodeFilters (latter is there as enum), DOMTokenList,...?
* Find a better solution for getting this to be a true subclass. The main issue is functions
that do constructions. They need to instantiate based on the fancier extended Node.
I think the culprits are (see https://github.com/python/cpython/blob/main/Lib/xml/dom/minidom.py):
DOMImplementation.createDocument
DOMImplementation.createDocumentType
Document.createDocumentFragment
Document.createElement
Document.createTextNode
Document.createCDATASection
Document.createComment
Document.createProcessingInstruction
Document.createAttribute
Document.createElementNS
Document.createAttributeNS
Document._create_entity
Document._create_notation
* Support negative 'n' for rest of axis selects?
* Allow creating nodes with no ownerDocument; they get attached when inserted.
* Implement splitNode and wrap/surround(see help)
* Let appendChild, insertBefore, insertAfter, etc. take lists.
* Change eachTextNode, etc. to be real generators.
* Add access for items in NamedNodeMaps via [] notation.
* Option for innerXml/outerXml to guarantee Canonical result (in progress).
* Promote attribute to element type?
==Lower priority==
* Add insertParent
* Possibly support myNode[["P"]] to scan all descendants (by analogy
with XPath "//" operator).
* Add a way to turn off all indenting for collectAllXml2 with one option.
* Implement a few useful BS4 bits (see class BS4Features) (and add to help).
* String to tag converter, like for quotes?
* Normalize line-breaks, Unicode whitespace within text nodes
* Consider additions from https://lxml.de/api/index.html:
** strip_attributes,....
* Possibly, extend `innerXML`, `innerText`, etc. to be able to exclude
certain subtrees, such as for embedded footnotes, speaker tags interrupting
speeches, etc.?
* Move in matchesToElements from mediaWiki2HTML.
* Sync with XPLib.js
* Find next node of given qgi? Select by qgi?
* String to entity ref, like for quotes, dashes, odd spaces, etc.
* Un-namespace methods: merge ns1/ns2; change prefix/url; drop all;....
* Allow choice of how to 'escape' PIs and comments.
* More optional parameters for Document.create___ (like attrs on element).
* More flexibility on createAttribute etc.
=Related commands=
`JSLIBS/DomExtension.js` -- obsolete but similar Javascript version.
`domtabletools.py` -- DOM additions specifically for tables.
`testDom.py` -- a package of test cases to exercise DOM and this.
`basedom.py` -- A very simple DOM implementation in pure Python. Mostly for testing.
`Dominus.py` -- a disk-resident DOM implementation that can handle absurdly
large documents.
`xmloutput.py` -- Makes it easy to produce WF XML output. Provides methods
for escaping data correctly for each relevant context; knows about character
references, namespaces, and the open-element context; has useful methods for
inferring open and close tags to keep things in sync.
=History=
* Written 2010-04-01~23 by Steven J. DeRose (originally in Perl).
* ...
* 2012-01-10 sjd: Start port to Python.
* 2016-01-05: Fix previous/next. Add monkey-patching.
* 2018-02-07: Sync with new AJICSS Javascript API. Add/fix various.
* 2018-04-11: Support unescaping HTML named special character entities.
* 2019-12-13ff: Clean up inline doc. Emitter class. collectAllXml2. lint.
Pull in other extensions implemented in my BaseDom.py (nee RealDOM.py).
Generate SAX.
* 2020-05-22: Add a few items from BS4, and a few new features.
* 2021-01-02: Add NodeTypes class, improve XPointer support.
* 2021-03-17: Add getContentType().
* 2021-07-08: Add some methods omitted from patchDom(). Add checking for such.
Type-hinting. Proof and sort patchDom list vs. reality.
* 2021-07-20: Fix eachNode for attribute nodes. Add removeNodesByNodeType().
* 2022-01-27: Fix various annoying bugs with NodeTypes Enum. Remember that the Document
element is really an element. Improve handling for bool and for multi-token values
in getAttributeAs(). Turn off default of appending id comment in getEndTag().
* 2023-02-06: Clean up parent/sibling insert/wrap methods.
* 2023-04-28; Move table stuff to domtabletools.py. Implement comparison operators.
* 2023-07-21: Fix getFQGI(), getContentType(). Add getTextLen().
=Rights=
Copyright 2010, 2020, Steven J. DeRose. This work is licensed under a Creative
Commons Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.
For the most recent version, see [http://www.derose.net/steve/utilities] or
[http://github.com/sderose].
Many of the methods here are based on
[https://www.w3.org/TR/1999/REC-xpath-19991116/] and on my paper:
"JSOX: A Justly Simple Objectization for XML,
or: How to do better with Python and XML."
Balisage Series on Markup Technologies, vol. 13 (2014).
[https://doi.org/10.4242/BalisageVol13.DeRose02].
=Options=
"""
###############################################################################
# A couple types, for type-hinting parameters.
# Would be nicer but non-trivial to implement these as subclasses of str.
#
# The class of a compiled regular expression, obtained by compiling a sample
# pattern and taking its type.
_regexType = type(re.compile(r'a*'))
# An XML name token. Just an alias of str (mainly for type hint readability).
NMToken = str # An XML name token (mainly for type hint readability)
# A node selector. Currently just an alias of str. Intended values, per the
# author's note: an XML QName, "@" + QName, "*", "#text", "#comment",
# "#cdata", "#pi", "#entref", "#frag", "#notation" -- or a
# callable(Node)->bool.
# NOTE(review): a callable is not a str, so the alias does not cover that
# case; confirm how callers annotate it.
NodeSel = str
###############################################################################
# Exceptions and assertions
#
class NOT_SUPPORTED_ERR(Exception):
    """A plain Exception subclass; it adds no state or behavior of its own.
    NOTE(review): the name matches the DOM NOT_SUPPORTED_ERR exception code, so
    this is presumably raised for unsupported operations -- confirm vs. callers.
    """
class HIERARCHY_REQUEST_ERR(Exception):
    """Signals that a requested navigation through the tree is not possible.
    A plain Exception subclass; it adds no state or behavior of its own.
    """
def assertElement(self:Node):
    """Assert that this node is an element node.
    Fails with AssertionError (naming the actual node type, as reported by
    self.nameOfNodeType()) when nodeType is not Node.ELEMENT_NODE.
    Being a real 'assert', the check vanishes when Python runs with -O.
    The message is only built (and nameOfNodeType() only called) on failure.
    """
    assert self.nodeType == Node.ELEMENT_NODE, (
        "We're at a %s, not an element." % (self.nameOfNodeType()))
###############################################################################
#
class Axes(Enum):
    """The XPath axes, one bit apiece so that their values can be ORed.
    The point of the bits is mainly to allow an "or self" variant of each
    other axis (unlikely to matter for ATTRIBUTE). Other combinations would
    need a defined relative order: for instance, in what order would the nodes
    along PSIBLING|FSIBLING be visited? And some nodes lie on several axes
    relative to one starting node (e.g., CHILD|DESCENDANT).
    So for now, SELF is the only axis that can be combined with others, and
    only the two combinations named below are defined as members.

    This is a plain Enum (not a Flag), so members themselves do not support
    "|"; any ORing has to be done on their .value ints.

    Possibly introduce axis-specific prefixes to __getitem__ -- like the
    current "@", but all the others apply only to element targets?
    """
    NONE            = 0         #
    SELF            = 1 << 0    # .
    ANCESTOR        = 1 << 1    # ..
    CHILD           = 1 << 2    # /
    DESCENDANT      = 1 << 3    # //
    PSIBLING        = 1 << 4    # <     # or arrows?
    FSIBLING        = 1 << 5    # >
    PRECEDING       = 1 << 6    # <<
    FOLLOWING       = 1 << 7    # >>
    ATTRIBUTE       = 1 << 8    # @

    # Inside the class body the names above are still plain ints, so they
    # can be combined directly: these come out as 0x003 and 0x009.
    ANCESTOR_SELF   = ANCESTOR | SELF
    DESCENDANT_SELF = DESCENDANT | SELF
###############################################################################
#
class DOCUMENT_POSITIONS(Enum):
    """Relative document positions: like the results of DOM's
    compareDocumentPosition(), but more complete.
    See the DOM Node.DOCUMENT_POSITION_xxx constants.

    The first group uses the same bit values as DOM; the last group adds
    extra cases, each built on top of the IMPLEMENTATION_SPECIFIC bit.
    This is a plain Enum (not a Flag), so only the exact values listed
    here are members.
    """
    DOCPOS_EQUAL        = 0         # Not defined in DOM, but 0 is what you get....
    DOCPOS_DISCONNECTED = 1 << 0
    DOCPOS_PRECEDING    = 1 << 1
    DOCPOS_FOLLOWING    = 1 << 2
    DOCPOS_CONTAINS     = 1 << 3    # DOM reports this together with PRECEDING (8|2)
    DOCPOS_CONTAINED_BY = 1 << 4    # DOM reports this together with FOLLOWING (16|4)
    #
    DOCPOS_IMPLEMENTATION_SPECIFIC = 1 << 5
    #
    # Inside the class body the names above are still plain ints, so the
    # extensions can be spelled as combinations of the bits they are made of.
    DOCPOS_SWAPPABLE = (            # Like <b><i>hello</i></b>
        DOCPOS_IMPLEMENTATION_SPECIFIC | DOCPOS_CONTAINS | DOCPOS_CONTAINED_BY)
    DOCPOS_SEMI_PRECEDING = (       # For overlap
        DOCPOS_IMPLEMENTATION_SPECIFIC | DOCPOS_PRECEDING)
    DOCPOS_SEMI_FOLLOWING = (       # For overlap
        DOCPOS_IMPLEMENTATION_SPECIFIC | DOCPOS_FOLLOWING)
###############################################################################
# TODO: Move to separate domgetitem.py (see import at top)
#
class NodeSelKindOBS(Enum):  # ==> domgetitem.NodeArgs
    """The major kinds of argument accepted by DEgetitem and friends.
    Obsolete: superseded by NodeArgs in domgetitem.py.

    There are two main uses:
        1: Given one of the (up to) 3 args to __getitem__, say which kind it is.
           An int seems the most likely, which is why ARG_INT is here.
        2: Help interpret the similar arg taken by many selection/navigation
           methods, e.g., selectChild("#pi", n=1). There, the arg can also
           be a regex, which specifies matching nodeNames.
    """
    ARG_NONE      = 0        # Failed
    ARG_INT       = 1 << 0   # (this is for the non-nodeKind (index) args only)
    ARG_ELEMENT   = 1 << 1   # Any QName
    ARG_REGEX     = 1 << 2   # A compiled regex, to match vs. element names
    ARG_ATTRIBUTE = 1 << 3   # @ + QName
    ARG_STAR      = 1 << 4   # "*"
    ARG_TEXT      = 1 << 5   # #text
    ARG_PI        = 1 << 6   # #pi
    ARG_COMMENT   = 1 << 7   # #comment
    ARG_CDATA     = 1 << 8   # #cdata

    @staticmethod
    def getKind(someArg:Union[str, int]) -> 'NodeArgs':  # nee def argType()
        """Categorize one of the arguments to __getitem__().

        The checks run in this order: None, int, XML name, "*", "@"+XML name,
        then the reserved names "#text", "#comment", "#pi", "#cdata".
        The result is a member of NodeArgs (not of this obsolete class).
        ARG_REGEX is never returned from here.

        Raises ValueError if the argument matches none of the kinds.
        NOTE(review): an empty string that isXmlName() rejects would reach
        someArg[0] and raise IndexError rather than ValueError -- confirm
        whether that is intended.
        """
        if someArg is None:
            return NodeArgs.ARG_NONE
        if isinstance(someArg, int):
            return NodeArgs.ARG_INT
        if XmlStrings.isXmlName(someArg):
            return NodeArgs.ARG_ELEMENT
        if someArg == "*":
            return NodeArgs.ARG_STAR
        if someArg[0] == "@" and XmlStrings.isXmlName(someArg[1:]):
            return NodeArgs.ARG_ATTRIBUTE

        # The reserved "#" names (noted as all being handleable by
        # _getListItemsByKind_()). The NodeArgs member is looked up only on
        # a match, same as with one 'if' per name.
        reservedNames = (
            ("#text",    "ARG_TEXT"),
            ("#comment", "ARG_COMMENT"),
            ("#pi",      "ARG_PI"),
            ("#cdata",   "ARG_CDATA"),
        )
        for reserved, kindName in reservedNames:
            if someArg == reserved:
                return getattr(NodeArgs, kindName)

        raise ValueError("Argument '%s' not of identifiable kind." % (someArg))
def isLeafType(node:Node) -> bool:
    """Say whether the node's nodeType is one that can only ever be a leaf:
    text, CDATA section, comment, attribute, or processing instruction.
    This looks only at the type, not at whether the node currently has
    children; for that, see isLeaf().
    """
    leafOnlyTypes = (
        Node.TEXT_NODE,
        Node.CDATA_SECTION_NODE,
        Node.COMMENT_NODE,
        Node.ATTRIBUTE_NODE,
        Node.PROCESSING_INSTRUCTION_NODE,
    )
    return node.nodeType in leafOnlyTypes
def isLeaf(node:Node) -> bool:
    """Return whether the node is in fact a leaf, that is, whether it
    currently has no child nodes.
    Not to be confused with isLeafType(), which checks only whether the
    node's type could ever have children.

    Returns True when node.childNodes is empty (or None), False when the
    node has at least one child.
    """
    # An empty (or missing) child list is falsy, so "not" gives "is a leaf".
    # (This formerly returned bool(node.childNodes), which was True exactly
    # when the node was NOT a leaf.)
    return not node.childNodes
###############################################################################
# Methods to patch on DOM Node.
#
#try:
# from BaseDom import BaseDom
# BaseDom.usePythonExceptions()
#except ImportError as e:
# sys.stderr.write("DomExtensions: Could not import BaseDom for Exceptions.\n")
#
# TODO: Drop in favor of separate domgetitem.py (see imports at top)
def DEgetitemOBS(self:Node, n1, n2=None, n3=None) -> List:
"""Access nodes via Python list notation, based on my paper:
DeRose, Steven J. JSOX: A Justly Simple Objectization for XML:
Or: How to do better with Python and XML.
Balisage Series on Markup Technologies, vol. 13 (2014).
https://doi.org/10.4242/BalisageVol13.DeRose02.
For example:
myElement['p'] get the 1st 'p' child as a scalar
*** This could just as well have returned ALL 'p' children ***
myElement['p':2] get the 3rd 'p' child
myElement[2:'p'] get 3rd child if it's a 'p'
myElement['p':2:8] get the 3rd through 7th 'p' children
myElement[2:8] get the 3rd through 7th children
myElement[2:8:'p'] of the 3rd through 7th children, get the 'p's
myElement['@class'] get the 'class' attribute's value (no indexes!)
myElement[1:10:2] get every other childNode from 1:10.
TODO: Test support for negatives -- anything missing?
TODO: Test cases like [1:] and [:5]
TODO: Perhaps allow name ".." for ancestor axis?
TODO: Perhaps extend [] to support jQuery-style CSS selectors?
TODO: Perhaps allow regexes for name?
TODO: Perhaps move attrs to setAttr space, access w/ . notation?
"""
typ1 = NodeArgs.getKind(n1)
typ2 = NodeArgs.getKind(n2)
typ3 = NodeArgs.getKind(n3)
if (n3 is not None): nargs = 3
elif (n2 is not None): nargs = 2
else: nargs = 1
if (typ2==NodeArgs.ARG_ATTRIBUTE or typ3==NodeArgs.ARG_ATTRIBUTE or
(typ1==NodeArgs.ARG_ATTRIBUTE and nargs > 1)):
raise IndexError("No other indexes allowed with @xxx.")
if (nargs == 1):
if (typ1 == NodeArgs.ARG_INT): # [0]
return self.childNodes[n1]
if (typ1 == NodeArgs.ARG_ATTRIBUTE): # ['@id']
return self.attributes.getNamedItem(n1[1:])
return self._getChildNodesByKind_(n1) # ['p'] ['#text'] ['*']
elif (nargs == 2):
if (typ1 == NodeArgs.ARG_INT):
if (typ2 == NodeArgs.ARG_INT): # [0:2]
return self.childNodes[n1:n2]
return self._getChildNodesByKind_(n2)[n1] # [0:'p']
else:
if (typ2 == NodeArgs.ARG_INT): # ['x':0]
return self._getChildNodesByKind_(n1)[n2]
else: # ['x':'x']
raise IndexError("More than one non-int index.")