-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdomtabletools.py
executable file
·1455 lines (1229 loc) · 51.6 KB
/
domtabletools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
#
# domtabletools.py: Do fancy table stuff with *ML tables.
# 2022-01-30: Written by Steven J. DeRose.
#
#pylint: disable=E1101
#
import sys
import codecs
from enum import Enum
from typing import List, Union, Dict, Callable #, IO
import logging
import re
from xml.dom.minidom import Node # , Element, Document
from xml.dom import minidom
from fsplit import fsplit
from domextensions import DomExtensions, XmlStrings
# Install the DomExtensions add-ons before anything below relies on them
# (the notes in 'descr' describe this as attaching extra methods to Node).
DomExtensions.patchDom()
# Root logger; every warning issued by this module goes through it.
lg = logging.getLogger()
# NOTE(review): this writes a diagnostic line to stderr on every import,
# showing what Node.selectAncestor resolves to. It looks like leftover
# debugging -- confirm before removing.
sys.stderr.write("Node.selectAncestor is %s." % (Node.selectAncestor))
# Descriptive metadata about this module.
__metadata__ = {
"title" : "domtabletools",
"description" : "Do fancy table stuff with *ML tables.",
"rightsHolder" : "Steven J. DeRose",
"creator" : "http://viaf.org/viaf/50334488",
"type" : "http://purl.org/dc/dcmitype/Software",
"language" : "Python 3.7",
"created" : "2022-01-30",
"modified" : "2022-01-30",
"publisher" : "http://github.com/sderose",
"license" : "https://creativecommons.org/licenses/by-sa/3.0/"
}
# The version string is simply the "modified" date recorded above.
__version__ = __metadata__["modified"]
descr = """
UNFINISHED
=Description=
This add-on for DOM provides many tools to make it easier to deal with
"tables" in HTML or XML.
It provides many basic operations for navigating and modifying tables,
including:
* creating them from CSV
* normalizing into what some call "data" tables, which are much more
regular (so easier to code for) than HTML tables in general.
* Do "bulk" operations on tables, such as:
** reordering columns
** sorting rows
** shifting data between attributes and content
These are the main things it does as a command. It can also be used as
a library that extends DOM with many table-specific operations and tests:
* all the above manipulations
* transposition
* basic relational algebra
* joins that produce either the usual flat result, or placing the joined table's
columns into a nested table.
* addressing rows or cells by sequence number (handling colspans
and rowspans if they haven't been normalized away)
* ignoring or removing whitespace nodes between rows and cells
One goal of the "normalized table" definition, is for data to be round-trippable
without loss.
Can also convert to and from several other formats, such as CSV, JSON, Python
init code, Python data structures. This is all pretty easy once you've
normalized.
* It distinguishes and supports a subset of tables I call "normalized".
These are in one sense "true" tables: 2-d collections of items, where the
rows and columns mean something consistent. Contrast these with "layout"
tables, commonly used on the Web to force various parts to display in different
areas or arrangements. Some differences are:
** It makes sense to sort (at least) columns
** Rowspans and colspans don't make sense
** all rows should have the same number and sequence of columns
(except perhaps a head row, which is special)
** each column typically has a meaningful name and often a datatype.
** nested tables don't make sense (except perhaps in a case described below).
Normalized tables can be used in many ways that non-normalized tables can't
(or at least, can't without extra work, added conventions, etc).
Normalized tables are specifically designed to be a compromise between RDB
tables and HTML tables, for the "kind" of data they have in common.
Thus, it requires:
* no colspans or rowspans
* exactly one head row, with a name for each column, unique within the table
* all rows have to have the same number of columns, and if their cells
specify their own names (which can be awfully useful for readability and
regex-matching), they must match the corresponding head name
* nested tables are prohibited, *except* for one special case that maps
directly to a common RDB and statistics scenario for which it can be kept
clean (more on that below).
Normalized tables are thus very close analogs to database "relations". A couple
key remaining differences are:
* They have to "be" in some row and column order -- actually, underneath
an RDB may be, too -- it's just that the order isn't considered meaningful.
* RDBs have additional constraints, such as every row of a table being
unique, which tables in documents may not always need to satisfy.
* RDBs have a notion of "keys", similar to
XML IDs, but considerably more powerful. Normalized tables allow one column
to declare itself the primary key, which can be useful for Javascript code
or for fine-grained hypertext linking in general.
* Cells in normalized tables still allow internal markup, such as to
break a string cell into paragraphs, add emphasis, footnotes, etc.
Databases of course have many parts besides the data itself. However,
the normalized tables
defined and implemented here, should be able to
round-trip between (say) SQL and HTML without loss.
* The Normalized Table knows about semantic types of fields, such as bool,
int, float, string, date, and time; and a few special cases such as enums and
URLs -- this is mainly so it can do display better (such as integer vs. string justification);
treat URLs as links; offer appropriate editing such as checkboxes for
booleans, menus for enums, or perhaps calendars for dates. But it doesn't know about
integer or string sizes, unsigned vs. signed, etc. Wouldn't be that hard to
upgrade my implementation for that, I just don't think it's that useful ''in
this context''.
* On the flip side, Normalized tables know about XML special-character handling,
and are reliably Unicode (at least for XML and XHTML).
while string fields in
* There are methods implemented here for checking some constraints such as
uniqueness, datatype, etc., but there's nothing in HTML itself for that, so
if you edit the HTML of a normalized table it might no longer be fully
normalized.
==Nested tables==
HTML tables allow arbitrary nesting, which RDBs simply do not. This has led
to some astonishingly bizarre results in the wild. Along with rowspans,
colspans, and the difficulty of being sure what's a heading row and
what's not, this makes tables far harder to deal with.
However, one case of table nesting seems to me less problematic, because
it corresponds to a well-defined conceptual case that is common in DBs,
in statistical datasets, and in documents.
Consider census data. Data for a given person may include information on
their children. If there's only one, this is easy (I'm leaving out child
surnames to save space):
ID SURNAME GIVEN DOB Ch_GIVEN CH_DOB
87 Smith Joan 1990-01-01 Bill 2015-07-30
But when there's more than one, it gets messy, as many know. You see things like
ID SURNAME GIVEN DOB Ch_GIVEN1 CH_DOB1 Ch_GIVEN2 CH_DOB2 ...
87 Smith Joan 1990-01-01 Bill 2015-07-30 Cindy 2016-10-10
There are well-known problems with this, and a more effective DB solution
is to segregate all the "child information" into a second table, connected to
the first by an ID:
ID PARENT_ID GIVEN DOB
1020 87 Bill 2015-07-30
1021 87 Cindy 2016-10-10
Several regularities obtain here:
* The "parent" record connects to entire "child" records, not just to
parts of them.
* There are no circularities.
* Commonly, a specific "child" record can be "assigned" to only one
"parent" record.
Actually, in many cases it would make sense to connect a child to two
other records (in this case, both parents). That is often done by having
three tables: one of "parent" records, one of "child" (in the case of literal
children these could be merged into a single
"person" table), and a third table that has the connections, and perhaps
some data about the particular connection:
PARENT_ID CHILD_ID FAVORITE
87 1020 1
87 1021 0
My "normalized" tables permit cases like this to pull in the "child" records,
by selecting the ones connected to the "parent" record and making them a
nested table in a column devoted to that action. DB practitioners will see
how this corresponds closely to a left join, and can be "unjoined" to get
back, or "rotated" (as some call it) to get either of the forms commonly used
in statistical datasets.
Of course, if a specific "child" record is attached to multiple "parent" records,
there will be duplicates across the nested tables. These can reliably be
dropped when "un-joining" back out, if needed.
Nested normalized tables are designed to permit this kind of "rotation", by
imposing these requirements:
* If one cell in a column is a nested table, all must be (or be empty).
* Nothing other than the nested table (and whitespace) may occur
in those cells.
* All nested tables in the same column, must have the same sequence of
column names, same schema name, and (if present) datatypes.
* The datatype of the column with nested tables is "_JOIN".
==Usage==
domtabletools.py [options] [files]
XML:sexp::SQL:CSV
Key normalizations vs. general HTML tables:
* No colspans or rowspans
* Columns are all named
* Headings are always and only the first row, and all/only 'th's
* Col heads are unique to one column, and always on all children.
* Nested table only allowed with join-like restrictions
* Optionally, can pre-calculate and save column widths.
=Issues=
Mixture of goals:
* Separate out data-ish HTML tables
* Standardize a SQL transfer format
* Round-trippability with HTML
* Make it easy to put SQL into HTML (say, for nice display)
* Support a little DB editing
* Nesting to make things like stat rotations easy.
Each table should have a schema name (for nested, must match across all
in given col; for non-nested, must be unique in doc).
keys? one col can claim primary key
what to do with empty cells?
should it support compound foreign key checking?
mech to calculate cell on the fly, like spreadsheet?
editable? fill in, menu, type?
add equiv of groupby that makes spanning heads?
=Related Commands=
=Known bugs and Limitations=
Although we use patchDOM to attach a bunch of methods to Node and
to NormTable, pylint doesn't seem to realize it, so it still reports
issues like E1101: NormTable has no 'xxx' member.
=To do=
Find a way to get pylint to see DomExtensions when subclassing Node to NormTable.
See [https://stackoverflow.com/questions/60560093/monkey-patching-class-with-inherited-classes-in-python].
* Add way to get applicable alignment for a table cell or column
** @align or @style on the cell
** <col>?
** CSS (at least if explicit in an HTML header)
** scan column, if all numeric (except header) use right, else left; header center?
** Pull in table and style features from html2latex.py.
* Possibly extend support for proper matrices?
** Ensure datatypes
** Convert to/from numpy (let it do the math!)
=History=
* 2022-01-30: Written by Steven J. DeRose (draft, really).
* 2023-04-28: Move in table stuff from domextensions. Split out TableOptions class.
Make main class a wrapper that owns a table Node and a TableOptions.
Organize methods by components of a table. Decide to require clean (no nesting,
always has thead/tbody, one thead/tr, no spans, co-indexed column members).
Decided to number rows and columns from 1 (that's how everybody talks with
tables, and these really aren't array indexes since there can be
text nodes and PIs and stuff in there).
=Rights=
Copyright 2022-01-30 by Steven J. DeRose. This work is licensed under a
Creative Commons Attribution-Share-alike 3.0 unported license.
See [http://creativecommons.org/licenses/by-sa/3.0/] for more information.
For the most recent version, see [http://www.derose.net/steve/utilities]
or [https://github.com/sderose].
=Options=
"""
###############################################################################
#
# Short aliases for two of the DOM nodeType constants.
ELEM = Node.ELEMENT_NODE
ATTR = Node.ATTRIBUTE_NODE
def getChildByName(node:Node, name:str, ns:str="", case:bool=True):
    """Return the first child element of 'node' that has the given name,
    or None if there is no such child.

    Any prefix on 'name' (everything through the last colon) is dropped;
    the remainder is compared with each child element's localName.
    If 'case' is False, names (and prefixes) are matched ignoring case.
    If 'ns' is not set, only the localName is checked.
    If 'ns' is set, it is compared with the prefix portion of the child's
    nodeName (the part before the colon). A child with no prefix has an
    empty prefix, and so never matches a non-empty 'ns'.

    'node' must be an element; anything else trips an AssertionError.
    """
    assert node.nodeType == Node.ELEMENT_NODE
    # One normalizer, applied to both sides of each comparison.
    fold = (lambda s: s) if case else str.lower
    wantName = fold(re.sub("^.*:", "", name))
    wantNs = fold(ns) if ns else ""
    for ch in node.childNodes:
        if (ch.nodeType != Node.ELEMENT_NODE): continue
        if (wantNs):
            prefix, colon, _local = ch.nodeName.partition(":")
            # Without a colon the whole nodeName is the local part, not
            # a prefix, so it must not be compared against 'ns'.
            if (not colon): prefix = ""
            if (fold(prefix) != wantNs): continue
        if (fold(ch.localName) == wantName): return ch
    return None
def strcmp(s1:str, s2:str, case:bool=True):
    """Three-way string comparison, in the manner of C's strcmp().
    Returns -1 if s1 sorts before s2, 1 if it sorts after, and 0 if they
    are equal. When 'case' is falsy, both strings are lowercased first.
    """
    left, right = (s1, s2) if case else (s1.lower(), s2.lower())
    # bool - bool yields exactly -1, 0, or 1.
    return (left > right) - (left < right)
###############################################################################
#
class CellDataTypes(Enum):
    """Common data types for the contents of table cells, mainly as an
    aid to formatting them.
    """
    # Nothing / logical / numeric
    NONE = 0
    BOOL = 1
    INT = 2
    FLOAT = 3
    CURRENCY = 4

    # Dates and times
    YEAR = 10
    DATE = 11
    TIME = 12
    DATETIME = 13
    TIMESTAMP = 14

    # Textual
    CHAR = 20
    NMTOKEN = 21    # (-> SQL Enum?)
    NMTOKENS = 22   # (-> SQL SET?)
    URL = 23
    GUID = 24
    STR = 25

    # References and media
    FKEY = 30
    RASTER = 31
    VECTOR = 32
    VIDEO = 33
    APPDATA = 34

    XML = 50

    BLOB = 99

    @staticmethod
    def isKnown(s:str) -> bool:
        """Report whether 's', uppercased, is the name of a member."""
        return s.upper() in CellDataTypes.__members__
###############################################################################
#
class TableOptions:
    """Holds the names used for each meaningful table component (element
    names, attribute names, and a few marker tokens), so that callers can
    map them onto some other XML/HTML vocabulary.

    TODO: Support swapping things between element/attribute?
    """
    def __init__(self):
        self.setDefaults()

    def setDefaults(self):
        """(Re)set every option to its default, HTML-style value."""
        defaults = {
            # Table elements
            "TABLE": "table",
            "THEAD": "thead",
            "TBODY": "tbody",
            "TFOOT": "tfoot",
            "TR": "tr",
            "TD": "td",
            "TH": "th",

            # Attributes
            "CLASS": "class",
            "COLSPAN": "colspan",
            "ROWSPAN": "rowspan",
            "TYPE": "dtype",        # On header, datatype of column entries (?)

            # Constants
            # NOTE(review): the original comments call both of the next two
            # a "@CLASS token for rowspan dummy"; COL_0 presumably marks
            # colspan dummies instead -- confirm.
            "ROW_0": "row0",
            "COL_0": "col0",
            "SORT": "sortable",

            # Places to put special data
            # TODO: Abstract following; caller changes the values above.
            "COLIDENTUSE": "class",   # Use of column ident
            "COLIDENTDEF": "class",   # Definition of column ident
            #"ISKEY": "iskey",        # ???
        }
        for optName, optValue in defaults.items():
            setattr(self, optName, optValue)
###############################################################################
#
#
def CreateTableFromMatrix(self, theMatrix:List[List], hasHeader:bool=True):
    """Construct a table from a list of lists and return it (it is also
    stored as self.tbl).

    If 'hasHeader' is true, the first inner list supplies the column ids
    (via str()) and the remaining lists are data rows. Otherwise every
    inner list is a data row and the ids are generated as "col_01",
    "col_02", and so on.

    The width of the first inner list is the expected column count; a
    data row of any other length is still loaded, but a warning is logged.
    An empty matrix produces a table with no columns and no rows.
    Data rows are numbered from 1 in the warning and in the setField() calls.
    """
    if (not getattr(self, "topt", None)): self.topt = TableOptions()

    firstRow = theMatrix[0] if theMatrix else []
    nCols = len(firstRow)
    if (hasHeader):
        colIds = [ str(x) for x in firstRow ]
        dataRows = theMatrix[1:]
    else:
        colIds = [ ("col_%02d" % (i+1)) for i in range(nCols) ]
        dataRows = theMatrix

    # NOTE(review): NormTable.__init__ as visible in this file takes
    # (topt, fromTable), so passing the column ids here looks wrong --
    # confirm how they are meant to be handed over.
    self.tbl = NormTable(colIds)
    for rowNum, row in enumerate(dataRows, start=1):
        rowLen = len(row)
        if (rowLen != nCols):
            lg.warning("Row %d has %d items, expecting %d.", rowNum, rowLen, nCols)
        self.tbl.appendRow()
        # NOTE(review): the column index is passed 0-based, as before;
        # confirm what setField() expects.
        for colNum, value in enumerate(row):
            self.tbl.setField(rowNum, colNum, value)
    return self.tbl
###############################################################################
# Construct from CSV
#
def CreateTableFromCSV(self, path:str, hasHeader:bool=True, fsplitArgs:Dict=None) -> Node:
    """Load a CSV file (read as UTF-8) into self.tbl, and return self.tbl.

    If 'hasHeader' is true, the first record is handed to doCSVHeader(),
    which supplies the column ids. Each later record becomes a TR appended
    to self.tbl, with one TD per field whose @CLASS is the column id.
    Fields beyond the known ids (or all fields, when there is no header)
    get generated ids of the form "col_NN".

    'fsplitArgs' is passed through to fsplit(); it defaults to
    double-quote quoting and comma delimiters.
    """
    if (not fsplitArgs):
        fsplitArgs = { "quote":'"', "delim":"," }

    colIds = []
    # 'with' ensures the file is closed even if a record fails to parse.
    with codecs.open(path, "rb", encoding="utf-8") as ifh:
        # Process the head
        if (hasHeader):
            colIds, _colTypes = self.doCSVHeader(ifh, fsplitArgs)

        # Process the data
        for rec in ifh:
            fds = fsplit(rec, fsplitArgs)
            dataRow = self.ownerDocument.createElement(self.topt.TR)
            self.tbl.appendChild(dataRow)
            for i, fd in enumerate(fds):
                if (i >= len(colIds)):
                    colIds.append("col_%02d" % (i+1))
                colElem = self.ownerDocument.createElement(self.topt.TD)
                colElem.setAttribute(self.topt.CLASS, colIds[i])
                # NOTE(review): presumably relies on an 'innerHtml' setter
                # supplied by DomExtensions -- TODO confirm the content
                # really ends up inside the cell.
                colElem.innerHtml = XmlStrings.escapeText(fd)
                # Attach the cell; otherwise the row stays empty.
                dataRow.appendChild(colElem)
    return self.tbl
def doCSVHeader(self, ifh, fsplitArgs):
    """Read one record from 'ifh', treat it as the CSV header, and add a
    TH column (via self.appendColumn) for each field.

    A field may be just a name, or "name:type". Only the first colon
    separates the two, so a type that itself contains a colon is kept
    whole. The name goes into the cell's @CLASS; the type, if non-empty,
    into @typeName.

    Returns (colIds, colTypes): two parallel lists. The type is None for
    fields that had no colon.
    """
    colIds = []
    colTypes = []
    headRec = ifh.readline()
    for headFd in fsplit(headRec, fsplitArgs):
        nm, colon, typ = headFd.partition(":")
        if (not colon): typ = None
        colIds.append(nm)
        colTypes.append(typ)
        colElem = self.ownerDocument.createElement(self.topt.TH)
        colElem.setAttribute(self.topt.CLASS, nm)
        if (typ): colElem.setAttribute("typeName", typ)
        self.appendColumn(colElem)
    return colIds, colTypes
###############################################################################
#
def checkNorm(tbl:Node, topt:TableOptions=None):
    """Check a table against the "normalized" conventions, logging a
    warning for each problem found, and return the number of problems.

    The first TR found under 'tbl' is taken to be the heading row. Checks:
      * every element child of the heading row is a TH, with @CLASS, and
        with a @typeName that names a member of CellDataTypes;
      * no element carries a rowspan or colspan attribute;
      * every TD/TH carries @CLASS;
      * no TH occurs outside the heading row;
      * whatever tbl.checkNesting() reports.
    If there are no rows at all, that is reported and nothing else is checked.
    """
    if (not topt): topt = TableOptions()
    errCount = 0

    # getElementsByTagName returns a list of rows; the heading row is the first.
    allRows = tbl.getElementsByTagName(topt.TR)
    if (not allRows):
        errCount += 1
        lg.warning(" no rows found.")
        return errCount
    trHead = allRows[0]

    for headCell in trHead.childNodes:
        if (headCell.nodeType != Node.ELEMENT_NODE): continue
        if (headCell.nodeName != topt.TH):
            errCount += 1
            lg.warning(" Non-head cell '%s' found in heading row",
                headCell.nodeName)
        if (not headCell.hasAttribute(topt.CLASS)):
            errCount += 1
            lg.warning(" Head cell lacks @class.")
        if (not headCell.hasAttribute("typeName")):
            errCount += 1
            lg.warning(" Head cell lacks @typeName.")
        elif (not CellDataTypes.isKnown(headCell.getAttribute("typeName"))):
            errCount += 1
            lg.warning(" Head cell @typeName '%s' not known.",
                headCell.getAttribute("typeName"))

    cellNames = ( topt.TD, topt.TH )
    for e in tbl.eachElement():
        if (e.hasAttribute(topt.ROWSPAN)):
            errCount += 1
            lg.warning(" rowspan attribute found")
        if (e.hasAttribute(topt.COLSPAN)):
            errCount += 1
            lg.warning(" colspan attribute found")
        # Only cells are required to carry the column ident.
        if (e.nodeName in cellNames and not e.hasAttribute(topt.CLASS)):
            errCount += 1
            lg.warning(" class attribute missing")
        if (e.nodeName == topt.TH and
            e.selectAncestor(topt.TR) != trHead):
            errCount += 1
            lg.warning(" th found outside first (heading row)")

    # Check for nesting
    errCount += tbl.checkNesting()
    return errCount
###############################################################################
# Normalizers
#
def unspan(self, attrValue:str="td-nil") -> None:
    """Expand colspans in the rows returned by self.getRows(): for a cell
    with colspan N, insert N-1 dummy cells right after it (using
    insertDummyCell), then drop the colspan attribute so the cell is not
    expanded again on a later call.

    'attrValue' is the @CLASS value given to each dummy cell.
    A colspan value that is not an integer is logged and left alone.
    Rowspans are not handled here.
    """
    cellNames = ( self.topt.TD, self.topt.TH )
    for tr in self.getRows():
        # Iterate over a snapshot, since siblings are inserted as we go.
        for cell in list(tr.childNodes):
            if (cell.nodeName not in cellNames): continue
            cspan = cell.getAttribute(self.topt.COLSPAN)
            if (not cspan): continue
            try:
                nExtra = int(cspan.strip()) - 1
            except ValueError:
                lg.warning("unspan: ignoring non-numeric %s '%s'.",
                    self.topt.COLSPAN, cspan)
                continue
            for _i in range(nExtra):
                self.insertDummyCell(cell, attrValue)
            cell.removeAttribute(self.topt.COLSPAN)
    return
def insertDummyCell(self, node:Node, colName:str="td-nil") -> Node:
    """Create a dummy TD cell (generally to take the place of a moribund
    span), insert it immediately after 'node', and return it.

    The new cell is created in node's own document, and gets 'colName'
    as its @CLASS value. If 'node' is the last child of its parent, the
    dummy is appended.
    """
    dum = node.ownerDocument.createElement(self.topt.TD)
    dum.setAttribute(self.topt.CLASS, colName)
    # DOM argument order is (newChild, refChild); a refChild of None
    # means "append at the end".
    node.parentNode.insertBefore(dum, node.nextSibling)
    return dum
def eliminateSpans(self):
    """Insert extra empty cells to obviate spans within the table, and
    reset the span attributes to "1".

    Colspans: every element under self.tbl with colspan N > 1 gets N-1
    new empty siblings of the same element type right after it. The
    fillers inherit the rowspan (if any), so a cell spanning both ways
    ends up filling its whole rectangle.

    Rowspans: working down the rows from self.generateRows(), a cell in
    column position c with rowspan N > 1 gets a new empty cell inserted at
    position c in each of the following N-1 rows (or appended, when such
    a row is too short).

    Span values that are not integers are treated as 1.
    """
    theDoc = self.tbl.ownerDocument
    cellNames = ( self.topt.TD, self.topt.TH )

    def spanOf(node, attrName):
        """Numeric value of a span attribute; 1 if absent or unusable."""
        raw = node.getAttribute(attrName)
        try:
            return max(int(raw), 1) if raw else 1
        except ValueError:
            return 1

    def cellsOf(row):
        return [ ch for ch in row.childNodes if ch.nodeName in cellNames ]

    # The colspans. Snapshot the traversal first, since we add nodes.
    for node in list(self.tbl.eachElement()):
        cs = spanOf(node, self.topt.COLSPAN)
        if (cs <= 1): continue
        node.setAttribute(self.topt.COLSPAN, "1")
        for _i in range(cs - 1):
            filler = theDoc.createElement(node.nodeName)
            if (node.hasAttribute(self.topt.ROWSPAN)):
                filler.setAttribute(self.topt.ROWSPAN,
                    node.getAttribute(self.topt.ROWSPAN))
            node.parentNode.insertBefore(filler, node.nextSibling)

    # The rowspans. Top-down, so positions in lower rows are already
    # corrected by the time those rows are examined themselves.
    rows = list(self.generateRows())
    for rowIndex, row in enumerate(rows):
        for colIndex, cell in enumerate(cellsOf(row)):
            rs = spanOf(cell, self.topt.ROWSPAN)
            if (rs <= 1): continue
            cell.setAttribute(self.topt.ROWSPAN, "1")
            for lowerRow in rows[rowIndex+1 : rowIndex+rs]:
                lowerCells = cellsOf(lowerRow)
                refCell = lowerCells[colIndex] if colIndex < len(lowerCells) else None
                lowerRow.insertBefore(theDoc.createElement(cell.nodeName), refCell)
def unnest(self):
    """Remove every table nested inside self.tbl (self.tbl itself is
    kept), and return how many were removed.
    """
    nRemoved = 0
    # Collect first: removing nodes while the traversal is still running
    # could make it skip or revisit parts of the tree.
    for subtable in list(self.tbl.eachNode(self.topt.TABLE)):
        if (subtable is self.tbl): continue
        if (subtable.parentNode is None): continue
        subtable.parentNode.removeChild(subtable)
        nRemoved += 1
    return nRemoved
def hasSubTable(self):
    """Report whether self.tbl.getDescendant() finds any table element
    inside this table.
    """
    return self.tbl.getDescendant(self.topt.TABLE) is not None
def ensureOuters(self):
    """Make sure the table has both a head and a body, calling addHead()
    and/or addBody() for whichever getHead()/getBody() does not find.

    NOTE(review): addHead() is called with no arguments here, but as
    written in this file it rejects a call that gives none of labels,
    numbered, or moveRow -- confirm what was intended.
    """
    if (not self.getHead()):
        self.addHead()
    if (not self.getBody()):
        self.addBody()
def ensureAllColumnIdents(self) -> int:
    """Run ensureColumnIdents() on each column produced by
    self.generateColumns(), and return how many columns had at least one
    problem. Columns are numbered from 1, following this file's stated
    convention.
    """
    nFailedColumns = 0
    for colNum, _col in enumerate(self.generateColumns(), start=1):
        # ensureColumnIdents takes just the column number.
        if (self.ensureColumnIdents(colNum)): nFailedColumns += 1
    return nFailedColumns
def ensureColumnIdents(self, colNum:int=None) -> list:
    """Scan the table and return a list of the (row, col) coordinates of
    all cells where there's a column-id problem. If the returned list is
    empty, everything's ok. Nothing is modified.

    The expected id for each column is the @CLASS value of the
    corresponding cell in the head row (self.getHeadRow()). A head cell
    is a problem if that value is empty; a body cell (from
    self.generateRows()) is a problem if its @CLASS differs from the
    expected id, or if there is no usable expected id for its position.

    For purposes of this call, the head row is considered to be row 0;
    body rows and all columns are numbered from 1. If 'colNum' is given,
    only that column is checked; by default every column is.
    """
    classAttr = self.topt.CLASS

    def cellsOf(row):
        return [ ch for ch in row.childNodes
            if ch.nodeType == Node.ELEMENT_NODE ]

    def wanted(n):
        return (colNum is None or n == colNum)

    badCoords = []
    headRow = self.getHeadRow()
    theIDs = []
    if (headRow is not None):
        theIDs = [ cell.getAttribute(classAttr) for cell in cellsOf(headRow) ]

    for n, colId in enumerate(theIDs, start=1):
        if (wanted(n) and not colId): badCoords.append( (0, n) )

    for rowNum, row in enumerate(self.generateRows(), start=1):
        for n, cell in enumerate(cellsOf(row), start=1):
            if (not wanted(n)): continue
            expected = theIDs[n-1] if n <= len(theIDs) else None
            if (not expected or cell.getAttribute(classAttr) != expected):
                badCoords.append( (rowNum, n) )
    return badCoords
###############################################################################
# Tabular structure support (TODO: Integrate)
#
def getColumn(self:Node, onlyChild:str="tbody", colNum:int=1, colSpanAttr:str=None) -> list:
    """Called on the root of a table-like structure: collect, for each
    element child (row) of the row container, the cell that getCellOfRow()
    picks for 'colNum'. That should amount to the colNum-th column.

    @param onlyChild: If not "", the row container is the child of self
        found by self.selectChild(onlyChild); otherwise it is self.
    @param colSpanAttr: If set, treat that attribute name like HTML "colspan".
    @return The list of cells, or None if there is no usable row container.

    NOTE(review): getCellOfRow is invoked here as a method of each row
    with no 'row' argument, whereas the definition in this file takes
    (self, row, ...) -- confirm how it gets attached to nodes.
    """
    base = self.selectChild(onlyChild) if (onlyChild) else self
    if (not base or base.nodeType != Node.ELEMENT_NODE): return None
    return [
        row.getCellOfRow(colNum=colNum, colSpanAttr=colSpanAttr)
        for row in base.childNodes
        if row.nodeType == Node.ELEMENT_NODE
    ]
#def generateColumn(self): # TODO: Implement
# assert False
def getCellOfRow(self, row:Node, colNum:int=1, colSpanAttr:str=None) -> Node:
    """Return the element child of 'row' that occupies column 'colNum'
    (counting from 1), or None if the row has no such column.

    Pretty much like getting the colNum-th element child, but can account
    for horizontal spans: if 'colSpanAttr' is set, each cell is taken to
    cover as many columns as that attribute says, and a column that falls
    anywhere inside a span yields the spanning cell. A missing or
    non-numeric span value counts as 1.
    (This does not yet adjust for vertical/row spans!)
    """
    if (colNum < 1): return None
    found = 0
    for ch in row.childNodes:
        if (ch.nodeType != Node.ELEMENT_NODE): continue
        span = 1
        if (colSpanAttr):
            try:
                span = max(int(ch.getAttribute(colSpanAttr)), 1)
            except ValueError:
                span = 1    # absent ("") or junk value
        # 'found' is now the last column this cell covers.
        found += span
        if (found >= colNum): return ch
    return None
###############################################################################
#
class SORTTYPE(Enum):
    """The basic ways a column can be sorted. Far from complete...
    Candidates to add: whitespace or unicode normalization; dictionary
    style; human-numeric; date/time/datetime; version numbers; Mac
    auto-split. Cf *nix 'sort'.
    """
    # String-based orderings
    STR = 0
    CASELESS = 1
    TOKENS = 2      # Tokenize at \W+, then sort tokenwise
    MACFILE = 3     # Tokenize by alpha vs. numeric

    # Numeric orderings
    INT = 10
    FLOAT = 11
    CASH = 12       # Strip whitespace and currency chars, then as float.
###############################################################################
#
class NormTable:
    """Manage a DOM table, with a wrapper and a ton of operations.

    An instance holds two things: 'tbl', the DOM node of the table itself,
    and 'topt', a TableOptions instance giving the element and attribute
    names in use.

    Typically, we assume the table has been normalized:
        No rowspans or colspans
        No nested tables
        Always THEAD and TBODY
        One row in THEAD
        All cells have a column-name set in @CLASS, unique per column.
    On the other hand, actual tag names are always indirected through an
    instance of TableOptions.
    """
def __init__(self, topt:TableOptions=None, fromTable:Node=None):
    """Wrap an existing table node, or make a new empty table.

    'topt' supplies the element/attribute names; a default TableOptions
    is made if none is given. If 'fromTable' is None, a starter table is
    built with makeDOMTable(). Otherwise 'fromTable' is adopted as-is
    (with a critical log message if its nodeName is not the configured
    table name) and then run through the normalizers.
    """
    self.topt = topt if (topt) else TableOptions()

    if (fromTable is None):
        self.tbl = self.makeDOMTable()
    else:
        self.tbl = fromTable
        # Compare against self.topt: the 'topt' argument may be None.
        if (fromTable.nodeName != self.topt.TABLE):
            lg.critical("Table node is named '%s', not '%s'.",
                fromTable.nodeName, self.topt.TABLE)
        # Only a caller-supplied table needs normalizing.
        # NOTE(review): these are invoked on the DOM node, although the
        # functions of these names in this file read self.topt and
        # self.getRows(), which suggests the wrapper -- confirm how they
        # are attached.
        self.tbl.unspan()
        self.tbl.unnest()
        self.tbl.ensureOuters()
        self.tbl.ensureColumnIdents()
def makeDOMTable(self):
    """Build a new Document holding an empty starter table: a THEAD that
    contains one empty row, followed by an empty TBODY. The table is the
    document's root element. Returns the table element.
    """
    names = self.topt
    doc = minidom.Document()
    tbl = doc.createElement(names.TABLE)
    doc.appendChild(tbl)

    # appendChild hands back the node it was given.
    thead = tbl.appendChild(doc.createElement(names.THEAD))
    thead.appendChild(doc.createElement(names.TR))
    tbl.appendChild(doc.createElement(names.TBODY))
    return tbl
def makeElement(self, name:str, attrs:Dict=None, text:str=None):
    """Create and return a new, unattached element in the table's
    document.

    'attrs', if given, maps attribute names to values to set on it.
    'text', if non-empty, becomes a single text-node child.
    """
    doc = self.tbl.ownerDocument
    el = doc.createElement(name)
    if (attrs):
        # .items() is required: iterating the dict itself yields only keys.
        for k, v in attrs.items():
            el.setAttribute(k, v)
    if (text):
        el.appendChild(doc.createTextNode(text))
    return el
##################################################### TABLE OPERATIONS
#
def getShape(self):
    """Return the table's size as a (rows, columns) tuple, taken from
    countRows() and countColumns().
    """
    nRows = self.countRows()
    nCols = self.countColumns()
    return (nRows, nCols)
def ownerTable(self, node:Node):
    """Return the nearest table element that contains the given node
    (or is the node itself), or None if it is not inside any table.
    Could also just return self.tbl....
    """
    # Start the climb at the node itself.
    anc = node
    while (anc is not None):
        if (anc.nodeName == self.topt.TABLE): return anc
        anc = anc.parentNode
    return None
def clearTable(self):
    """Remove every child node of the table element, leaving it
    completely empty (so without the THEAD/TBODY that makeDOMTable
    provides). Returns self, to allow chaining.
    """
    # Should this leave same as makeDOMTable?
    # The children belong to self.tbl, so that is where they are removed.
    while (self.tbl.firstChild is not None):
        self.tbl.removeChild(self.tbl.firstChild)
    return self
# def sortBy(self, keyCols:List):
# """Sort the rows by some column(s)
# TODO: Implement. How best to specify keys?
# [ (colNum|colName, SORTTYPE, reverse)+ ]
# """
# assert False
def transpose(self, replace:bool=False):
    """Make a transposed copy of the table body: the n-th cells of the
    original rows become the n-th row of the copy. Cells are deep-copied.
    Rows that are shorter than the widest row are padded with empty TD
    cells so that the columns stay aligned.

    TODO Ewww, what to do with the header? (Only the rows from getRows()
    are transposed; the copy has a TBODY but no THEAD.)

    If 'replace' is True, the copy is spliced into the document in place
    of the original table, and becomes self.tbl.
    In any case, the root of the new transposed thing is returned.
    """
    self.unspan()    # Just in case...

    # Make a destination table
    doc = self.tbl.ownerDocument
    t2 = doc.createElement(self.topt.TABLE)
    b2 = doc.createElement(self.topt.TBODY)
    t2.appendChild(b2)

    # Gather the cells of each source row, ignoring non-cell nodes.
    cellNames = ( self.topt.TD, self.topt.TH )
    grid = [
        [ cell for cell in tr.childNodes if cell.nodeName in cellNames ]
        for tr in self.getRows()
    ]

    # Make as many new rows, as the starting table has columns
    nCols = max((len(cells) for cells in grid), default=0)
    newRows = []
    for _i in range(nCols):
        tr2 = doc.createElement(self.topt.TR)
        b2.appendChild(tr2)
        newRows.append(tr2)

    for cells in grid:
        for cNum in range(nCols):
            if (cNum < len(cells)):
                cell2 = cells[cNum].cloneNode(True)    # deep copy
            else:
                cell2 = doc.createElement(self.topt.TD)
            newRows[cNum].appendChild(cell2)

    if (replace):
        parent = self.tbl.parentNode
        if (parent is not None):
            parent.replaceChild(t2, self.tbl)    # (newChild, oldChild)
        self.tbl = t2
    return t2
##################################################### HEAD/BODY/FOOT OPS
def getHead(self):
    """Return the table's THEAD child element, or None if it has none."""
    # Names come from self.topt, as in getBody()/getFoot().
    return getChildByName(self.tbl, self.topt.THEAD)
def getHeadRow(self):
    """Return the first TR child of the table's THEAD, or None if there
    is no THEAD or it contains no TR.
    """
    # Names come from self.topt, as in getBody()/getFoot().
    h = getChildByName(self.tbl, self.topt.THEAD)
    if (h): return getChildByName(h, self.topt.TR)
    return None
def addHead(self, labels:List=None,
    numbered:str=None, moveRow:bool=False) -> None:
    """Add a table head and head row, and hopefully column labels.
    If there's already a thead, do nothing.

    The new THEAD becomes the first child of the table, and contains a
    single row. That row is sourced from (first that applies):
        * labels: A list of strings; one TH per string.
        * numbered: This string plus a number (counting columns from 1);
          one TH per column, per self.countColumns().
        * moveRow: moving the first body row into the new THEAD.
        * [otherwise]: Fail with AssertionError, before changing anything.
    Each TH made here gets its label both as text content and as @label.

    TODO: Labels vs. contents vs. @CLASS; may want to set all.
    """
    if (self.getHead()): return None
    if (not (labels or numbered or moveRow)):
        raise AssertionError("None of labels, numbers, or moveRow was given.")

    doc = self.tbl.ownerDocument
    if (labels):
        theLabels = list(labels)
    elif (numbered):
        theLabels = [ numbered + str(i)
            for i in range(1, self.countColumns() + 1) ]
    else:
        theLabels = None

    if (theLabels is not None):
        headRow = doc.createElement(self.topt.TR)
        for lab in theLabels:
            newCell = self.makeElement(self.topt.TH, text=lab)
            newCell.setAttribute("label", lab)
            headRow.appendChild(newCell)
    else:
        # moveRow: rows are numbered from 1. None if the body is empty.
        headRow = self.getRow(1)

    thead = doc.createElement(self.topt.THEAD)
    if (headRow is not None):
        # appendChild also detaches the row from the body, if it was there.
        thead.appendChild(headRow)
    # DOM order is (newChild, refChild); a refChild of None appends.
    self.tbl.insertBefore(thead, self.tbl.firstChild)
    return None
def getcolIds(self) -> List:
    """Return the column ids, in order: the @CLASS value of each element
    child of the head row ("" for a cell that lacks it). Returns an
    empty list if there is no head row.
    """
    headRow = self.getHeadRow()
    if (headRow is None): return []
    # Skip text nodes, comments, etc., which have no attributes.
    return [
        th.getAttribute(self.topt.CLASS)
        for th in headRow.childNodes
        if th.nodeType == Node.ELEMENT_NODE
    ]
####### TODO Distinguish column IDENT from column LABEL.
####### TODO Support HTML5-ish LABELs.
def setcolIds(self, colIds:list, alone:bool=True):
    """Not implemented yet; always raises NotImplementedError.

    Intended behaviour: given a list of names, one per column, assign
    each by setting an attribute on every instance of that column. With
    'alone' True the attribute would hold just that name; otherwise the
    name would be added as a space-separated token (if not already
    present), keeping any prior tokens.
    """
    raise NotImplementedError
def setColName(self, col:Union[int, str], name:str, alone:bool=True):
    """Set the assigned attribute (typically "CLASS") on the column
    header and on the corresponding cell of every body row.

    'col' identifies the column as for getColHeader(): by number
    (counting from 1) or by its current name.
    If 'alone' is True, the attribute is assigned that name only;
    otherwise, the name is added as a space-separated token if not
    already there, and any prior tokens remain.

    Raises KeyError if there is no such column.
    """
    header = self.getColHeader(col)
    if (header is None):
        raise KeyError("setColName: no such column: %r" % (col,))

    def cellsOf(row):
        return [ ch for ch in row.childNodes
            if ch.nodeType == Node.ELEMENT_NODE ]

    # Position of the column, found from the header's place in its row.
    colIndex = cellsOf(header.parentNode).index(header)
    targets = [ header ]
    for row in self.generateRows():
        cells = cellsOf(row)
        if (colIndex < len(cells)): targets.append(cells[colIndex])

    for cell in targets:
        if (alone):
            cell.setAttribute(self.topt.CLASS, name)
        else:
            tokens = cell.getAttribute(self.topt.CLASS).split()
            if (name not in tokens): tokens.append(name)
            cell.setAttribute(self.topt.CLASS, " ".join(tokens))
def getBody(self):
    """Return the table's TBODY child element, as found by
    getChildByName(); None if there is none.
    """
    tagName = self.topt.TBODY
    return getChildByName(self.tbl, tagName)
def getFoot(self):
    """Return the table's TFOOT child element, as found by
    getChildByName(); None if there is none.
    """
    tagName = self.topt.TFOOT
    return getChildByName(self.tbl, tagName)
##################################################### ROW OPERATIONS
def countRows(self) -> int:
    """Find out how many (body) rows there are, by counting what
    generateRows() produces.
    """
    # The row generator belongs to this wrapper, not to the DOM node.
    return sum(1 for _tr in self.generateRows())
def getRows(self) -> List[Node]:
    """Get all (body) row elements of this (but not of nested) table,
    as a list in document order. See generateRows() for the lazy form.
    """
    return list(self.generateRows())
def generateRows(self) -> Node:
    """Generate, in order, each TR element that is a direct child of the
    table's TBODY (so header rows and rows of nested tables are not
    included). Generates nothing if the table has no TBODY.
    """
    # getBody() is a method of this wrapper, not of the DOM node.
    bod = self.getBody()
    if (bod is None): return
    for ch in bod.childNodes:
        if (ch.nodeName == self.topt.TR): yield ch
def getRow(self, n:int) -> Node:
    """Fetch one body row by number. Positive numbers count from 1 at the
    top; negative numbers count back from the end (-1 is the last row).
    Zero is not a row number, and trips an assertion.
    Returns None if there is no such row.
    TODO Provide a get-by-key?
    """
    assert n != 0
    if (n > 0):
        # Stop as soon as we have walked far enough.
        for seen, row in enumerate(self.generateRows(), start=1):
            if (seen >= n): return row
        return None

    rows = self.getRows()
    tgtRow = len(rows) + n
    return rows[tgtRow] if (tgtRow >= 0) else None
##################################################### COLUMN OPERATIONS
# These actually operate on whole columns -- not just one cell in a column.
#
def getColHeader(self, n:Union[int, str]) -> Node:
    """Find a column's header cell in the head row.
    An int selects by position among the element children of the head
    row (counting from 1); a str selects the first one whose @CLASS
    equals it. Returns None if nothing matches.
    Raises TypeError for any other kind of argument.
    """
    # TODO Add methods to get from
    if (not isinstance(n, (int, str))):
        raise TypeError("getColHeader: must be string or int.")

    byNumber = isinstance(n, int)
    position = 0
    for th in self.getHeadRow().childNodes:
        if (th.nodeType != Node.ELEMENT_NODE): continue
        if (byNumber):
            position += 1
            if (position == n): return th
        elif (th.getAttribute(self.topt.CLASS) == n):
            return th
    return None
def insertColBefore(self, colNum:int, ident:str, label:str=None, theCells:List=None):
"""Insert an entire column. If theCells is a list of cells, use them;
otherwise construct empty ones (plus a header one containing label).
"""
thead = self.getHead()
if (theCells):
newCell = theCells[0]
else:
newCell = self.makeElement(self.topt.TH, { self.topt.CLASS:ident }, text=label)
refCell = self.getCellOfRow(row=thead, colNum=colNum)
thead.insertBefore(newCell, refCell)
for rowNum, row in enumerate(self.generateRows()):
if (theCells):