#!/usr/bin/env python3
#
# fsplit.py: A better (I hope) str.split() or csv package.
# 2020-02-28: Written by Steven J. DeRose.
#
#pylint: disable=W0603,W0511
#
import sys
import argparse
import codecs
import re
from collections import namedtuple
from typing import Dict, Any, Union, IO, List, Callable, Final
from enum import Enum
import ast
import html
import datetime
import uuid
import array
import unicodedata
import logging
from Datatypes import Datatypes
lg = logging.getLogger()
dt = Datatypes()
__metadata__ = {
    "title"        : "fsplit",
    "description"  : "A better (I hope) str.split() or csv package.",
    "rightsHolder" : "Steven J. DeRose",
    "creator"      : "http://viaf.org/viaf/50334488",
    "type"         : "http://purl.org/dc/dcmitype/Software",
    "language"     : "Python 3.7",
    "created"      : "2020-02-28",
    "modified"     : "2023-11-19",
    "publisher"    : "http://github.com/sderose",
    "license"      : "https://creativecommons.org/licenses/by-sa/3.0/"
}
__version__ = __metadata__["modified"]
descr = """
=Description=
An enhanced `split()` function, and APIs using it that are modelled on
Python's `csv` library (see [https://docs.python.org/3/library/csv.html]).
=Usage=
==Usage from the command line==
To parse a CSV-ish file and see a simple, readable display, just do:
    fsplit.py myFile.csv
This displays a block for each record, containing the record number, followed
by a line for each field, with the name and value. The values are formatted
based on their apparent type (bool, int (or oint or xint), float, complex,
string). If `--visible` is set, control characters are converted to a readable
form (using Unicode "control pictures").
The CSV dialect can be specified using options named like the API's options:
    fsplit.py --delim "\\t" --quotechar '"' myFile.csv
Options can also specify the presence of a header (--header), or
a header-like string (--ifields) for when there is no header record in the
file(s), etc. For example:
    fsplit.py --header --delim "\\t" --quotedouble myFile.csv
Instead of the pretty-printed default display, you can convert a file(s) from
one CSV-ish format to another, by using additional options (with "o" prefixed
to their names, for "output") to specify the output syntax and fields:
    fsplit.py --header --delim "\\t" --quotedouble --odelim ":" --oescapechar "\\" myFile.csv
There is also (in progress) a suite of command-line filters built on top.
See [Related Commands].
==Usage as an API==
This works essentially like Python's built-in "csv" package. A basic example:
    import fsplit
    rdr = fsplit.reader("myFile.csv", delimiter=",", escapeChar="\\",
        quotechar='"', header=True)
    for recnum, fields in rdr.items():
        print("Record %d:" % (recnum))
        for fnum, fval in enumerate(fields):
            print("%s: %s" % (rdr.schema[fnum].fname, fval))
In this example, rdr.schema is constructed when the first record is read,
because of the 'header' argument.
Note that if the schema (however supplied, see below) specifies a datatype for
one or more fields, the values found in those fields are cast to that
type before they are returned from fsplit ('ValueError' is raised if that fails).
If you don't want that behavior, just don't specify datatypes.
Instead of the list of syntax arguments (in this example 'delimiter', 'escapeChar',
'quotechar', and 'header'), a 'dialect' argument may be passed a Dialect or DialectX
object which was created with the desired syntax options.
A dialect object deals with syntax options such as quoting, escaping, space-stripping,
line-ends, etc. It can be created manually like:
myDialect = DialectX("monty", delimiter="\t", escapeChar="\\",
quotechar='"', header=True)
rdr = fsplit.reader("myFile.csv", dialect=myDialect)
...
In addition, fsplit supports 'FieldSchema' objects, which describe
some simple semantics of the data such as the names and order of fields, their
datatypes, and so on. See section [Header Format] for more details.
In short, a FieldSchema can be set up in several ways:
* created from a regular header line (by setting the 'header' option as shown
above). The header line may have just names, or more.
* created from a header-like string and passed in:
fakeHead = "name|UPPER:str!,dob:datetime,age:int[0:150],zip"
mySchema = FieldSchema(fakeHead)
rdr = fsplit.reader("myFile.csv", dialect=myDialect, schema=mySchema):
...
* created from a list of individual header-like items (this lets you avoid
escaping in case delimiters occur in the header information):
headItems = [ "name|UPPER:str!", "dob:datetime", "age:int[0:150]", "zip" ])
mySchema = FieldSchema(headItems)
* created with just a list or string of field names (they remain untyped) and passed in
(these are just special cases of the prior two, that use no datatype, normalizations,
constraints, or defaults):
mySchema = FieldSchema(fieldNames="name,dob,age,zip")
or
mySchema = FieldSchema(fieldNames=[ "name", "dob", "age", "zip" ])
* created manually and passed in (this supports some features that can't be
expressed in a regular header record, such as custom normalizer functions
and output format hints):
    mySchema = FieldSchema()
    mySchema.append("name", ftype=str, frequired=True, fnormalizer=str.upper)
    mySchema.append("dob", ftype=datetime, fnormalizer=myDateFunction)
    mySchema.append("age", ftype=int, fmin=0, fmax=150, fformat="%3d")
    mySchema.append("zip", ftype=int, fmin=0, fmax=99999, fformat="%5d")
* left implicit. In that case, after the first record is read but before its fields
are returned, a trivial schema is generated, with untyped fields named "Field_01",
"Field_02", etc.
Dialects and schemas have many more options.
===Classes===
The `fsplit` package includes these classes:
* DialectX -- very much like csv.Dialect, but with more items.
One DialectX is predefined, as `__RFC4180__`, with those settings (`header`
is set to False, since RFC 4180 makes it optional; this does mean that if
there is a header (and you don't set `header` or read the header record
manually), the header record is taken as data).
* DictReader -- like csv.DictReader
* DictWriter -- like csv.DictWriter
* UnclosedQuote(Exception) -- When `DictReader` calls `fsplit()` to parse
a record, if the record is incomplete due to ending within a quoted literal,
this exception is raised. `DictReader` then reads and appends the next record,
and tries the parse again.
* FieldSchema(list) -- An ordered list of fields, which can be set up
manually or from a header or similar source. Each field is a FieldInfo instance.
Think of this as a CSV header on steroids.
* FieldInfo encapsulates properties of a specific field. You can construct them
directly, with the properties below as arguments (only "fname" is required),
and headers work by parsing each header entry and constructing FieldInfos:
** ''fname'' -- the field name, typically an identifier token
** ''ftype'' -- one of the known types, or at least its name
** ''fconstraint'' -- a field constraint as a string (see [Header Format] for
the type-dependent values that can be used)
** ''fdefault'' -- a default value (None may mean no default value, or that the
default value is None -- how would you tell?)
** ''frequired'' -- if set, the field must always be non-empty in the data
** ''fformat'' -- a preferred output format, for example "%12.4f". Or a Callable
that takes the (possibly typed) field value and returns a string. A Callable
here is probably most useful to handle "special" values such as None, reserved
codes for things like "missing" or "not applicable", etc.
** ''fnormalizer'' -- a Callable which is the first thing applied to a field once
it is split out from the record (this is still after quote removal and unescaping).
The Callable should take and return strings. Several named normalizers can be
specified in the header, by inserting them before the []-constraint (see section
[Header Format]).
** ''fnum'' -- the field number, counting automatically from 1 as FieldInfo objects
are appended to a FieldSchema.
* DatatypeHandler -- this supports many basic datatypes, which can be specified
for particular fields via `FieldSchema`. Parsed values can then be tested and
cast as appropriate. The class also has `autoType()`, which will cast a value
to the most specific type it fits (for example, "9" to int, but "9.1" to float).
You can set `reserved` to a Dict, which `autoType()` will search for the
string value that was parsed (minus whitespace). If it is found, the value
from the dict is returned. Otherwise `autoType()` will go on to try other types.
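For example (an illustrative sketch, not the exact API; here 'reserved' is
set as an attribute, per the description above):
    dth = DatatypeHandler()
    dth.reserved = { "N/A": None }
    dth.autoType("9")       # -> 9 (int)
    dth.autoType("9.1")     # -> 9.1 (float)
    dth.autoType("N/A")     # -> None (found in 'reserved')
    dth.autoType("maybe")   # -> "maybe" (left as str)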
===A wider range of syntax options===
Most of the *nix fielded-data utilities have little or no support for
quoted fields or escaped characters. No two (afaict) support the same range of
delimiter conventions. The Python `csv` library is stronger, but still lacks
syntax support such as:
* Delimiters
** Multi-character delimiters
** Ignoring repeated delimiters (`delimiterrepeat`) (like `sed`)
** Delimiters used in alternation (`delimitercycle`) (like `paste -d`) (Experimental)
* Quoting
** Multiple alternative quote characters (for example, single and double)
** Unicode quote characters (curly, angle, etc.)
** Distinct open and close quotechars
** Quoted fields that can include newlines (`quotednewline`)
I dislike this usage, but Python `csv`, RFC 4180, and some spreadsheets use it.
(Experimental)
* Escaping and special characters
** The common letter-escapes (\\t, \\n, \\r, \\f, \\v, \\a (BEL), \\e (ESC))
** Hex escapes like \\xFF (`xescapes`)
** Wide hex escape like \\uFFFF and \\U0001FFFF (`uescapes`)
** HTML/XML character references like &#xFFF;, &#999;, &bull;, etc. (`entities`)
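For example, with those options on, all of the following should decode to a
literal bullet (U+2022) within a field (a sketch; option names as above):
    myFields = fsplit("a,b\\u2022b,c&bull;c,d\\x{2022}d", delimiter=",",
        escapechar="\\\\", uescapes=True, xescapes=True, entities=True)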
===Datatype-related features===
Header records here may include not just field names, but also datatypes,
value constraints, and default values (see section [Header Format]).
With a sufficient header, fsplit will:
* replace empty fields with that field's default value (if any)
* raise ValueError for empty fields that are declared as required ("!" in the header)
* cast them from the raw strings read to the declared types
(raising ValueError in case of failure).
* test values against the field's constraints, such as min/max values, regexes, etc.
(raising ValueError in case of failure).
Instead of fancy headers, or in addition (for fields which do not declare a type),
'autotype' is available.
=Usage details=
It's probably cleanest (and slightly faster) to define the set of
options you want as a `DialectX` (similar to `csv.Dialect`) and then just
pass it to `fsplit()` along with each record in turn:
myDialectX = DialectX("bestCSVever",
delimiter=",",
doublequote=False,
escapechar="\\\\",
quotechar='"',
xescapes=True)
for rec in f.readlines():
myFields = fsplit(rec, dialect=myDialectX)
print("Record %d:\\n " + join("\\n ", myFields))
==Default CSV dialect==
The default dialect is the same as for Python `csv`. The shared properties are:
    delimiter:str = ','
    doublequote:bool = True
    escapechar:str = None (no escaping)
    lineterminator:str = '\\r\\n' (accepts \\r and \\n, too)
    quotechar:str = '"'
    quoting:enum = QUOTE_MINIMAL
    skipinitialspace:bool = False
    strict:bool = False
The extended properties are:
    autotype:bool = False
    comment:str = None
    delimitercycle:bool = False
    delimiterrepeat:bool = False
    encoding:str = "utf-8"
    entities:bool = False
    header:bool = False
    maxsplit:int = None
    minsplit:int = 0
    quotednewline:bool = False
    skipfinalspace:bool = False
    uescapes:bool = False
    xescapes:bool = False
Note: the defaults are not all the same as defined by RFC 4180 (q.v.).
However, the RFC settings are available as a predefined DialectX, in __RFC4180__.
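For example, to parse with the RFC settings (a sketch, using the reader()
API shown above):
    rdr = fsplit.reader("strict.csv", dialect=fsplit.__RFC4180__)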
==Custom DialectX==
To construct a DialectX you can use individual option arguments as shown
above. Or, if you want to set them via command-line options, you can do this:
    parser = argparse.ArgumentParser(description=...)
    ...
    DialectX.addargs(parser)
    args = parser.parse_args()
    myDialectX = DialectX("bestCSVever", args)
Or you can pass format options directly to `fsplit()`:
    for recnum, rec in enumerate(f.readlines()):
        myFields = fsplit(rec, delimiter="\\t",
            doublequote=False, escapechar="\\\\", quotechar='"', xescapes=True)
        print("Record %d:\\n    %s" % (recnum, "\\n    ".join(myFields)))
You can read and parse a line at a time as above (unless your data has newlines
in quoted fields!). Or you can operate at the file level, for example using:
    DictReader(f, fieldNames=None, restkey=None, restval=None, dialect=None)
`f` must be an open file handler or other readable. Usage is just like
the corresponding class in `csv`:
    import fsplit
    with codecs.open("names.csv", encoding="utf-8") as csvfile:
        reader = fsplit.DictReader(csvfile)
        for row in reader:
            print(row["first_name"], row["last_name"])
Field names can be read from a header record by setting the `header` option
in a DialectX, and/or specified to the constructor.
* DictWriter(f, fieldNames, restval="", extrasaction="raise", dialect=None)
    import fsplit
    with codecs.open("names.csv", "w", encoding="utf-8") as csvfile:
        fieldNames = ["first_name", "last_name"]
        writer = fsplit.DictWriter(csvfile, fieldNames=fieldNames)
        writer.writeheader()
        writer.writerow({"first_name": "Baked", "last_name": "Beans"})
        writer.writerow({"first_name": "Lovely", "last_name": "Spam"})
        writer.writerow({"first_name": "Wonderful", "last_name": "Spam"})
* reader()
* writer()
==DialectX options==
A dialect has many options. Several match ones from Python `csv`, with
the same names and at least the same values. You can add them all to an argparse
instance like this:
    myParser = argparse.ArgumentParser()
    ...
    DialectX.addargs(myParser, prefix="", csvOnly=False)
    args = myParser.parse_args()
    dx = DialectX()
    dx.applyargs(myParser, prefix="")
("prefix" can be used to ensure that DialextX's options do not collide
with other options in the argument parser).
The options like those in `csv` include:
* ''delimiter'':str = "," -- The field separator character or string.
This can be a single string, or a list of several. If it's a list, the
delimiters are used in rotation.
* ''doublequote'':bool = True -- If set, allow putting `quotechar` inside
a quoted string, by doubling it. I'm not fond of this defaulting True,
but that's what Python `csv` does, so I stuck with it.
* ''escapechar'':str = None -- If set, this character can be put before
a quotechar, delimiter, or itself, to make that character literal rather
than special. The escapechar can also be used before any of
'f', 'n', 'r', 't', 'v', '\\\\', or '0' for the usual meanings.
* ''lineterminator'':str = "\\n" -- What string to write out to indicate
end-of-record. The three usual sequences are all accepted for input.
* ''quotechar'':str = '"' -- Sets what characters or character-pairs are recognized
as quotes (`setupQuoteMap()` handles all of this). Where Python `csv` and
many *nix utilities only accept a single delimiter characters, this also
accepts:
** ''None'' or '', no characters are treated as quotes.
** ''a single character'': that character is the only recognized quote,
and serves as both open and close.
** ''a 2-character string'': the first character is treated as open quote,
and the second as the corresponding close quote. Only the close quote is
subject to `doublequote`, though both are subject to `escapechar`.
With `MINIMAL` quoting, only the close quote gets escaped within output values.
** Various mnemonic names for Unicode open/close quote pairs,
as well as plain apostrophe and double quote. To see a list, use '--showQuotes'.
** ''BOTH'': single quote (apostrophe) and double quote are each
recognized as quotes, as is common in programming and markup languages.
A string enclosed in single quotes may freely contain double quotes,
and vice versa.
** ''ALL'': single and double quotes (as with "BOTH"),
as well as all the other here-named Unicode quotation marks.
* ''quoting'': Determines when output fields should be quoted. Specify one of
these constants, whose names and values are intended to be the same as in `csv`
(you can instead give the string name of one of them):
    fsplit.QUOTING.MINIMAL    = 0  # only quote if needed (the default)
    fsplit.QUOTING.ALL        = 1  # quote all fields.
    fsplit.QUOTING.NONNUMERIC = 2  # quote all non-numeric fields.
    fsplit.QUOTING.NONE       = 3  # never quote fields, but use escapechar
Python csv does NONNUMERIC by type, so still quotes digit-strings. See [#To do].
* ''skipinitialspace'':bool = False -- If set, records (not fields)
have any initial (Unicode) whitespace stripped before parsing.
* ''strict'':bool = False -- If set, problems raise `ValueError`
instead of being worked around or ignored.
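Here is a sketch of some quotechar settings described above (mnemonic names
for Unicode quote pairs are listed by '--showQuotes'):
    dx1 = DialectX("plain", quotechar='"')     # '"' opens and closes
    dx2 = DialectX("curly", quotechar="“”")    # 2 chars: open, then close
    dx3 = DialectX("multi", quotechar="BOTH")  # "'" and '"' both recognized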
===Additional options===
The following options are not available in Python's `csv` package:
* ''autotype'' = False -- If set, fields with no overriding type declared (via
a header record, '--ifields', or the API)
are examined and cast to the
most specific type they can (from the types supported in header declarations).
Booleans are not attempted (for more information see section [autoTyping]).
* ''comment'':str = None -- If set and not '', lines starting with
this string are treated as comments rather than data. Leading space is NOT
allowed before the comment marker (this might be added).
* ''delimitercycle'':bool = False -- If set, and there are multiple delimiters
specified, use those delimiters in alternation like 'paste -d'.
This is not yet supported, and is not expected to be happy with named
multi-delimiter sets such as BOTH and ALL.
* ''delimiterrepeat'':bool = False -- If set, multiple adjacent delimiters do
NOT result in empty fields, but are treated the same as a single delimiter.
This would typically be done when the delimiter is just a space (which can also
be accomplished with the (so far experimental) regex delimiters).
Empty fields can, however, be inserted by quoting them (because the delimiters
on each side are no longer "adjacent"). This option should probably not be
used in combination with a list of delimiters (see 'delimitercycle').
* ''encoding'' = "utf-8" -- what encoding to use. This affects escaping.
* ''entities'':bool = False -- If set, character references are recognized
and replaced as in HTML. Decimal (&#8226;), hexadecimal (&#x2022;), and the
ubiquitous named forms (&bull; etc.) are all supported.
NOTE: These are converted via Python's `html.unescape()` method, which does not
raise an error for unknown entities such as `&foo;`. References do not have to
be inside quotes.
* ''header'':bool = False -- If set, the first record of each input file is
expected to be a header, giving at least a name for each field.
Headers can be read manually instead, and parsed via parseHeaderStrToSchema().
Field names in the header are also allowed to have suffixes much like Python
type hints, to specify their datatypes, rudimentary constraints (see below), and/or
a default value ("!" asserts that the field is required). Via the API, additional
functionality may be added, such as a map from particular string values to
constants (say, "NaN" or "-" to None).
* ''maxsplit'':int = 0 -- If positive, splitting a record stops after
this many splits have occurred (making this many + 1 tokens; 0 means unlimited).
* ''minsplit'':int = 0 -- If specified, an exception is raised if fewer than
this many splits are made.
* ''skipfinalspace'':bool = False -- If set, records (not fields)
have any trailing (Unicode) whitespace stripped before parsing.
* ''uescapes'':bool = True -- If set, escapes like \\uFFFF may be used.
This uses `escapechar` (not necessarily backslash).
If `escapechar` is set but not `uescapes`, an escapechar before 'u' will
not raise an error (even with `strict`), because that escape just ensures
(unnecessarily) that the 'u' is literal.
* ''xescapes'':bool = True -- If set, escapes like \\xFF may be used.
This uses `escapechar` (not necessarily backslash).
If `escapechar` is set but not `xescapes`, an escapechar before 'x' will
not raise an error (even with `strict`), because that escape just ensures
(unnecessarily) that the 'x' is literal.
===Header Format===
If the 'header' option is set, the first record is treated as a header.
It should give names for all the fields, separated by the same delimiter as
in the rest of the file. The names should typically be identifiers:
starting with a (Unicode) word-character (letter, syllable, or ideograph),
followed by those, digits, and/or underscores.
The header can be just a list of field names, as supported by many CSV-ish
parsers. Or, you can optionally provide more information for some or all
fields. The syntax and meanings are similar to Python type hints:
    name:type=default,name2:type2,name3=default3,...
The types are shown in these examples:
myFlag:bool "1" or "0" (see below to use other values like T or F)
age:int decimal integer
cookie:xint hexadecimal integer
perm:oint octal integer
quant:anyint accepts octal, decimal, or hex
balance:float floating-point number
root:complex python notation
lastName:str string (this doesn't do much).
dob:date date (ISO 8601)
Other types will likely be added
(see https://www.w3.org/TR/xmlschema-2/#built-in-primitive-datatypes).
Defaults can be given after an "=", or "!" may be used to indicate
that a non-empty value is ''required''
(default values can contain quotes or delimiters only if escaped the same way as
other data in the file):
    obsolete:bool=0
    payGrade:float!
A ''normalizer'' name can be specified immediately after the type name,
separated by "|". Underneath, a normalizer is a function that takes the
field value (after unescaping and unquoting if applicable), and returns a
string (or raises an error on failure). They can be used for
case-folding, detecting special "reserved" values (such as "-" for Not
Applicable, which one might map to None), etc.
Custom normalizers can be set via the 'fnormalizer' parameter when
constructing a FieldInfo object in the API, but not via the header.
The following named normalizers are predefined, and can be given in the header:
    UPPER -- force the value to uppercase
    LOWER -- force the value to lowercase
    NFKD, NFKC, NFC, NFD -- apply that Unicode normalization form
    XSP -- apply XML space normalization (just space, TAB, CR, LF)
    USP -- Unicode space normalization (all \\s)
    TF -- fields beginning with [TtYy] go to 1, [FfNn] go to 0 (if the field is
        declared type bool, those will then be recognized and cast to True and False)
    ASCII -- non-ASCII characters to \\x, \\u, \\x{} escapes
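For example, this header entry upcases each value (via UPPER) before the
[]-constraint (described next) is checked:
    province|UPPER:str[NL|PE|NS|NB|QC|ON|MB|SK|AB|BC|YT|NT|NU]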
A value ''constraint'' can also be included in [] immediately after the datatype
and normalizer name. The forms of constraints are type-specific:
* bool: Not currently applicable. A possible
future addition is to give the actual strings to be used for False and True,
since practice for these varies so much.
* int: the constraint can have 2 (possibly-signed) decimal integers, separated
by a comma, that are the minimum and maximum values allowed. These values
are inclusive bounds. The first defaults to 0, the second to the parsing
system's preferred maximum unsigned integer value (sometimes 2**63 - 1).
* float: similar to those for int, but the minimum and maximum values are floats
(as always, be careful about roundoff errors). These values are inclusive bounds.
* complex: Not currently applicable.
* str: the constraint is a (Python-style) regular expression that each value
must match in its entirety. Such constraints commonly require escaping.
These constraints can be used to implement other things, such as enums:
    compass:str[N|E|W|S|NE|NW|SE|SW]=N
    province:str[NL|PE|NS|NB|QC|ON|MB|SK|AB|BC|YT|NT|NU]!
or length limits:
    middleInitial:str[.{0,1}]
''Note'': regex constraints are case-sensitive.
TODO: Fix this, perhaps by adding a prefix flag, or \\L \\U, or....
When a ''type'' is specified, raw field values are cast to that type
for return (for example, by reader() and DictReader()).
When a field is empty and a default value
was set, the default value is cast to the specified type and returned.
Fields with no type hint in the header default to type 'str'.
Fields with no default set in the header get "" when omitted (which may then
be cast as needed for the datatype, for example to 0).
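For example, given this header (with a comma-delimited dialect):
    age:int=0,name:str!
the record ",Pat" yields [0, "Pat"]: the empty 'age' gets the default "0",
cast to int, while an empty 'name' raises ValueError because of the "!".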
==What if there's no header record?==
For files lacking a header record, you can do several things:
* do nothing (and don't set '--header') -- in that case, field names like
"Field_01" are assigned as needed.
* add a header record to your data file(s).
* use the '--ifields' option on the command line, which takes exactly the
same syntax as a header record.
* provide the information via the API. To achieve this, construct
a 'FieldSchema' object and use 'append()' to define any number of fields (these
are represented by 'FieldInfo' objects, which can be constructed given only a name,
or with arguments for type, constraint, required, and so on).
FieldInfo objects offer extra features such
as mapping reserved strings (say, "True", "False", "N/A", etc.) to values.
A field for which no type is set is returned as a literal string unless
'autotype' is turned on (see next section).
===autoTyping===
If `autotype` is set on the DialectX, then any field which does not have an explicit
type set (via header, options, or API as described above), is passed to
a function which tries to cast it to an appropriate type.
This has potential problems. Some data uses "1" and "0" for booleans, which
could also be ints (or perhaps floats). Other data might use reserved
words such as "True", "T", or "#T",
which could of course be strings.
Because of this, autotyping never decides something is a boolean.
Similarly, any int could also be a float. Anything at all could be a string.
Leading and trailing whitespace is stripped before testing types.
However, items that remain as strings are returned with the whitespace intact
(unless the `skipinitialspace` or `skipfinalspace` option is also set).
If `reserved` is supplied, it must be a dict of what to return for any special
values -- say, "--" for NaN, "#T" for True, or "999" for None, etc.
If `specialFloats` is set, then the strings "NaN", "inf", "-inf", and "+inf"
(all case-sensitive) are recognized as floats (otherwise they are strings).
TODO: -0 should also be distinguished in this case.
Complex numbers must be of the Python form "1+1j".
Numbers are taken as ints if, after space stripping, they contain Latin decimal
digits and nothing else (see Python str.isdigit()). Thus, "1." and "1.0" and
"1.1" and "1E+2" are all returned as floats.
Dates and times are recognized by regex matching against the expressions below,
which are intended for [ISO 8601] format. There are of course many other
formats out there, which this code does not recognize.
    dateRegex = r'\\d\\d\\d\\d-\\d\\d-\\d\\d'
    timeRegex = r'\\d\\d:\\d\\d:\\d\\d(\\.\\d+)?(Z|[-+]\\d+)?'
    dateTimeRegex = dateRegex + 'T' + timeRegex
Anything not recognized as one of the types just described, is a string.
'''Note''': Autotyping is not used if you pass a list of types to
the `fsplit()` option `types`. With a list, each token is explicitly cast to
the given type (which may fail). Remember that Python `bool()`
takes any non-empty string as True, even "0" or "False".
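For example (illustrative; exact results depend on the settings just described):
    myFields = fsplit("42,3.14,2021-01-01,hello", delimiter=",", autotype=True)
    # -> [ 42, 3.14, datetime.date(2021, 1, 1), 'hello' ]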
=Related Commands=
Python `csv` [https://docs.python.org/3/library/csv.html].
My `csv2xml.py` -- Obsolete precursor to `fsplit.py`, but has a number of
possibly useful export formats.
See "also my related scripts such as `Record.py`, `homogeneous.py`,
`Datatypes.py` for some other features that may be added.
There is also (in progress) a suite of command-line filters built on top.
They mimic the *nix commands that support fields, but aim to unify the "CSV"-ish
support across filters (the usual ones differ in what they can and
cannot parse; most don't support quoting, escaping, or Unicode).
See [http://github.com/sderose/CSV] and [http://github.com/sderose/XML/CONVERT].
When finished, the suite should include versions of:
awk, colrm, column, cut, join, paste, sort, tab2space, uniq, xargs.
Perhaps also: colrm, expand, lam, look, sed, unexpand, fold, and a variant
of 'less' with integrated CSV pretty-printing.
=References=
Johann Mitlohner, et al. "Characteristics of Open Data CSV Files"
[https://www.researchgate.net/profile/Sebastian-Neumaier/publication/308568784_Characteristics_of_Open_Data_CSV_Files/links/6217490eb85f8c427cd62502/Characteristics-of-Open-Data-CSV-Files.pdf]
=Known bugs and Limitations=
Smoke tests for multi-character delimiters are failing.
Not sure whether you can (or should be able to) escape in mid-delimiter.
You cannot combine delimiterrepeat and delimitercycle.
Field names passed to DictReader, or from --ifields,
should probably be allowed to override any
read from a header record. But currently they're compared and an exception
is raised if they don't match. TODO: Check if still true.
'--convert' does not copy comment lines, even if an output dialect comment
marker is specified. How best to return comments during parsing?
'escapechar' is recognized both inside and outside of quotes.
'escapechar' at end of line does not make the newline into data (but see
'quotednewline').
Unknown escapes such as \\q just mean the literal character, and no error
is issued even with strict=True. However, syntax errors within \\x \\u and \\U
are reported, and cause an exception if strict=True. The dict listing
known escapes (other than [xuU]) is Escaping.escape2char. It can be modified, though
success is not guaranteed especially if you add entries that collide with
other special characters, such as quotes, delimiters, etc.
What's the right rule for a quote not immediately following a delimiter?
As in
1, "two", 3, 4"five"5, 6
Date and time processing is not supported for dates prior to the Gregorian
calendar change (October 1582). However, it will likely behave correctly
assuming the [[proleptic Gregorian calendar]].
timedelta assumes a year is 365.2425 days (the Gregorian average), and a
month is 30 days.
=To do=
===Checking===
* system for reporting when the number of fields is off, or fields don't fit
their declared datatypes.
* generate header w/ typehints and maybe defaults, given data?
* better treatment of non-associating values and boolean codings
===Format variations===
* Add a convention for putting metasyntax in first record. Maybe like below
(though what is ok for escaping in values?):
    #CSVPP: (optname=value)+
* Add a QUOTING option like NONNUMERIC, but that treats strings of digits
as numerics, rather than going by type.
* Delimiter as "whitespace to nonspace transition" (like awk, column, sort).
* Support for a wider range of formats (probably via separate
packages or subclasses with much the same API), such as:
** HTML and XML tables (but see my `htmlTable2csv.py`).
** MediaWiki and MarkDown tables (but see my `markdown2Xml.py`).
** XSV (but see my `xsv2others.py`).
** JSON cases (but see my `json2xml.py`).
** s-expressions and perhaps CONLL (but see my `sexp2xml`).
** SQL INSERT statements, and generation of table def from schema, v/v.
** Tables extracted straight from a SQL API
** Fixed column widths, maybe via scanf?
===Functional additions===
* Some kind of hysteresis for autotyping? Like, if this field has always been int
and we see "", make it 0; if always float and we see 12, make it 12.0.
* Option to create NamedTuple or tuple instead of list or dict?
* Should regex constraints be enclosed not by {}, but by // or by any \\W?
* Should csv skipinitialspace/skipfinalspace be overridable per-field?
Maybe something more for Unicode spaces, esp. hard-space per se?
* Add predefined dialects: RFC4180, Excel, and any that Python `csv` provides.
Also eponymous ones for each relevant *nix command?
* Way to change the set of backslash codes, such as \\b for BEL, etc.?
* Way to distinguish only-precise-to-the-hour from on-the-hour, etc.
* Add way to control case treatment of string constraints (and how does
constraint relate to normalization?)
===Testing===
* Add smoke tests for datetimes, autotype, quotednewline, normalizers,
constraints, complex headers.
* Make sure it can just take a csv.Dialect for the DialectX arguments.
===Coding stuff===
* Lose remaining refs to args from inside classes.
* Per RFC 4180, complain about rec-final delimiter if not quoted.
* Perhaps support Python \\N{unicode-name}?
* Allow years >9999, etc. See https://www.w3.org/TR/xmlschema-2/#isoformats
Don't forget leap seconds.
* Sync regex constraints w/ auto-anchored XSD approach.
* Support XSD regex \\#x and \\p{prop}?
* Add a way to get at the last raw record (since reader() parses it),
and the logical/physical record numbers.
=History=
Written by Steven J. DeRose, 2020-02-28.
* 2020-08-07: Add classes and API to look essentially like Python `csv`,
including `DictReader` and `DictWriter`.
Separate options into a "Dialect" class, add `add_my_arguments`.
Start `multidelimiter` (later renamed `delimiterrepeat`).
Add `comment`. Rewrite doc.
* 2020-09-06: Better error messages. Renamed 'types' to 'typeList'.
Refactor test cases and error handling.
* 2020-10-21: Start support for controlling order of fields with DictWriter.
Pull in `formatScalar` from my `alogging`.
* 2021-07-12: Start supporting delimiter cycling like `paste -d`.
* 2023-08-22: Improve help. Update for current Python.
* 2023-09-20ff: Build in __RFC4180__. Clean up support for \\u \\x, add
\\U and \\x{}. Add missing options to addargs(). Add 'header' option and
support for type-hint style header records. Make reader() smart about
unclosed quotes.
* 2023-10-31ff: Refactor. Promote FieldInfo from namedtuple to class.
Add parseHeaderStrToSchema(), handling of type hints and defaults, open/close
quote support. Add cleanField() to do strip, unquote, unescape, entities,
normalization, etc. Start main beyond basic smoke-test.
Add --convert vs. prettyprinting. Reorg quote pairs and
option defaulting. Add --showQuotes and --showDialectX. Refactor
reader() and writer() to be like csv ones (generator classes).
Ditch 'typelist'. Add DictWriterXSV, SAXReader.
Rename `multidelimiter` to `delimiterrepeat`, 'cycle' to 'delimitercycle'.
Add support for regex delimiters. Factor out class `Escaping`.
=Rights=
Copyright 2020-02-28 by Steven J. DeRose. This work is licensed under a
Creative Commons Attribution-ShareAlike 3.0 Unported license.
See [http://creativecommons.org/licenses/by-sa/3.0/] for more information.
For the most recent version, see [http://www.derose.net/steve/utilities]
or [https://github.com/sderose].
=Options=
"""
###############################################################################
#
class Escaping:
    """Manage encoding/decoding of special characters as backslash or other
    escapes, entities, etc. Individual cases are supported by Python built-ins,
    but we need to be switchable among lots of combinations, including weird
    ones that show up in legacy or quirky data.
    Beware of regex and other escapes, like \\L \\U \\s \\d \\w
    """
    escape2char = {
        "0": "\x00",   # U+00 null  # WARNING: Collides with \\0777 octal
        #"a": "\x07",  # U+07 bell
        #"b": "\x08",  # U+08 backspace
        "t": "\x09",   # U+09 tab
        "n": "\x0A",   # U+0A line feed
        "v": "\x0B",   # U+0B vertical tab
        "f": "\x0C",   # U+0C form feed
        "r": "\x0D",   # U+0D carriage return
        "e": "\x1B",   # U+1B escape
        "\\": "\x5C",  # U+5C backslash
    }
    char2escape = {
        "\x00": "\\0",   # U+00 null
        "\x07": "\\a",   # U+07 bell
        #"\x08": "\\b",  # U+08 backspace
        "\x09": "\\t",   # U+09 tab
        "\x0A": "\\n",   # U+0A line feed
        #"\x0B": "\\v",  # U+0B vertical tab
        "\x0C": "\\f",   # U+0C form feed
        "\x0D": "\\r",   # U+0D carriage return
        "\x1B": "\\e",   # U+1B escape
        "\x5C": "\\\\",  # U+5C backslash
    }
    def __init__(self):
        assert False, "Don't instantiate static class 'Escaping'."
    @staticmethod
    def decodeEscape(s:str, i:int, lastToken:str, dx:'DialectX') -> (str, int):
        """Having found an escapechar at offset 'i', parse and return
        * what the following sequence represents, and
        * the length to be consumed (incl. the escapechar).
        """
        if (i+1 >= len(s)):
            syntaxError(s, i, lastToken, "escape at end of line", strict=dx.strict)
            return "", 1
        c1 = s[i+1]
        if (c1 == "x"):
            lg.info("got xescape at %s", s[i:])
            if (not dx.xescapes):
                syntaxError(s, i, lastToken,
                    "\\x escape but option is off", strict=dx.strict)
                return ("\\x", 2)
            # Match just past the escapechar: either \xFF or \x{FFF...}.
            mat = re.match(r"x([\da-f]{2})|x\{([\da-f]+)\}", s[i+1:])
            if (not mat):
                syntaxError(s, i, lastToken,
                    "Incomplete \\x escape", strict=dx.strict)
                codePoint = None
                escValue = dx.escapechar + "x"
                escLength = 2
            elif (mat.group(1)):
                codePoint = int(mat.group(1), 16)
                escLength = 4
            else:
                codePoint = int(mat.group(2), 16)
                escLength = len(mat.group(2)) + 4
            if (codePoint is not None):
                # Check the range before chr(), which would raise ValueError.
                if (codePoint > sys.maxunicode):
                    syntaxError(s, i, lastToken, "\\x outside Unicode range (%x)."
                        % (codePoint), strict=dx.strict)
                    escValue = dx.escapechar + "x"
                    escLength = 2
                else:
                    escValue = chr(codePoint)
        elif (c1 in "Uu"):
            lg.info("got uescape at %s", s[i:])
            if (not dx.uescapes):
                syntaxError(s, i, lastToken,
                    "\\%s escape but option is off" % (c1), strict=dx.strict)
                return ("\\" + c1, 2)
            if (c1 == "u"):
                mat = re.match(r"u[\da-f]{4}", s[i+1:])
                if (not mat):
                    syntaxError(s, i, lastToken,
                        "Incomplete \\u escape", strict=dx.strict)
                    escValue = dx.escapechar + "u"
                    escLength = 2
                else:
                    escValue = chr(int(s[i+2:i+6], 16))  # the 4 hex digits
                    escLength = 6  # escapechar + "u" + 4 digits
            else:  # c1 == "U"
                mat = re.match(r"U[\da-f]{8}", s[i+1:])
                if (not mat):
                    syntaxError(s, i, lastToken,
                        "Incomplete \\U escape", strict=dx.strict)
                    escValue = dx.escapechar + "U"
                    escLength = 2
                else:
                    escValue = chr(int(s[i+2:i+10], 16))  # the 8 hex digits
                    escLength = 10  # escapechar + "U" + 8 digits
        else:
            lg.info("got char escape at %s", s[i:])
            escValue, escLength = Escaping.unescapeViaMap(c1, i, strict=dx.strict)
        return escValue, escLength
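    # Example (hedged; assumes dx.escapechar == "\\" and dx.xescapes on):
    #     Escaping.decodeEscape(r"a\x41b", 1, "", dx)  ->  ("A", 4)
    # i.e. the 4 characters starting at offset 1 ("\x41") decode to "A".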
    @staticmethod
    def unescapeViaMap(c:str, i:int, strict:bool=False) -> (str, int):
        """If it wasn't some special/long escape like \\x \\u \\U, look up the
        escaped char and return what it means, or just itself.
        Also return the length (currently always 2, but needn't be).
        """
        if (c in Escaping.escape2char): return Escaping.escape2char[c], 2
        if (strict):  # TODO Maybe this is never an error?
            syntaxError(c, i, "", "Unrecognized escape \\%s" % (c), strict=strict)
        return c, 2
    # TODO Sync with regular 'csv' package
    @staticmethod
    def escapeOneChar(dx:'DialectX', c:Union[str, re.Match]) -> str:
        """Can be used on a raw character or an re.Match object with the
        desired character in the 1st capture. Always recodes the character
        somehow (or fails), so only call it when needed (see getProblemChars()).
        """
        if (isinstance(c, re.Match)):
            c = c.group(1)
        assert c in dx.problemChars
        if (c == dx.escapechar):
            return c+c
        if (c == dx.quotechar):
            if (dx.quoting): return c  # Caller must do the quoting! TODO Check
            if (dx.doublequote): return c+c
            elif (dx.entities): return dx.entify(c)
            elif (dx.escapechar): return dx.escapechar + c
            else: return dx.hexify(c)
        if (dx.entities and c in "&<"):  # TODO: Ewww, quote for in attrs.
            # ">" doesn't normally need quoting (see "]]>")
            return dx.entify(c)
        # TODO syntaxError()?
        return c
    @staticmethod
    def entify(c:str, useNames:bool=True) -> str:
        """Many options here -- like an order of preference among
        named/hex/decimal XML character refs.
        """
        n = ord(c)
        if (useNames and n in html.entities.codepoint2name):
            return "&%s;" % (html.entities.codepoint2name[n])
        # XML character references require a lowercase 'x'.
        if (n > 0xFF): return "&#x%04x;" % (n)
        return "&#x%02x;" % (n)
    @staticmethod
    def hexify(c:str) -> str:
        """TODO If not escapechar and xescapes and uescapes, what then?
        """
        n = ord(c)
        if (n <= 0xFF):
            return "\\x%02x" % (n)
        if (n <= 0xFFFF):
            return "\\u%04x" % (n)
        return "\\U%08x" % (n)
    @staticmethod
    def escapeXmlAttribute(s:str, attrQuoteChar:str='"') -> str:
        """Escape as needed for quoted attributes (default: double-quoted).
        Quietly deletes any non-XML control characters!
        """
        s = re.sub(r"[\x01-\x08\x0b\x0c\x0e-\x1f]", "", s)
        s = s.replace("&", "&amp;")  # Must come first
        s = s.replace("<", "&lt;")
        if (attrQuoteChar == '"'): s = s.replace('"', "&quot;")
        else: s = s.replace("'", "&apos;")