-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsjdUtils.py
executable file
·1636 lines (1343 loc) · 57.3 KB
/
sjdUtils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
#
# sjdUtils: some generally useful stuff.
# 2011-12-09: Written by Steven J. DeRose.
#
import sys
import os
import re
import argparse
import random
import time
import math
from typing import Any, List, Dict
import unicodedata
import ColorManager
from alogging import ALogger
# Descriptive metadata for this module, kept as a plain dictionary.
__metadata__ = {
    "title" : "sjdUtils",
    "description" : "Generally useful small routines.",
    "rightsHolder" : "Steven J. DeRose",
    "creator" : "http://viaf.org/viaf/50334488",
    "type" : "http://purl.org/dc/dcmitype/Software",
    "language" : "Python 3.7",
    "created" : "2011-12-09",
    "modified" : "2024-02-20",
    "publisher" : "http://github.com/sderose",
    "license" : "https://creativecommons.org/licenses/by-sa/3.0/"
}
# The version string is simply the last-modified date recorded above.
__version__ = __metadata__['modified']
descr = """
=Description=
Provide some very basic useful utilities, mostly for escaping, colorizing,
timing, and handling error messages.
from sjdUtils import sjdUtils
su = sjdUtils()
su.setVerbose(level)
...
su.setColors(useColor)
...
Messaging support is now in `alogging`, which works a lot like Python's
`logging.Logger`, but has nicer layout (I think), support for -v levels, etc.
A nearly identical `sjdUtils.pl` is also available for Perl.
=Methods=
==General methods==
* '''sjdUtils'''(verbose=0, colors=False)
Constructor.
* '''getOption''' ''(name)''
Get the value of option ''name'', from the list below.
Returns `None` if you try to get an unknown option.
* '''setOption''' ''(name, value)''
Set option ''name'', from the list below, to ''value''.
* '''setVerbose''' ''(value)''
Tell the logger to set its level of reporting to ''value'' (higher=more).
* '''getVerbose''' ''()''
Return the logger's level of reporting.
** Option: ''colorEnabled''
Globally enables/disables use of color.
Scripts may wish to set this to True if environment variable ''CLI_COLOR'' is
set and the relevant output is going to a terminal.
See also the ''setColors''() method, below.
** Option: ''defaultEncoding''
Set the name of the character encoding to be used when not otherwise specified.
** Option: ''loremText'' (string)
The text to be returned by ''lorem(type='a')''.
Default: the usual ''lorem ipsum...''.
==XML-related methods==
* '''indentXml'''(s, iString=" ", maxIndent=0, breakAttrs=False, elems=None, html=False)
Re-flow the XML string ''s'' to outline form, using repetitions of ''iString''
to create indentation (up to a maximum of ''maxIndent'' levels).
If ''breakAttrs'' is true, put each attribute on a separate line, too.
''elems'' is a dictionary that maps each element type name to either
`inline` or `block`, and affects line-breaking accordingly.
* '''indentXML''' -- synonym for '''indentXml'''.
* '''colorizeXmlTags(s, color="")'''
Surround XML markup with ANSI terminal escapes to display it
in the specified color. Default: the color for the "x" message type.
See also the ''ColorManager'' class, described below.
* '''colorizeXmlContent(s, color="")'''
Surround XML content (not markup) with ANSI terminal codes to display
it in the specified color. Default: the color for the "x" message type.
''See XmlRegexes package for relevant 'isa' functions for character classes.''
==JSON-related methods==
* '''indentJson(s, iString=" ", maxIndent=0)'''
Like ''indentXml'', but for JSON.
==Simple string-formatting methods==
* '''rpad(s, width=0, padChar=" ", quoteChar="")'''
Like Python's ljust(), but can also quote before padding.
If ''quoteChar'' is specified, its first character is used as the
opening quote, and its last character as the closing quote.
* '''lpad(s, width=0, padChar="0", quoteChar="")'''
Like Python's rjust(), but can also quote before padding.
If ''quoteChar'' is specified, its first character is used as the
opening quote, and its last character as the closing quote.
* '''lpadc(s, width=0, padChar="0", quoteChar="", sepChar=",")'''
Like ''lpad'', but also inserts ''sepChar''s every three digits
(Python 3 has that feature in ''format'').
If ''quoteChar'' is specified, its first character is used as the
opening quote, and its last character as the closing quote.
''sepChar'' now accepts a few special values as well. If set to "COLOR"
(and color is enabled), alternate groups of three digits are colorized
(presently just red). If "UNDER", they are underlined (again assuming
color is enabled and the terminal supports it).
* '''align(list, delim=',', stripTrail=True, maxLen=None, padChar=' ')'''
Aligns the tokens or items within each member of ''list'' so they print
out in neat columns (assuming a monospace font).
'list' must be an array; either of (sub)-lists or of strings that can be
split into (sub-)lists using ''delim''.
For each "column" (going down 'mylist' and taking the nth item
in each (sub-)list, ''align()'' finds the max length that occurs.
Then it pads all the items in each column to that column's max length.
Finally, it assembles the items of each (sub-)list, separated by ''delim''
into a string. It returns a list of those (padded) strings.
If a column contains any non-numeric entries, then its values will all
be quoted (with double-quotes), and left-justified. Otherwise, its value
is unquoted and right-justified.
'''Note''': There is no provision for ignoring instances of ''delim''
if they are in quotes, backslashed, etc.
* '''toHNumber(n, base=1000)'''
Convert a number to human-readable form, like several *nix commands.
For example, "123456789" would become "123.45M".
This conversion generally loses precision.
Base must be 1000 or 1024.
* '''fromHNumber(n, base=1000)'''
Convert a number from human-readable form, the opposite of ''toHNumber''.
For example, "123M" would become "123000000".
Base must be 1000 or 1024.
* '''unquote(s, fancy=False)'''
Remove single or double quotes from around a string, if present.
A quote all by itself (such as ''"''), remains.
Polarized quotes must match up properly.
Deals with most but not all Unicode quotes. Mainly, it's not clear what to
do with:
U+201a 'SINGLE LOW-9 QUOTATION MARK',
U+201b 'SINGLE HIGH-REVERSED-9 QUOTATION MARK',
U+2032 - U+2037
U+275f 'HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT',
U+2760 'HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT',
* '''quote(s, curly=False, escape='\\')'''
Return ''s'' with a double-quote character (U+0022) added on each end.
If ''escape'' is some character E, first put E before each instance
of U+0022 in ''s''.
If ''curly'' is True, use curly quotes (U+201c and U+201d) instead of straight.
Other quote types are not currently supported, and curly ones are not
escaped by ''escape''.
* '''qjoin(items:List, sep:str=" ", curly:bool=False, escape:str='\\')'''
Apply '''quote()''' to each item in '''items''', and then join() them using
the separator string in '''sep'''. Unless '''quoteall''' is set, only
quote ones that seem to need it (say, that contain space, quotes, controls, or escapes.
==Unicode stuff==
* '''isUnicodeCodePoint'''(c)
Return whether the character is a valid Unicode code point.
This isn't terribly smart yet.
* '''strip_accents'''(s)
Remove diacritics from the string ''s''. This is done by normalizing ''s''
to "NFD" (thus separating diacritics from their base character
''where possible'', and then stripping non-spacing marks. This does not
convert things like "LATIN SMALL LETTER L WITH STROKE", however.
* '''[Useful constants]'''
The following variables are Unicode strings containing the relevant characters:
** '''UQuotes''' -- All quotation-mark characters
** '''ULSQuotes''' -- Left single quotes
** '''ULDQuotes''' -- Left double quotes
** '''ULQuotes''' -- Left quotes
** '''URSQuotes''' -- Right single quotes
** '''URDQuotes''' -- Right double quotes
** '''URQuotes''' -- Right quotes
==Miscellaneous methods==
* '''isNumeric()'''
* '''isInteger()'''
==Escaping methods==
* '''getUTF8'''
* '''showControls'''
Turn C0 control characters into their Unicode "Control Pictures" (tiny
mnemonics, at U+2400 and following).
* '''showInvisibles or vis'''
Make unprintable characters visible.
* '''escapeRegex'''
Backslash regex metacharacters in a string.
* '''escapeXmlContent''' or '''escapeXml''' or '''escapeXmlText'''.
Turn '<', '&', and the '>' in ']]>' into '&lt;', '&amp;', and '&gt;' instead.
* '''escapeXmlAttribute'''
Turn '<', '&', and '"' into '&lt;', '&amp;', and '&quot;' instead.
You can set `apostrophes=True` if you also want to convert "'" to '&apos;'.
* '''escapeXmlPi'''
Turn "?>" into "?&gt;". XML does not define a particular way to represent '?>'
within processing instructions, but you have to escape it somehow....
You can set `target=[string]` to something else if you prefer.
At least at the moment, this also nukes any non-XML control characters.
* '''escapeXmlComment'''
Turns "--" into em dash (U+2014). XML does not define how to represent '--'
within comments, but you have to escape it somehow....
You can set `target=[string]` to something else if you prefer.
* '''normalizeXmlSpace(s)'''
Normalize whitespace per XML definition. This affects Python (non-Unicode) \\s.
* '''expandXml(s)'''
Expand any special-character references in ''s''.
This uses the Python `HTMLParser` package's "unescape()" method.
* '''backslash(s)'''
Put a backslash before each t, r, n, t, and \\ within ''s''.
In addition, replace non-ASCII characters with backslash + xXX or uXXXX
(depending on whether their numeric value fits in 2 hexadecimal digits or not).
* '''unbackslash(s)'''
Unescapes (converts) backslash-codes within ''s'' to literal characters.
Handles [abefnrt\\], as well as 0777, xFF, and uFFFF styles.
* '''escapeURI(s)'''
Turn ASCII characters prohibited in URIs, into %FF style.
'''Warning''': May not work for Unicode yet.
* '''unescapeURI(s, escapeChar='%')'''
Replace %FF style escapes (as used in URIs) with literal characters.
You can change ''escapeChar'' to use it for other things, like '=' for MIME.
'''Warning''': May not work for Unicode yet.
==Other (miscellaneous) methods==
* '''isoDateTime(t?)'''
Returns the present date and time in (one) ISO format, 19 characters long.
Example: 2011-11-30T23:22:05.
* '''isoDate(t?)'''
Returns the date portion of isoDateTime. Example: 2011-11-30.
* '''isoTime(t?)'''
Returns the time portion of isoDateTime. Example: 23:22:05.
* '''elapsedTime(start, end?, seconds=False)'''
Returns the difference between ''start'' and ''end'', which should be values
directly from ''time()'' (not returned values from ''isoTime'', for example).
The elapsed time is in the form `hh:mm:ss` unless
''seconds'' is True, in which case you just get the raw number of seconds.
* '''lorem(length=79, loremType="a", mode='frequency', xtab=None)'''
Return some sample text of a given type and length. The `loremType` are:
a -- ASCII text, by default the usual "lorem ipsum", but you can set a
different text via the `loremText` option.
r -- Randomly generated ASCII text (see ''randomChar''(), below)
`randomizeChars`, `unicodeMin`, and `unicodeMax` can be set,
lest ASCII vowel bytes in non-Unicode affect other non-Latin-1 chars.
`mode` is simply passed on to `randomChar()` when loremType="r".
`xtab` can be a translation table to apply to the generated text.
See also `getRandomXtab()`.
* '''getRandomXtab(fromChars='aeiouAEIOU', uMin=0x00A1, uMax=0x2FF)'''
Generate a translation table, mapping each of the `fromChars` to a random
Unicode character in the specified inclusive range.
All code points seem to be assigned in at least these ranges:
0x00A1 to 0x2FF Latin
0x0400 to 0x52F Cyrillic
0x2200 to 0x23F3 Math ops and technical
0x3041 to 0x3096 Hiragana
* '''randomChar(rgen=random.Random(), mode='uniform')'''
Generates a single random character (used by ''lorem(type='r')'')
If ''mode'' is `frequency`, ASCII lowercase letters and space are generated,
in accordance with approximate probabilities in English text.
Option `letterFreqs` holds a dictionary of char->frequencies.
The keys include 26 lower-case letters, plus space.
If ''mode'' is `uniform`, ASCII printable characters are generated,
with uniform probabilities.
* '''availableFileNum'''(base, width=4, min=1, max=1000, sep='', which='free')
Find a filename from a numbered series. ''base'' is split into path, filename,
and extension. Candidate filenames are of the form:
path + '/' + filename + sep + number + '.' + extension
except that, if there was no extension in ''base'', the final '.' + extension
will not be included. ''number'' must be exactly ''width'' digits
(with extra zeros on the left as needed).
The script will look for existing files whose names match this pattern
(shell globs are not supported!).
What filename is then returned, depends on the ''which'' parameter:
** '''next''': the file with number one greater than the
last extant file, is returned. '''Warning''': If this overflows into
the ''width''+1st digit, the longer name is quietly returned.
** '''firstfree''': the first such filename that does not
exist is returned. There could be additional files with greater numbers.
** '''lastappend''': the last existing file if
it is appendable, otherwise the first such file that does not exist.
* '''Note''': On many file systems, directories with more than about 1000 files
in them get really slow. This method is still experimental.
See also the `incrementFilename.py` command.
=The ColorManager class=
This is a separate class defined in `ColorManager.py`,
to make it easy to manage ANSI terminal color escape sequences.
Its methods
are also patched in to `sjdUtils.py`, so when color is enabled you can just
call them as if they were methods of `sjdUtils.py` itself.
The color names available are defined in `Color/colorNames.pod`,
which supersedes anything in specific scripts (although they ''should'' match).
A dictionary of the usable names is in `colorStrings`; a printable
form can be retrieved via ''getColorStrings()'', or a printable list
via ''tostring()''.
* '''__init__(effects=None)'''
Set up the color manager. ''effects'' is merely passed down to
''setupColors()''.
* '''setupColors(effects=None)'''
(internal) Work out all known color, effect, and combination names, and
put them in a hash that maps them to their escape sequences.
If 'effects' is not None, it must be a list of the effect names
to include. That list may be empty.
* '''addColor(newName, oldName)'''
Adds a synonym for an existing color.
* '''isColorName(name)'''
Returns True iff ''name'' is a known color name.
* '''getColorString(name)'''
* '''getColorStrings(paramColor, defaultColor)'''
* '''tostring(sampleText='sample', filter=None)'''
Return a printable buffer with one line for each defined color name.
Each line consists of colorized ''sampleText'' and then the color name.
If ''filter'' is supplied, it is treated as a regular expression, and any
color names that do not match it are excluded. This is useful because
there are about 1000 combinations available.
* '''colorize(argColor='red', s="", endAs="off")'''
Return ''s'', but with the ANSI terminal escape to display it in the specified
''argColor'' added at the start, and the escape to switch to color ''endAs''
added at the end.
* '''uncolorize''' ''(s)''
Remove any ANSI terminal color escapes from ''s'' (via `ColorManager.py`).
Also available as standalone Python script `uncolorize`.
* '''uncoloredLen''' ''(s)''
Return the length of ''s'', but ignoring any ANSI terminal color strings.
This is just shorthand for `len(uncolorize(s))`.
=Known bugs and limitations=
If you ''colorize''() a string, it resets the color to default at the end.
Thus, if you insert colorized string A within colorized string B, the portion
of B following A will ''not'' be colorized.
You can specify the ''endAs'' option when colorizing A to avoid this.
Some method names could be better.
Randomly-generated text should be more flexible: mixed-case, Markov generator....
But may not be useful enough to keep around anyway.
Human-readable numbering should also support factors of 1024.
ColorManager has no support for 256-color terminals.
showControls seems unhappy with CR (0x0D).
escaping calls aren't testing well at the moment.
indentJSON should use strip() or similar.
=To do=
* Add colorizing option to lpadc?
* Resync Perl version.
=History=
* 2011-12-09: Port from Perl to Python by Steven J. DeRose.
* 2012-01-09 sjd: Sync port.
* 2012-01-24 sjd: Make into an actual class; globals too hard otherwise.
* 2012-01-30 sjd: Fix color escapes.
* 2012-02-24 sjd: Add calls to 'inspect' to report error locations.
* 2012-02-27 sjd: Add getVerbose(), text arg to _warn methods. Add warn().
Generalize message types.
* 2012-04-11 sjd: Sync with Perl version.
* 2012-11-01 sjd: Add indentJson(). Drop targetFile. Add vMsg, etc.
* Sync more w/ Perl version: xml type checks,....
* 2012-11-07 sjd: Fixing isoTime().
* 2012-11-19 sjd: Fix escapeURI().
* 2013-03-04 sjd: Add vPush(), vPop().
* 2013-03-25 sjd: Implement vDepth indenting, add pline().
* 2013-06-28 sjd: Partial sync w/ Perl version.
* 2014-04-22: Add getVerbose. Fix SUset for 'verbose'. Sync msgTypes w/ Perl.
Fiddle with messaging.
* 2014-06-09ff: Improve indentJson. Add synonym indentJSON, availableFileNum.
* 2014-06-1f: Improve availableFileNum.
* 2014-07-01: Add setOption/getOption synonym, clean up traceback a little.
* 2014-07-17: Fix colorEnabled. Sync vPush() etc. w/ Perl version.
* 2014-09-05: Add toUTF8(), handle unicode messages, add types for lorem().
* 2014-12-17: Add 'stat' parameter to eMsg, vMsg, hMsg, Msg to keep count of
named statistics. Also add bumpStat(), getStat(), showStatS().
* 2015-01-15f: Improve stats handling/escaping. Add appendStat(), expandXml().
* 2015-03-26: Clean up defineMsgType(). Add 'infix' argument for messages.
* 2015-07-27: Add Python logger-like message calls, with fill-ins.
* Add uncoloredLength, make pline() ignore color escapes.
* 2015-10-13f: Move messaging support into separate 'alogging' package,
* which has been ported to sit on top of Python's 'logging' package.
Split color to internal class. Add XMLExprs() class.
Add quote/unquote methods. Remove teefile option. Implement try_module().
Write unit tests and fix several bugs. Add HNumber 'base' arg.
* 2016-09-09: Add strip_accents().
* 2016-10-04: Move more stuff into XmlRegexes class (nee XMLExprs).
* 2016-10-31: Move ColorManager out to be a separate package.
* 2016-12-13: Add align().
* 2018-01-11: Move XmlRegexes out to separate package.
* 2018-03-20: Don't die if ColorManager not available.
* 2018-08-16: Keep chr() defined even in Python 3.
* 2018-09-25ff: Clean up more PY 2 vs. 3 details.
* 2018-10-05: add className()
* 2018-11-27: Add splitPlus().
* 2020-01-22: Move doc, POD->MarkDown, lint, metadata.
* 2020-08-27: unbackslash() for both Python 2 and 3.
* 2020-09-03: Add shrinkuser().
* 2024-01-18: Add qjoin(), clean up quote(), more type hints.
=Rights=
Copyright 2011, Steven J. DeRose. This work is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see [http://creativecommons.org/licenses/by-sa/3.0/].
For the most recent version, see [http://www.derose.net/steve/utilities] or
[http://github.com/sderose].
=Options=
"""
# Provide way to get class-name regardless of old/new, 2/3
# Old-style classes have type 'instance', kinda pointless.
# See https://stackoverflow.com/questions/510972/
#
def className(obj):
    """Return the name of the class that `obj` reports via `__class__`."""
    cls = obj.__class__
    return cls.__name__
# Both:
# from html.parser import HTMLParser
# from io import open
# from builtins import chr
#
###############################################################################
# Useful sets of Unicode characters
#
# Quotation marks that are not classified below as left or right.
UQuotes = "".join(map(chr, (
    0x0022,     # QUOTATION MARK
    0x0027,     # APOSTROPHE
    0x301f,     # LOW DOUBLE PRIME QUOTATION MARK
    0xff02,     # FULLWIDTH QUOTATION MARK
    0xff07,     # FULLWIDTH APOSTROPHE
)))

# Left single quotation marks.
ULSQuotes = "".join(map(chr, (
    0x2018,     # LEFT SINGLE QUOTATION MARK
    # 0x201a,   # SINGLE LOW-9 QUOTATION MARK (disabled)
    # 0x201b,   # SINGLE HIGH-REVERSED-9 QUOTATION MARK (disabled)
    # 2032 - 2037 ??? FIX ???
    0x2039,     # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x275b,     # HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT
    # 0x275f,   # HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT (disabled)
    0x276e,     # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
)))

# Left double quotation marks.
ULDQuotes = "".join(map(chr, (
    0x00ab,     # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    0x201c,     # LEFT DOUBLE QUOTATION MARK
    0x201e,     # DOUBLE LOW-9 QUOTATION MARK
    0x275d,     # HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
    # 0x2760,   # HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT (disabled)
    0x301d,     # REVERSED DOUBLE PRIME QUOTATION MARK
)))

# Every left quote: singles first, then doubles.
ULQuotes = ULSQuotes + ULDQuotes

# Right single quotation marks.
URSQuotes = "".join(map(chr, (
    0x2019,     # RIGHT SINGLE QUOTATION MARK
    0x203a,     # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x275c,     # HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT
    0x276f,     # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
)))

# Right double quotation marks.
URDQuotes = "".join(map(chr, (
    0x00bb,     # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    0x201d,     # RIGHT DOUBLE QUOTATION MARK
    0x201f,     # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    0x275e,     # HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
    0x301e,     # DOUBLE PRIME QUOTATION MARK
)))

# Every right quote: singles first, then doubles.
URQuotes = URSQuotes + URDQuotes
# Maps a literal character to its two-character backslash spelling.
char2escape = {
    raw: "\\" + letter
    for raw, letter in (
        ("\a", "a"),        # bell
        ("\b", "b"),        # backspace
        # escape ("\e") is disabled
        ("\f", "f"),        # form feed
        ("\n", "n"),        # line feed
        ("\r", "r"),        # carriage return
        ("\t", "t"),        # tab
        ("\\", "\\"),       # backslash
    )
}

# Named choices for how a space may be displayed.
spaceCodes = dict(
    SP="\u2420",            # SP
    B="\u2422",             # B/
    U="\u2423",             # _
    OK=' ',
    NBSP="\u00A0",          # Non-breaking space
)

# Named choices for how a line feed may be displayed.
lfCodes = dict(
    LF="\u240A",
    NL="\u2424",
    OK="\n",
)
# See http://stackoverflow.com/questions/517923
#
def strip_accents(s:str) -> str:
    """Return `s` normalized to NFD with every character of Unicode
    category 'Mn' (nonspacing mark) removed.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
#straightenSingles = unicode.maketrans(
# ULQuotes+URQuotes, "'" * (len(ULQuotes)+len(URQuotes)))
#straightenDoubles = unicode.maketrans(
# ULQuotes+URQuotes, '"' * (len(ULQuotes)+len(URQuotes)))
###############################################################################
# Functions used on RHS of regex changes
#
def UEscapeFunction(mat) -> str:
    """Regex-replacement callback: spell the character captured in group 1
    as backslash, 'u', its code point in lowercase hex (zero-padded to at
    least four digits), and a trailing semicolon.
    """
    codePoint = ord(mat.group(1))
    return f"\\u{codePoint:04x};"
def controlSymbolsFunction(mat):
    """Regex-replacement callback: return the character whose code point is
    0x2400 plus the code point of the character captured in group 1.
    """
    shifted = 0x2400 + ord(mat.group(1))
    return chr(shifted)
def escHexFn(m) -> str:
utf = m.group(1).encode('utf-8')
buf = ""
for byt in utf: buf += "_%02x" % (ord(byt))
return(buf)
def unescFn(m) -> str:
    """Regex-replacement callback: read the text captured in group 1 as a
    hexadecimal number and return the character with that code point.

    @param m: A match object whose group 1 holds hex digits only.
    @return: A one-character string.
    @raise ValueError: If group 1 is not valid hexadecimal.
    """
    # int(text, 16) parses the digits. The previous hex() call expected an
    # int and returned a string, so the callback could never succeed.
    # NOTE(review): the pattern stored as self.unescape in
    # sjdUtils.__init__ has no capture group, so it cannot drive this
    # callback as written -- confirm the intended caller.
    return chr(int(m.group(1), 16))
###############################################################################
#
class sjdUtils:
"""A variety of low-level utilities used by many of my scripts.
Includes pretty-printing XML and JSON, times and dates, lorem text,
escaping and unescaping special characters, etc.
"""
def __init__(self, verbose:int=0, colors=1, logger=None, old:bool=False):
    """Set up the logger, the color hooks, the option table, and the
    lookup tables used by other methods.

    @param verbose: Reporting level, handed to setVerbose().
    @param colors: Handed to setColors() to enable or disable color.
    @param logger: Logger to use; when falsy, ALogger(1) is created.
    @param old: No longer supported; any truthy value raises ValueError.
    """
    self.version = __version__
    self.localeInfo = None
    self.htmlp = None
    self.lg = logger if logger else ALogger(1)

    # The logger's vMsg/eMsg used to be re-exported here for backward
    # compatibility; asking for that is now an error.
    if old:
        raise ValueError("Unexpected 'old' option to sjdUtils.")
    self.showStats = self.lg.showStats

    # Color hooks start out empty. colorize() and uncolorize() are real
    # methods of this class and are therefore not reset here.
    for hookName in ("colorManager", "addColor", "getColorString",
                     "getColorStrings", "tostring", "uncoloredLen"):
        setattr(self, hookName, None)

    self.options = {
        "colorEnabled": 0,                 # Using terminal color?
        "loremText": (                     # Traditional filler text
            "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do "
            "eiusmod tempor incididunt ut labore et dolore magna aliqua. "
            "Ut enim ad minim veniam, quis nostrud exercitation ullamco "
            "laboris nisi ut aliquip ex ea commodo consequat. "
            "Duis aute irure dolor in reprehenderit in voluptate "
            "velit esse cillum dolore eu fugiat nulla pariatur. "
            "Excepteur sint occaecat cupidatat non proident, sunt in "
            "culpa qui officia deserunt mollit anim id est laborum."
        ),
        "letterFreqs" : [],
    }

    # See loremText(type='r'), below.
    # See also Charsets/letterFrequencies.py.
    # Storing "letterFreqs" through setUtilsOption() also refreshes
    # self.letterFreqsTotal.
    self.letterFreqsTotal = 0
    roughFrequencies = [                   # Rough, from Wikipedia
        ('e', 12702),
        (' ', 17000),                      # Estimate for non-word chars
        ('t', 9056), ('a', 8167), ('o', 7507), ('i', 6966),
        ('n', 6749), ('s', 6327), ('h', 6094), ('r', 5987),
        ('d', 4253), ('l', 4025), ('c', 2782), ('u', 2758),
        ('m', 2406), ('w', 2360), ('f', 2228), ('g', 2015),
        ('y', 1974), ('p', 1929), ('b', 1492), ('v', 978),
        ('k', 772), ('j', 153), ('x', 150), ('q', 95),
        ('z', 74),
    ]
    self.setUtilsOption("letterFreqs", roughFrequencies)

    # Each row: suffix, decimal multiplier, binary multiplier, prefix name.
    # (Mnemonic: Karl Marx gave the people eleven zeppelins, yo!)
    self.multipliers = [
        ( "K", 1E+3, 1<<10, "Kilo" ),
        ( "M", 1E+6, 1<<20, "Mega" ),
        ( "G", 1E+9, 1<<30, "Giga" ),
        ( "T", 1E12, 1<<40, "Tera" ),
        ( "P", 1E15, 1<<50, "Peta" ),
        ( "E", 1E18, 1<<60, "Exa" ),
        ( "Z", 1E21, 1<<70, "Zetta" ),
        ( "Y", 1E24, 1<<80, "Yotta" ),
    ]

    self.unescape = re.compile(r'_[0-9a-f][0-9a-f]')

    self.setColors(colors)
    self.setVerbose(verbose)
def getLogger(self):
    """Return the logger object held by this instance (there is just the
    one, at least for now).
    """
    logger = self.lg
    return logger
def setColors(self, enabled:bool) -> None:
    """Attach or detach a ColorManager.

    When enabled, a new ColorManager is created and several of its methods
    are copied onto this object so they can be called from here. When
    disabled, those hooks and the manager are set to None. The
    'colorEnabled' option is updated to match. Nothing is changed if
    try_module('ColorManager') reports failure.
    """
    if not self.try_module('ColorManager'):
        return

    hookNames = ("addColor", "getColorString", "getColorStrings",
                 "tostring", "uncoloredLen")
    if enabled:
        manager = ColorManager.ColorManager()
        self.colorManager = manager
        for hook in hookNames:
            setattr(self, hook, getattr(manager, hook))
        self.options['colorEnabled'] = True
        return

    for hook in hookNames:
        setattr(self, hook, None)
    self.colorManager = None
    self.options['colorEnabled'] = False
    return
def getColorManager(self):
    """Return the current ColorManager object, or None if there is none."""
    manager = self.colorManager
    return manager
def colorize(self, argColor:str='red', s:str="", endAs:str="off") -> str:
    """Hand `s` to the ColorManager's colorize() with the given colors.
    With no ColorManager in place, `s` comes back unchanged.
    """
    manager = self.colorManager
    if manager:
        return manager.colorize(argColor, s, endAs)
    return s
def uncolorize(self, s:str) -> str:
    """Hand `s` to the ColorManager's uncolorize(). With no ColorManager
    in place, `s` comes back unchanged.
    """
    manager = self.colorManager
    if manager:
        return manager.uncolorize(s)
    return s
###########################################################################
# Options
#
def setUtilsOption(self, name:str, value=1):
    """Synonym for setOption(): forwards both arguments and returns
    whatever setOption() returns.
    """
    outcome = self.setOption(name, value)
    return outcome
def setOption(self, name:str, value=1):
    """Store `value` as option `name`. Documented options:
    * Option: `colorEnabled`
    Globally enables/disables use of color.
    Script may wish to set this to True if environment variable `CLI_COLOR` is
    set and the relevant output is going to a terminal.
    See also the `setColors()` method.
    * Option: defaultEncoding
    Set the name of the character encoding to be used when not otherwise specified.
    * Option: loremText (string)
    The text to be returned by lorem(type='a').
    Default: the usual lorem ipsum....

    Storing "letterFreqs" also recomputes self.letterFreqsTotal as the sum
    of the second item of every entry.
    """
    self.options[name] = value
    if name != "letterFreqs":
        return
    self.letterFreqsTotal = 0
    for entry in self.options["letterFreqs"]:
        self.letterFreqsTotal += entry[1]
    return
def getUtilsOption(self, name:str):
    """Synonym for getOption(): forwards `name` and returns the result."""
    current = self.getOption(name)
    return current
def getOption(self, name:str):
    """Return the current value of option `name` (see setOption()).

    @param name: The option to look up.
    @return: The stored value, or None when no option of that name has
        been set. The module documentation promises None for unknown
        options, so a missing key no longer raises KeyError.
    """
    return self.options.get(name)
def getVersion(self):
    """Return the version value recorded on this instance."""
    version = self.version
    return version
def setVerbose(self, v:int) -> int:
    """Tell the logger to set the level of message to be reported, and
    record the same level as the "verbose" option.

    @param v: The new reporting level (higher means more).
    @return: The level just set. The signature has always declared an int
        result, but the previous body passed along setOption()'s result,
        which is always None.
    """
    if self.lg:
        self.lg.setVerbose(v)
    self.setUtilsOption("verbose", v)
    return v
def getVerbose(self) -> int:
    """Return the reporting level stored as the "verbose" option."""
    level = self.getUtilsOption("verbose")
    return level
###########################################################################
# XML stuff
#
def indentXML(
    self, s:str, iString:str=" ", maxIndent:int=0,
    breakAttrs:bool=False, elems:Dict=None, html:bool=False) -> str:
    """Synonym for indentXml(); every argument is forwarded unchanged.
    Puts in spurious breaks if "<" occurs within PI, comment, CDATA MS.
    If you want it really right, use my DomExtensions::collectAllXml().
    """
    forwarded = dict(
        iString=iString, maxIndent=maxIndent,
        breakAttrs=breakAttrs, elems=elems, html=html)
    return self.indentXml(s, **forwarded)
# Also inline-block:
# "del", "iframe", "ins", "map", "object", "script", "button",
# Element names that indentXml(html=True) marks as 'inline' unless the
# caller's own `elems` already has an entry for them. A few names appear
# twice; the list is kept exactly as it was.
htmlInlineElements = (
    "a abbr acronym b bdo big "
    "cite code dfn em i img "
    "input kbd label legend optgroup option "
    "select textarea q s small span "
    "strike strong sub sup tt var "
    "applet center dir font samp strike "
    "address area audio bm details command "
    "datalist font u"
).split()
def indentXml(
    self, s:str, iString:str=" ", maxIndent:int=0,
    breakAttrs:bool=False, elems:Dict=None, html:bool=False) -> str:
    """Insert newlines and indentation in an XML string. Does not use
    an actual parser, so "<" or "</" inside a PI, comment, or CDATA
    section can produce spurious breaks or throw the depth off.

    A line break is placed before every tag opener except end-tags, so an
    end-tag stays on the line of whatever precedes it. Every "</" on a
    line is counted when tracking depth, so that later siblings return to
    the right level.

    @param s: The XML text to re-flow; existing newlines are removed.
    @param iString: String to repeat to make indentation
    @param maxIndent: Don't indent more than this many levels (0: no limit)
    @param breakAttrs: Put attributes on their own lines
    @param elems: Dict of elements, map each to 'inline' or 'block'.
        The caller's dict is not modified.
    @param html: Apply HTML 'inline' element list (can override w/ elems)
    @return: The re-flowed text, ending with a newline.
    """
    # Work on a private copy so html=True never edits the caller's dict.
    elems = dict(elems) if elems else {}
    if html:
        for e in sjdUtils.htmlInlineElements:
            elems.setdefault(e, 'inline')

    s = s.replace('\n', '')
    s = re.sub(r'(<[^/])', "\n\\1", s)
    lines = s.split('\n')

    depth = 0
    for i, line in enumerate(lines):
        closes = line.count('</')           # end-tags anywhere on the line
        opens = 0
        startsWithEndTag = line.startswith('</')
        if startsWithEndTag:
            # A leading end-tag is printed one level out; that uses up
            # one of the closes counted above.
            depth = max(depth - 1, 0)
            closes -= 1

        level = min(depth, maxIndent) if maxIndent else depth
        ind = iString * level
        iLine = ind + line

        if startsWithEndTag:
            pass
        elif re.match(r'<[_.\w][^>]*/>', line):        # empty
            pass
        elif re.match(r'<(br|hr)\b', line):            # HTML empty
            pass
        elif re.match(r'<\w', line):                   # start-tag
            if breakAttrs:
                # A function replacement keeps `ind` literal even if it
                # contains backslashes.
                iLine = re.sub(
                    r'([-:_.\w\d]+=)',
                    lambda mat, pad=ind: "\n" + pad + mat.group(1),
                    iLine)
            opens = 1

        lines[i] = iLine
        # Never go below zero, even on unbalanced input.
        depth = max(depth + opens - closes, 0)

    s = "\n".join(lines)

    # Inline elements: the break and indentation before the start-tag are
    # replaced by a single tab, as before.
    for e, kind in elems.items():
        if kind == "inline":
            s = re.sub(
                r'\n\s*<' + re.escape(e) + r'\b',
                lambda mat, name=e: '\t<' + name,
                s)
    return s + "\n"
# indentXML
def colorizeXmlTags(self, s:str, color:str="") -> str:
    """Wrap every "<...>" span in `s` with the start and end strings that
    the logger's getPickedColorString(color, "x") returns. Turns color on
    first via setColors(1).
    """
    self.setColors(1)
    startSeq, endSeq = self.lg.getPickedColorString(color, "x")
    template = "".join((startSeq, "\\1", endSeq))
    return re.sub(r'(<.*?>)', template, s)
def colorizeXmlContent(self, s:str, color:str="") -> str:
    """Surround XML content (not markup) with ANSI terminal codes to display
    it in the specified color (default: the color for the "x" message type).

    @param s: The XML text; a falsy value is returned untouched.
    @param color: Color name passed to the logger's getPickedColorString().
    @return: `s` with group 1 of every match of the ColorManager's
        `colorizeXmlContentExpr` wrapped as ">" + start + text + end + "<".
        If no ColorManager could be set up, `s` is returned unchanged.
    """
    if not s:
        return s
    self.setColors(1)
    if not self.colorManager:
        # setColors() leaves the manager unset when the ColorManager
        # module is unavailable; degrade the same way colorize() does.
        return s
    (con, coff) = self.lg.getPickedColorString(color, "x")

    def wrapContent(mat) -> str:
        # Built by plain concatenation so the color strings are never
        # read as a replacement template. The old code put a backslash
        # before each "[", which the template expansion left in the
        # output and so broke the escape sequence.
        return ">" + con + mat.group(1) + coff + "<"

    return re.sub(self.colorManager.colorizeXmlContentExpr, wrapContent, s)
###########################################################################
#
def getJsonIndent(self, level:int, maxIndent:int, iString:str=None) -> str:
    """Internal. Return a newline plus indentation for indentJson().

    @param level: Nesting depth to indent for.
    @param maxIndent: Don't indent more than this many levels (0: no limit)
    @param iString: String to repeat per level. When None, the "iString"
        option is used if one has been set; otherwise a single space,
        matching indentJson()'s default. Nothing sets that option by
        default, and the old direct lookup raised KeyError.
    @return: "\\n" followed by `iString` repeated for the effective level.
    """
    if iString is None:
        iString = self.options.get("iString", " ")
    effLevel = min(level, maxIndent) if maxIndent else level
    return "\n" + (iString * effLevel)
def indentJson(self, s:str, iString:str=" ", maxIndent:int=0) -> str:
buf = ""
level = 0
inQuote = False
ss = "%s" % (s)
#print(ss)
sslen = len(ss)
i = 0
while (i<sslen):