Skip to content

Commit 12b4b17

Browse files
committed
add char-asian-punctuation?
1 parent deadc3d commit 12b4b17

File tree

3 files changed

+22
-23
lines changed

3 files changed

+22
-23
lines changed

character.rkt

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3333
;;; http://www.unicode.org/reports/tr44/#General_Category_Values
34+
;;; https://www.unicodepedia.com/groups/
3435
(define char-letter? : (-> Char Boolean)
3536
(lambda [ch]
3637
(and (memq (char-general-category ch) '(lu ll lt lm lo))
@@ -46,21 +47,6 @@
4647
(and (memq (char-general-category ch) '(mn mc me))
4748
#true)))
4849

49-
(define char-number? : (-> Char Boolean)
50-
(lambda [ch]
51-
(and (memq (char-general-category ch) '(nd nl no))
52-
#true)))
53-
54-
(define char-punctuation? : (-> Char Boolean)
55-
(lambda [ch]
56-
(and (memq (char-general-category ch) '(pc pd ps pe pi pf po))
57-
#true)))
58-
59-
(define char-symbol? : (-> Char Boolean)
60-
(lambda [ch]
61-
(and (memq (char-general-category ch) '(sm sc sk so))
62-
#true)))
63-
6450
(define char-math? : (-> Char Boolean)
6551
(lambda [ch] ; symbol other
6652
(eq? (char-general-category ch) 'so)))
@@ -79,6 +65,20 @@
7965
(lambda [ch] ; letter other
8066
(eq? (char-general-category ch) 'lo)))
8167

68+
(define char-asian-punctuation? : (-> Char Boolean)
69+
(lambda [ch]
70+
(define unicode (char->integer ch))
71+
72+
(or
73+
; General Punctuation Block
74+
(<= #x2000 unicode #x206F)
75+
76+
; CJK Symbol and Punctuation Block
77+
(<= #x3000 unicode #x303F)
78+
79+
; Halfwidth and Fullwidth Forms Block
80+
(<= #xFF00 unicode #xFFEE))))
81+
8282
(define char-emoji? : (-> Char Boolean)
8383
(lambda [ch] ; symbol other
8484
(eq? (char-general-category ch) 'so)))

digitama/tamer/stat.rkt

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -126,15 +126,9 @@
126126
(define/override (table-of-contents part ri) empty-block)
127127
(define/override (local-table-of-contents part ri) empty-block)
128128

129-
(define/override (get-substitutions)
130-
'((#rx"---" "\U2014")
131-
(#rx"--" "\U2013")
132-
(#rx"``" "\U201C")
133-
(#rx"''" "\U201D")
134-
(#rx"'" "\U2019")))
135-
136129
(define/override (render-part self ri)
137-
(parameterize ([default-word-group 'head])
130+
(parameterize ([default-word-group 'head]
131+
[current-output-port /dev/stdout])
138132
(super render-part self ri)))
139133

140134
(define/override (render-paragraph self parent ri)
@@ -173,6 +167,7 @@
173167
[else (string-word-count-done wstat++)])))
174168

175169
(define (word-count-symbol self part [group (default-word-group)])
170+
; `decode-string` produces these before we see the `part`
176171
(case self
177172
[(mdash) (word-count "\U2014" part group)]
178173
[(ndash) (word-count "\U2013" part group)]

digitama/unicode.rkt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@
6262
[(not (eq? c #\linefeed)) (wc as ls++ ws++ nl es #false #false idx++)]
6363
[(and prev-cr?) (wc as ls++ ws nl es #false #false idx++)] ; CRLF
6464
[else (wc as ls++ ws++ (unsafe-idx+ nl 1) es #false #false idx++)]))]
65+
[(char-punctuation? c)
66+
(if (char-asian-punctuation? c)
67+
(wc (unsafe-idx+ as 1) ls ws nl es #false #false idx++)
68+
(wc as ls ws nl es #true #false idx++))]
6569
[else (wc as ls ws nl es #true #false idx++)]))
6670

6771
(make-word-statistics (+ (word-statistics-asian pstats) as)

0 commit comments

Comments
 (0)