Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 262a27a

Browse files
committedAug 12, 2016
Speed up tokenization and font-lock
1 parent def0e2f commit 262a27a

File tree

5 files changed

+267
-49
lines changed

5 files changed

+267
-49
lines changed
 

‎haskell-font-lock.el

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,9 @@ like ::, class, instance, data, newtype, type."
314314
(goto-char end)))
315315

316316

317-
(defun haskell-font-lock--put-face-on-type-or-constructor ()
318-
"Private function used to put either type or constructor face
319-
on an uppercase identifier."
317+
(defun haskell-font-lock--select-face-on-type-or-constructor ()
318+
"Private function used to select either type or constructor face
319+
on an uppercase identifier."
320320
(cl-case (haskell-lexeme-classify-by-first-char (char-after (match-beginning 1)))
321321
(varid (when (member (match-string 0) haskell-font-lock--reverved-ids)
322322
;; Note: keywords parse as keywords only when not qualified.
@@ -354,6 +354,15 @@ like ::, class, instance, data, newtype, type."
354354
(add-text-properties (match-end 0) (point) '(font-lock-multiline t haskell-type t)))
355355
'haskell-operator-face))))
356356

357+
(defun haskell-font-lock--put-face-on-type-or-constructor ()
358+
"Private function used to put either type or constructor face
359+
on an uppercase identifier."
360+
(let ((face (haskell-font-lock--select-face-on-type-or-constructor)))
361+
(when (and face
362+
(not (text-property-not-all (match-beginning 0) (match-end 0) 'face nil)))
363+
(put-text-property (match-beginning 0) (match-end 0) 'face face))))
364+
365+
357366
(defun haskell-font-lock-keywords ()
358367
;; this has to be a function because it depends on global value of
359368
;; `haskell-font-lock-symbols'
@@ -448,9 +457,15 @@ like ::, class, instance, data, newtype, type."
448457

449458
(,(concat "`" haskell-lexeme-qid-or-qsym "`") 0 'haskell-operator-face)
450459

451-
(,haskell-lexeme-qid-or-qsym
460+
(,haskell-lexeme-idsym-first-char
452461
(0 (unless (or (elt (syntax-ppss) 3) (elt (syntax-ppss) 4))
453-
(haskell-font-lock--put-face-on-type-or-constructor))))))
462+
(when (save-excursion
463+
(goto-char (match-beginning 0))
464+
(haskell-lexeme-looking-at-qidsym))
465+
(goto-char (match-end 0))
466+
;; note that we have to put face ourselves here because font-lock
467+
;; will use match data from the original matcher
468+
(haskell-font-lock--put-face-on-type-or-constructor)))))))
454469
keywords))
455470

456471

‎haskell-lexeme.el

Lines changed: 162 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
(modify-category-entry key ?P))))
3333
unicode-category-table)
3434

35-
(dolist (key (string-to-list "!#$%&*+./<=>?@^|~\\-"))
35+
(dolist (key (string-to-list "!#$%&*+./<=>?@^|~\\-:"))
3636
(modify-category-entry key ?P)))
3737

3838
(defconst haskell-lexeme-modid
@@ -43,28 +43,31 @@ Note that GHC accepts Unicode category UppercaseLetter as a first
4343
character. Following letters are from Unicode categories
4444
UppercaseLetter, LowercaseLetter, OtherLetter, TitlecaseLetter,
4545
ModifierLetter, DecimalNumber, OtherNumber, backslash or
46-
underscore.
47-
48-
Note that this differs from constructor identifier as the latter
49-
one can have any number of hash character at the end to
50-
accommodate MagicHash extension.")
46+
underscore.")
5147

5248
(defconst haskell-lexeme-id
53-
"[[:alpha:]_][[:alnum:]'_]*#*"
49+
"[[:alpha:]_][[:alnum:]'_]*"
5450
"Regexp matching a valid Haskell identifier.
5551
5652
GHC accepts a string starting with any alphabetic character or
5753
underscore followed by any alphanumeric character or underscore
5854
or apostrophe.")
5955

6056
(defconst haskell-lexeme-sym
61-
"\\(:?\\cP\\|:\\)+"
57+
"\\cP+"
6258
"Regexp matching a valid Haskell variable or constructor symbol.
6359
6460
GHC accepts a string of chars from the set
6561
[:!#$%&*+./<=>?@^|~\\-] or Unicode category Symbol for chars with
6662
codes larger than 128 only.")
6763

64+
(defconst haskell-lexeme-idsym-first-char
65+
"\\(?:[[:alpha:]_]\\|\\cP\\)"
66+
"Regexp matching first character of a qualified or unqualified
67+
identifier or symbol.
68+
69+
Useful for `re-search-forward'.")
70+
6871
(defconst haskell-lexeme-modid-opt-prefix
6972
(concat "\\(?:" haskell-lexeme-modid "\\.\\)*")
7073
"Regexp matching a valid Haskell module prefix, potentially empty.
@@ -80,6 +83,53 @@ dot. For path component syntax see `haskell-lexeme-modid'.")
8083
8184
Note that (match-string 1) returns the unqualified part.")
8285

86+
(defun haskell-lexeme-looking-at-qidsym ()
87+
"Non-nil when point is just in front of an optionally qualified
88+
identifier or symbol.
89+
90+
Using this function is more efficient than matching against the
91+
regexp `haskell-lexeme-qid-or-qsym'.
92+
93+
Returns:
94+
'qid - if matched a qualified id: 'Data.Map' or 'Map'
95+
'qsym - if matched a qualified id: 'Monad.>>=' or '>>='
96+
'qprefix - if matched only modid prefix: 'Data.'
97+
98+
After successful 'qid or 'qsym match (match-string 1) will return
99+
the unqualified part (if any)."
100+
(let ((begin (point))
101+
(match-data-old (match-data)))
102+
(save-excursion
103+
(while (looking-at (concat haskell-lexeme-modid "\\."))
104+
(goto-char (match-end 0)))
105+
(cond
106+
((looking-at haskell-lexeme-id)
107+
(let ((beg (match-beginning 0))
108+
(end (match-end 0)))
109+
110+
;; check is MagicHash is present at the end of the token
111+
(goto-char end)
112+
(when (looking-at "#+")
113+
(setq end (match-end 0)))
114+
115+
(set-match-data
116+
(list begin end
117+
beg end)))
118+
'qid)
119+
((looking-at haskell-lexeme-sym)
120+
(set-match-data
121+
(list begin (match-end 0)
122+
(match-beginning 0) (match-end 0)))
123+
'qsym)
124+
((equal begin (point))
125+
(set-match-data match-data-old)
126+
nil)
127+
(t
128+
(set-match-data
129+
(list begin (point)
130+
nil nil))
131+
'qprefix)))))
132+
83133
(defconst haskell-lexeme-qid
84134
(rx-to-string `(: (regexp "'*")
85135
(regexp ,haskell-lexeme-modid-opt-prefix)
@@ -148,7 +198,7 @@ strictly only escape sequences defined in Haskell Report.")
148198
(group (* (| (regexp "\\\\[ \t\n\r\v\f]*\\\\")
149199
(regexp "\\\\[ \t\n\r\v\f]+")
150200
(regexp "\\\\[^ \t\n\r\v\f]")
151-
(regexp "[^\"\n\\]"))))
201+
(* (regexp "[^\"\n\\]")))))
152202
(group (| "\"" (regexp "$") (regexp "\\\\?\\'")
153203
))))
154204
"Regexp matching a string literal lookalike.
@@ -166,30 +216,111 @@ Regexp has subgroup expressions:
166216
(match-text 3) matches the closing quote, or a closing
167217
newline or empty string at the end of the buffer.")
168218

169-
(defconst haskell-lexeme-quasi-quote-literal
170-
(rx-to-string `(: "[" (optional "$")
171-
(group (regexp ,haskell-lexeme-id))
172-
(group "|")
173-
(group (* (| (not (any "|"))
174-
(: "|" (not (any "]"))))
175-
))
176-
(group (| "|" eos))
177-
(| "]" eos)))
178-
"Regexp matching a quasi quote literal.
219+
(defun haskell-lexeme-looking-at-string-literal ()
220+
"Non-nil when point is at a string literal lookalike.
221+
222+
Note that this function matches more than Haskell Report
223+
specifies because we want to support also code under edit.
224+
225+
String literals end with double quote or unescaped newline or end
226+
of buffer.
227+
228+
After successful match:
229+
(match-text 1) matches the opening doublequote.
230+
(match-text 2) matches the inside of the string.
231+
(match-text 3) matches the closing quote, or a closing
232+
newline or is nil when at the end of the buffer."
233+
(when (looking-at "\"")
234+
(save-excursion
235+
(let ((begin (point)))
236+
(goto-char (match-end 0))
237+
(let (finish)
238+
(while (and (not finish)
239+
(re-search-forward "[\"\n\\]" nil 'goto-eob))
240+
(cond
241+
((equal (match-string 0) "\\")
242+
(if (looking-at "[ \t\n\r\v\f]+\\\\?")
243+
(goto-char (match-end 0))
244+
(goto-char (1+ (point)))))
245+
246+
((equal (match-string 0) "\"")
247+
(set-match-data
248+
(list begin (match-end 0)
249+
begin (1+ begin)
250+
(1+ begin) (match-beginning 0)
251+
(match-beginning 0) (match-end 0)))
252+
(setq finish t))
253+
254+
((equal (match-string 0) "\n")
255+
(set-match-data
256+
(list begin (match-beginning 0)
257+
begin (1+ begin)
258+
(1+ begin) (match-beginning 0)
259+
nil nil))
260+
(setq finish t))))
261+
(unless finish
262+
;; string closed by end of buffer
263+
(set-match-data
264+
(list begin (point)
265+
begin (1+ begin)
266+
(1+ begin) (point)
267+
nil nil))))))
268+
;; there was a match
269+
t))
270+
271+
(defun haskell-lexeme-looking-at-quasi-quote-literal ()
272+
"Non-nil when point is just in front of Template Haskell
273+
quaisquote literal.
179274
180275
Quasi quotes start with '[xxx|' or '[$xxx|' sequence and end with
181-
'|]'. The 'xxx' is a quoter name. There is no escaping mechanism
276+
'|]'. The 'xxx' is a quoter name. There is no escaping mechanism
182277
provided for the ending sequence.
183278
184279
Regexp has subgroup expressions:
185280
(match-text 1) matches the quoter name (without $ sign if present).
186281
(match-text 2) matches the opening vertical bar.
187282
(match-text 3) matches the inside of the quoted string.
188283
(match-text 4) matches the closing vertical bar
189-
or empty string if at the end of the buffer.
190-
191-
Note that this regexp admits 'e', 't', 'd', 'p' as quoter names
192-
although template Haskell explicitly rejects those.")
284+
or nil if at the end of the buffer.
285+
286+
Note that this function excludes 'e', 't', 'd', 'p' as quoter
287+
names according to Template Haskell specification."
288+
(let ((match-data-old (match-data)))
289+
(if (and
290+
(looking-at (rx-to-string `(: "[" (optional "$")
291+
(group (regexp ,haskell-lexeme-id))
292+
(group "|"))))
293+
(equal (haskell-lexeme-classify-by-first-char (char-after (match-beginning 1)))
294+
'varid)
295+
(not (member (match-string 1) '("e" "t" "d" "p"))))
296+
(save-excursion
297+
;; note that quasi quote syntax does not have any escaping
298+
;; mechanism and if not closed it will span til lthe end of buffer
299+
(goto-char (match-end 0))
300+
(let ((match-data (match-data))
301+
(match-data-2 (and (re-search-forward "|]" nil t)
302+
(match-data))))
303+
(if match-data-2
304+
(set-match-data
305+
(list
306+
(nth 0 match-data) (nth 1 match-data-2) ;; whole match
307+
(nth 2 match-data) (nth 3 match-data) ;; quoter name
308+
(nth 4 match-data) (nth 5 match-data) ;; opening bar
309+
(nth 5 match-data) (nth 0 match-data-2) ;; inner string
310+
(nth 0 match-data-2) (1+ (nth 0 match-data-2)))) ;; closing bar
311+
312+
(set-match-data
313+
(list
314+
(nth 0 match-data) (point-max) ;; whole match
315+
(nth 2 match-data) (nth 3 match-data) ;; quoter name
316+
(nth 4 match-data) (nth 5 match-data) ;; opening bar
317+
(nth 5 match-data) (point-max) ;; inner string
318+
nil nil)) ;; closing bar
319+
))
320+
t)
321+
;; restore old match data if not matched
322+
(set-match-data match-data-old)
323+
nil)))
193324

194325
(defun haskell-lexeme-classify-by-first-char (char)
195326
"Classify token by CHAR.
@@ -258,7 +389,8 @@ See `haskell-lexeme-classify-by-first-char' for details."
258389
(point (point-marker)))
259390
(or
260391
(and
261-
(equal (string-to-syntax "<") (syntax-after (point)))
392+
(equal (string-to-syntax "<")
393+
(get-char-property (point) 'syntax-table))
262394
(progn
263395
(set-match-data (list point (set-marker (make-marker) (line-end-position))))
264396
'literate-comment))
@@ -271,21 +403,13 @@ See `haskell-lexeme-classify-by-first-char' for details."
271403
'nested-comment))
272404
(and (looking-at haskell-lexeme-char-literal)
273405
'char)
274-
(and (looking-at haskell-lexeme-string-literal)
406+
(and (haskell-lexeme-looking-at-string-literal)
275407
'string)
276408
(and (looking-at "[][(){}`,;]")
277-
(let ((match-data (match-data)))
278-
(if (and (equal "[" (match-string-no-properties 0))
279-
(looking-at haskell-lexeme-quasi-quote-literal))
280-
(if (or (member (match-string-no-properties 1) '("e" "d" "p" "t"))
281-
(not (equal (haskell-lexeme-classify-by-first-char (char-after (match-beginning 1)))
282-
'varid)))
283-
(progn
284-
(set-match-data match-data)
285-
'special)
286-
'template-haskell-quasi-quote)
287-
'special)))
288-
(and (looking-at haskell-lexeme-qid-or-qsym)
409+
(if (haskell-lexeme-looking-at-quasi-quote-literal)
410+
'template-haskell-quasi-quote
411+
'special))
412+
(and (haskell-lexeme-looking-at-qidsym)
289413
(if (save-match-data
290414
(string-match "\\`---*\\'" (match-string-no-properties 0)))
291415
(progn

‎haskell-mode.el

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -567,13 +567,14 @@ be set to the preferred literate style."
567567
;; when a generic delimiter is not closed so in case
568568
;; string ends at the end of the buffer we will use
569569
;; plain string
570-
(when (and (equal (match-beginning 3) (match-end 3))
571-
(not (equal (match-beginning 3) (point-max))))
570+
(when (and (not (match-beginning 3))
571+
(not (equal (match-end 2) (point-max))))
572572
(put-text-property (match-beginning 1) (match-end 1) 'syntax-table (string-to-syntax "|"))
573-
(put-text-property (match-beginning 3) (1+ (match-end 3)) 'syntax-table (string-to-syntax "|")))))
573+
(put-text-property (match-end 2 ) (1+ (match-end 2)) 'syntax-table (string-to-syntax "|")))))
574574
((equal token-kind 'template-haskell-quasi-quote)
575575
(put-text-property (match-beginning 2) (match-end 2) 'syntax-table (string-to-syntax "\""))
576-
(put-text-property (match-beginning 4) (match-end 4) 'syntax-table (string-to-syntax "\""))
576+
(when (match-beginning 4)
577+
(put-text-property (match-beginning 4) (match-end 4) 'syntax-table (string-to-syntax "\"")))
577578
(save-excursion
578579
(goto-char (match-beginning 3))
579580
(let ((limit (match-end 3)))

‎tests/haskell-c2hs-tests.el

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
(ert-deftest haskell-c2hs-qualified-import-hook ()
1717
"C2HS qualified import hook"
1818
(check-properties
19-
'("{#import qualified Foo#}")
19+
'("{#import qualified Foo #}")
2020
'(("{#" t haskell-c2hs-hook-pair-face)
2121
("import" "w" haskell-c2hs-hook-name-face)
2222
("qualified" "w" haskell-c2hs-hook-name-face)
@@ -284,7 +284,7 @@
284284
(ert-deftest haskell-c2hs-const-hook ()
285285
"C2HS const hook"
286286
(check-properties
287-
'("{#const FOO_BAR#}")
287+
'("{#const FOO_BAR #}")
288288
'(("{#" t haskell-c2hs-hook-pair-face)
289289
("const" "w" haskell-c2hs-hook-name-face)
290290
("#}" t haskell-c2hs-hook-pair-face))

‎tests/haskell-lexeme-tests.el

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ order."
1616
(save-current-buffer
1717
(set-buffer (get-buffer-create "*haskell-mode-buffer*"))
1818

19+
(when (fboundp 'jit-lock-debug-mode)
20+
;; to see stack traces from inside font-lock
21+
(jit-lock-debug-mode))
22+
1923
;; Note that all of this should work both in haskell-mode and
2024
;; outside of it. Currently we test only haskell-mode setup.
2125
(if literate
@@ -43,6 +47,29 @@ order."
4347
(goto-char (match-end 0)))
4448
(should (equal nil left-lexemes)))))
4549

50+
(defun check-lexemes-nocheck (lines-or-contents &optional literate)
51+
"Checks if tokenization works as expected.
52+
53+
LINES is a list of strings that will be inserted to a new
54+
buffer."
55+
(when (get-buffer "*haskell-mode-buffer*")
56+
(kill-buffer "*haskell-mode-buffer*"))
57+
(save-current-buffer
58+
(set-buffer (get-buffer-create "*haskell-mode-buffer*"))
59+
60+
(if (consp lines-or-contents)
61+
(dolist (line lines-or-contents)
62+
(insert line)
63+
(insert "\n"))
64+
(insert lines-or-contents))
65+
66+
(font-lock-fontify-buffer)
67+
68+
;; here we check only if tokenization did not end in exception thrown
69+
(goto-char (point-min))
70+
(while (haskell-lexeme-looking-at-token)
71+
(goto-char (match-end 0)))))
72+
4673
(ert-deftest haskell-lexeme-classify-chars-1 ()
4774
(should (equal 'varsym (haskell-lexeme-classify-by-first-char ?=)))
4875
(should (equal 'conid (haskell-lexeme-classify-by-first-char ?L)))
@@ -270,3 +297,54 @@ order."
270297
"code"
271298
"no code")
272299
'literate))
300+
301+
(ert-deftest haskell-lexeme-big-01-quasi-literal ()
302+
(check-lexemes-nocheck
303+
(concat "x = " "[th|"
304+
(make-string (* 10 1000 1000) ? )
305+
"|]")))
306+
307+
(ert-deftest haskell-lexeme-big-02-string ()
308+
(check-lexemes-nocheck
309+
(concat "x = " "\""
310+
(make-string (* 10 1000 1000) ? )
311+
"\"")))
312+
313+
(ert-deftest haskell-lexeme-big-03-string-with-escapes ()
314+
(check-lexemes-nocheck
315+
(concat "x = " "\""
316+
(let ((result "\\x01\\&,..\\NUL"))
317+
(dotimes (i 17)
318+
(setq result (concat result result)))
319+
result)
320+
"\"")))
321+
322+
(ert-deftest haskell-lexeme-big-04-long-id ()
323+
(check-lexemes-nocheck
324+
(concat "x = " (make-string 1000000 ?x))))
325+
326+
(ert-deftest haskell-lexeme-big-05-long-sym()
327+
(check-lexemes-nocheck
328+
(concat "x = " (make-string 1000000 ?+))))
329+
330+
(ert-deftest haskell-lexeme-big-06-long-module-name()
331+
(check-lexemes-nocheck
332+
(concat "x = " (make-string 10000000 ?M) ".x")))
333+
334+
(ert-deftest haskell-lexeme-big-07-many-modules-id()
335+
(check-lexemes-nocheck
336+
(concat "x = "
337+
(let ((result "M."))
338+
(dotimes (i 20)
339+
(setq result (concat result result)))
340+
result)
341+
"x")))
342+
343+
(ert-deftest haskell-lexeme-big-08-many-modules-sym()
344+
(check-lexemes-nocheck
345+
(concat "x = "
346+
(let ((result "M."))
347+
(dotimes (i 20)
348+
(setq result (concat result result)))
349+
result)
350+
"++")))

0 commit comments

Comments
 (0)
Please sign in to comment.