@@ -12,7 +12,7 @@ module Solid.PP.NewLexer where
12
12
13
13
import Prelude hiding (span , mod , takeWhile )
14
14
15
- import Data.Char
15
+ import Data.Char hiding ( isSymbol )
16
16
import Data.Functor
17
17
import Data.Text (Text )
18
18
import Data.Text.Internal (Text (.. ))
@@ -28,20 +28,29 @@ data Location = Location {
28
28
, column :: ! Int
29
29
} deriving (Show , Eq ) -- FIXME: Eq should not be used in production code; only compare offset instead
30
30
31
- data SrcSpan = SrcSpan {
31
+ adjustOffset :: Int -> Location -> Location
32
+ adjustOffset n Location {.. } = Location {
33
+ offset = offset + n
34
+ , charOffset = charOffset + n
35
+ , line
36
+ , column = column + n
37
+ }
38
+ {-# INLINE adjustOffset #-}
39
+
40
+ data Span = Span {
32
41
start :: Location
33
42
, end :: Location
34
43
} deriving (Show , Eq ) -- FIXME: Eq should not be used in production code; only compare offset instead
35
44
36
- data Tok = Tok {
45
+ data Token = Token {
37
46
tokenType :: TokenType
38
- , span :: SrcSpan
47
+ , span :: Span
39
48
} deriving (Show , Eq )
40
49
41
- textSpan :: Text -> Tok -> Text
50
+ textSpan :: Text -> Token -> Text
42
51
textSpan input token = textSpan__ input token. span
43
52
44
- textSpan__ :: Text -> SrcSpan -> Text
53
+ textSpan__ :: Text -> Span -> Text
45
54
textSpan__ input span = textSpan_ input start end
46
55
where
47
56
start = span . start. offset
@@ -51,27 +60,59 @@ textSpan_ :: Text -> Int -> Int -> Text
51
60
textSpan_ (Text arr _ _) start end = Text arr start (end - start)
52
61
53
62
data TokenType =
54
- Keyword
63
+ -- Keyword
64
+ Constructor
55
65
| Identifier
56
- | QualifiedIdentifier Text Text
57
- | Constructor
58
- | QualifiedConstructor Text Text
59
- | IncompleteQualifiedName Text
60
- | Operator Text
66
+ | Symbol Text
61
67
| Integer
62
68
| String
63
- | Symbol Char
69
+ | UnterminatedString
70
+ | Special Char
64
71
| Comment
65
72
| EndOfFile
73
+
74
+ -- synthetic tokens
75
+ | QualifiedIdentifier
76
+ | QualifiedConstructor
77
+ | IncompleteQualifiedName
78
+ | Projection
66
79
deriving (Show , Eq )
67
80
81
+ union :: Span -> Span -> Span
82
+ union start end = Span start. start end. end
83
+
84
+ synthesize :: [Token ] -> [Token ]
85
+ synthesize = loop
86
+ where
87
+ loop :: [Token ] -> [Token ]
88
+ loop = \ case
89
+ [] -> []
90
+ Token (Symbol " ." ) start : Token Identifier end : rest | start. end. offset == end. start. offset -> Token Projection (Span start. start end. end) : loop rest
91
+ Token Constructor start : Token (Symbol " ." ) end : rest | start. end. offset == end. start. offset -> qualifiedName (union start end) rest
92
+ token : rest -> token : loop rest
93
+
94
+ qualifiedName :: Span -> [Token ] -> [Token ]
95
+ qualifiedName start = \ case
96
+ Token Constructor name : Token (Symbol " ." ) end : rest | name. end. offset == end. start. offset -> qualifiedName span rest
97
+ where
98
+ span = union start end
99
+
100
+ Token t end : rest | start. end. offset == end. start. offset -> case t of
101
+ Constructor -> accept QualifiedConstructor
102
+ Identifier -> accept QualifiedIdentifier
103
+ _ -> undefined
104
+ where
105
+ accept t = Token t (Span start. start end. end) : loop rest
106
+ tokens@ (_ : _) -> Token IncompleteQualifiedName start : loop tokens
107
+ [] -> [Token IncompleteQualifiedName start]
108
+
68
109
data Lexer = Lexer {
69
110
current :: Location
70
111
, input :: Text
71
112
} deriving (Show , Eq )
72
113
73
114
data WithSrcSpan a = WithSrcSpan {
74
- span :: SrcSpan
115
+ span :: Span
75
116
, value :: a
76
117
} deriving (Show , Eq )
77
118
@@ -111,7 +152,7 @@ takeWhile p = do
111
152
let (match, rest) = T. span p lexer. input
112
153
end = advanceText lexer. current match
113
154
put $ Lexer end rest
114
- pure $ WithSrcSpan (SrcSpan lexer. current end) match
155
+ pure $ WithSrcSpan (Span lexer. current end) match
115
156
116
157
takeUntil :: (Char -> Bool ) -> LexerM (WithSrcSpan Text )
117
158
takeUntil p = takeWhile (not . p)
@@ -125,29 +166,32 @@ consumeChar = do
125
166
let start = lexer. current
126
167
end = advanceChar start c
127
168
put $ Lexer end rest
128
- pure $ WithSrcSpan (SrcSpan start end) c
169
+ pure $ WithSrcSpan (Span start end) c
170
+
171
+ consumeChar_ :: LexerM ()
172
+ consumeChar_ = void consumeChar
129
173
130
174
peekChar :: LexerM Char
131
175
peekChar = do
132
176
lexer <- get
133
177
return $ if T. null lexer. input then '\ 0 ' else T. head lexer. input
134
178
135
179
-- Lexer driver
136
- tokenize :: Text -> [Tok ]
180
+ tokenize :: Text -> [Token ]
137
181
tokenize input@ (Text _ off _) = loop (Lexer (Location off 0 1 1 ) input)
138
182
where
139
- loop :: Lexer -> [Tok ]
183
+ loop :: Lexer -> [Token ]
140
184
loop lexer = case lexOne. unLexerM lexer of
141
- (_, Tok EndOfFile _) -> []
185
+ (_, Token EndOfFile _) -> []
142
186
(new, token) -> token : loop new
143
187
144
- lexOne :: LexerM Tok
188
+ lexOne :: LexerM Token
145
189
lexOne = do
146
190
lexer <- get
147
191
mc <- peekChar
148
192
case mc of
149
193
c
150
- | c == '\ 0 ' -> return (Tok EndOfFile $ SrcSpan lexer. current lexer. current)
194
+ | c == '\ 0 ' -> return (Token EndOfFile $ Span lexer. current lexer. current)
151
195
{-
152
196
| T.isPrefixOf "--" <$> (input <$> get) -> do
153
197
comment <- takeUntil (== '\n')
@@ -165,93 +209,55 @@ lexOne = do
165
209
else Identifier word.value
166
210
pure $ Token typ word.span
167
211
-}
168
- pure $ Tok Identifier word. span
212
+ pure $ Token Identifier word. span
169
213
170
214
| isUpper c -> do
171
215
word <- takeWhile isIdChar
172
- pure $ Tok Constructor word. span
173
- -- LexerM qualifiedName
216
+ pure $ Token Constructor word. span
174
217
175
218
| isDigit c -> do
176
219
num <- takeWhile isDigit
177
- pure $ Tok Integer num. span
220
+ pure $ Token Integer num. span
178
221
179
222
| c == ' "' -> do
180
- string
223
+ t <- string
181
224
new <- get
182
- pure $ Tok String (SrcSpan lexer. current new. current)
225
+ pure $ Token t (Span lexer. current new. current)
226
+
227
+ | isSymbol c -> do
228
+ op <- takeWhile isSymbol
229
+ case op. value of
230
+ " --" -> do
231
+
232
+ -- FIXME: improve performance
233
+ --
234
+ -- 1. don't need to update column
235
+ -- 2. skip \n and increase line
236
+ ignore <- takeWhile (/= ' \n ' )
237
+
238
+ lexOne
239
+ _ -> pure $ Token (Symbol op. value) op. span
183
240
184
- | c `elem` operators -> do
185
- op <- takeWhile (`elem` operators)
186
- pure $ Tok (Operator op. value) op. span
187
- | c `elem` symbols -> do
241
+ | c `elem` special -> do
188
242
sym <- consumeChar
189
- pure $ Tok ( Symbol sym. value) sym. span
243
+ pure $ Token ( Special sym. value) sym. span
190
244
| otherwise -> do
191
245
ch <- consumeChar
192
- pure $ Tok Comment ch. span -- FIXME
246
+ pure $ Token Comment ch. span -- FIXME
193
247
194
- string :: LexerM ( WithSrcSpan Char )
248
+ string :: LexerM TokenType
195
249
string = loop
196
250
where
251
+ loop :: LexerM TokenType
197
252
loop = do
198
- _ <- consumeChar
199
- _ <- takeUntil (\ c -> c == ' "' || c == ' \\ ' )
253
+ consumeChar_
254
+ _ <- takeUntil (\ c -> c == ' "' || c == ' \\ ' || c == ' \n ' )
200
255
c <- peekChar
201
256
if
202
- | c == ' "' -> consumeChar
203
- | c == ' \\ ' -> consumeChar >> loop
204
- | otherwise -> undefined -- partial string - eof
205
-
206
- qualifiedName :: Lexer -> (Lexer , Tok )
207
- qualifiedName Lexer {.. } = scanConstructor - 1 0
208
- where
209
- scanConstructor :: Int -> Int -> (Lexer , Tok )
210
- scanConstructor lastDot ! i
211
- | c == ' .' = scanIdentifier (i + d)
212
- | isIdChar c = scanConstructor lastDot (i + d)
213
- | otherwise = done
214
- where
215
- done :: (Lexer , Tok )
216
- done = accept i \ match ->
217
- if lastDot < 0 then
218
- Constructor
219
- else
220
- QualifiedConstructor (Unsafe. takeWord8 (lastDot - 1 ) match) (Unsafe. dropWord8 lastDot match)
221
-
222
- Iter c d = safeIter input i
223
-
224
- scanIdentifier :: Int -> (Lexer , Tok )
225
- scanIdentifier ! i
226
- | isLower c = accept (findEndOfId i) \ match ->
227
- let
228
- mod = Unsafe. takeWord8 (i - 1 ) match
229
- name = Unsafe. dropWord8 i match
230
- tok = QualifiedIdentifier mod name
231
- in tok
232
- | isIdChar c = scanConstructor i (i + d)
233
- | otherwise = accept i IncompleteQualifiedName
234
- where
235
- Iter c d = safeIter input i
236
-
237
- findEndOfId :: Int -> Int
238
- findEndOfId ! i
239
- | isIdChar c = findEndOfId (i + d)
240
- | otherwise = i
241
- where
242
- Iter c d = safeIter input i
243
-
244
- accept :: Int -> (Text -> TokenType ) -> (Lexer , Tok )
245
- accept n f =
246
- let
247
- match = Unsafe. takeWord8 n input
248
- rest = Unsafe. dropWord8 n input
249
- new = Lexer {
250
- current = advanceText current match
251
- , input = rest
252
- }
253
- in (new, Tok (f match) (SrcSpan current new. current))
254
-
257
+ | c == ' "' -> consumeChar_ >> pure String
258
+ | c == ' \\ ' -> consumeChar_ >> loop
259
+ | c == ' \n ' -> pure UnterminatedString
260
+ | otherwise -> pure UnterminatedString
255
261
256
262
isIdChar :: Char -> Bool
257
263
isIdChar c = isAlphaNum c || c == ' _'
@@ -265,11 +271,18 @@ keywords =
265
271
, " module" , " newtype" , " of" , " then" , " type" , " where" , " forall"
266
272
]
267
273
268
- operators :: [Char ]
269
- operators = " :!#$%&*+./<=>?@\\ ^|-~"
270
-
271
274
symbols :: [Char ]
272
- symbols = " (),;[]{}"
275
+ symbols = " :!#$%&*+./<=>?@\\ ^|-~"
276
+
277
+
278
+ -- .. | : | :: | = | \ | | | <- | -> | @ | ~ | =>
279
+ reservedop = [" .." , " :" , " ::" , " =" , " \\ " , " |" , " <-" , " ->" , " @" , " ~" , " =>" ]
280
+
281
+ isSymbol :: Char -> Bool
282
+ isSymbol = (`elem` symbols)
283
+
284
+ special :: [Char ]
285
+ special = " (),;[]{}"
273
286
274
287
advanceChar :: Location -> Char -> Location
275
288
advanceChar (Location offset o l c) ch
0 commit comments