2
2
{-# LANGUAGE MultiWayIf #-}
3
3
{-# LANGUAGE RecordWildCards #-}
4
4
5
+ {-# LANGUAGE NoFieldSelectors #-}
6
+ {-# LANGUAGE DuplicateRecordFields #-}
5
7
{-# LANGUAGE OverloadedStrings #-}
6
8
{-# LANGUAGE OverloadedRecordDot #-}
7
9
{-# LANGUAGE BlockArguments #-}
8
10
9
11
module Solid.PP.NewLexer where
10
12
11
- import Prelude hiding (mod , takeWhile )
13
+ import Prelude hiding (span , mod , takeWhile )
12
14
13
15
import Data.Char
14
16
import Data.Functor
15
17
import Data.Text (Text )
16
18
import Data.Text.Internal (Text (.. ))
17
19
import Data.Text.Unsafe (Iter (.. ))
20
+ import Data.Text.Internal.Encoding.Utf8
18
21
import qualified Data.Text.Unsafe as Unsafe
19
22
import qualified Data.Text as T
20
23
21
- data SrcLoc = SrcLoc
22
- { offset :: ! Int
23
- , line :: ! Int
24
- , column :: ! Int
25
- } deriving (Show , Eq )
24
+ data Location = Location {
25
+ offset :: ! Int
26
+ , charOffset :: ! Int
27
+ , line :: ! Int
28
+ , column :: ! Int
29
+ } deriving (Show , Eq ) -- FIXME: Eq should not be used in production code; only compare offset instead
30
+
31
+ data SrcSpan = SrcSpan {
32
+ start :: Location
33
+ , end :: Location
34
+ } deriving (Show , Eq ) -- FIXME: Eq should not be used in production code; only compare offset instead
35
+
36
+ data Tok = Tok {
37
+ tokenType :: TokenType
38
+ , span :: SrcSpan
39
+ } deriving (Show , Eq )
40
+
41
+ textSpan :: Text -> Tok -> Text
42
+ textSpan input token = textSpan__ input token. span
26
43
27
- data SrcSpan = SrcSpan
28
- { start :: SrcLoc
29
- , end :: SrcLoc
30
- } deriving (Show , Eq )
44
+ textSpan__ :: Text -> SrcSpan -> Text
45
+ textSpan__ input span = textSpan_ input start end
46
+ where
47
+ start = span . start. offset
48
+ end = span . end. offset
31
49
32
- data Token = Token
33
- { tokType :: TokenType
34
- , tokSpan :: SrcSpan
35
- } deriving (Show , Eq )
50
+ textSpan_ :: Text -> Int -> Int -> Text
51
+ textSpan_ (Text arr _ _) start end = Text arr start (end - start)
36
52
37
53
data TokenType =
38
- TokKeyword Text
39
- | Identifier Text
54
+ Keyword
55
+ | Identifier
40
56
| QualifiedIdentifier Text Text
41
- | Constructor Text
57
+ | Constructor
42
58
| QualifiedConstructor Text Text
43
59
| IncompleteQualifiedName Text
44
- | TokOperator Text
45
- | Integer Text
46
- | TokSymbol Char
47
- | TokComment Text
60
+ | Operator Text
61
+ | Integer
62
+ | String
63
+ | Symbol Char
64
+ | Comment
48
65
| EndOfFile
49
66
deriving (Show , Eq )
50
67
51
68
data Lexer = Lexer {
52
- current :: SrcLoc
69
+ current :: Location
53
70
, input :: Text
54
71
} deriving (Show , Eq )
55
72
56
- data WithSrcSpan a = WithSrcSpan
57
- { span :: SrcSpan
58
- , value :: a
59
- } deriving (Show , Eq )
73
+ data WithSrcSpan a = WithSrcSpan {
74
+ span :: SrcSpan
75
+ , value :: a
76
+ } deriving (Show , Eq )
60
77
61
78
newtype LexerM a = LexerM { unLexerM :: Lexer -> (Lexer , a ) }
62
79
@@ -76,10 +93,6 @@ instance Monad LexerM where
76
93
LexerM mb = f a
77
94
in mb s'
78
95
79
- -- running
80
- runLexer :: LexerM a -> Text -> a
81
- runLexer (LexerM m) txt = snd (m (Lexer (SrcLoc 0 1 1 ) txt))
82
-
83
96
-- primitives
84
97
get :: LexerM Lexer
85
98
get = LexerM \ s -> (s, s)
@@ -120,20 +133,21 @@ peekChar = do
120
133
return $ if T. null lexer. input then '\ 0 ' else T. head lexer. input
121
134
122
135
-- Lexer driver
123
- tokenize :: Text -> [Token ]
124
- tokenize input = loop (Lexer (SrcLoc 0 1 1 ) input)
136
+ tokenize :: Text -> [Tok ]
137
+ tokenize input@ ( Text _ off _) = loop (Lexer (Location off 0 1 1 ) input)
125
138
where
126
- loop lexer = case unLexerM lexOne lexer of
127
- (_, Token EndOfFile _) -> []
139
+ loop :: Lexer -> [Tok ]
140
+ loop lexer = case lexOne. unLexerM lexer of
141
+ (_, Tok EndOfFile _) -> []
128
142
(new, token) -> token : loop new
129
143
130
- lexOne :: LexerM Token
144
+ lexOne :: LexerM Tok
131
145
lexOne = do
132
146
lexer <- get
133
147
mc <- peekChar
134
148
case mc of
135
149
c
136
- | c == '\ 0 ' -> return (Token EndOfFile $ SrcSpan lexer. current lexer. current)
150
+ | c == '\ 0 ' -> return (Tok EndOfFile $ SrcSpan lexer. current lexer. current)
137
151
{-
138
152
| T.isPrefixOf "--" <$> (input <$> get) -> do
139
153
comment <- takeUntil (== '\n')
@@ -145,45 +159,69 @@ lexOne = do
145
159
146
160
| isLower c || c == ' _' -> do
147
161
word <- takeWhile isIdChar
162
+ {-
148
163
let typ = if word.value `elem` keywords
149
164
then TokKeyword word.value
150
165
else Identifier word.value
151
166
pure $ Token typ word.span
167
+ -}
168
+ pure $ Tok Identifier word. span
152
169
153
- | isUpper c -> LexerM qualifiedName
170
+ | isUpper c -> do
171
+ word <- takeWhile isIdChar
172
+ pure $ Tok Constructor word. span
173
+ -- LexerM qualifiedName
154
174
155
175
| isDigit c -> do
156
176
num <- takeWhile isDigit
157
- pure $ Token (Integer num. value) num. span
177
+ pure $ Tok Integer num. span
178
+
179
+ | c == ' "' -> do
180
+ string
181
+ new <- get
182
+ pure $ Tok String (SrcSpan lexer. current new. current)
183
+
158
184
| c `elem` operators -> do
159
185
op <- takeWhile (`elem` operators)
160
- pure $ Token ( TokOperator op. value) op. span
186
+ pure $ Tok ( Operator op. value) op. span
161
187
| c `elem` symbols -> do
162
188
sym <- consumeChar
163
- pure $ Token ( TokSymbol sym. value) sym. span
189
+ pure $ Tok ( Symbol sym. value) sym. span
164
190
| otherwise -> do
165
191
ch <- consumeChar
166
- pure $ Token ( TokComment $ T. pack [ch . value]) ch. span -- FIXME
192
+ pure $ Tok Comment ch. span -- FIXME
167
193
168
- qualifiedName :: Lexer -> (Lexer , Token )
194
+ string :: LexerM (WithSrcSpan Char )
195
+ string = loop
196
+ where
197
+ loop = do
198
+ _ <- consumeChar
199
+ _ <- takeUntil (\ c -> c == ' "' || c == ' \\ ' )
200
+ c <- peekChar
201
+ if
202
+ | c == ' "' -> consumeChar
203
+ | c == ' \\ ' -> consumeChar >> loop
204
+ | otherwise -> undefined -- partial string - eof
205
+
206
+ qualifiedName :: Lexer -> (Lexer , Tok )
169
207
qualifiedName Lexer {.. } = scanConstructor - 1 0
170
208
where
171
- scanConstructor :: Int -> Int -> (Lexer , Token )
209
+ scanConstructor :: Int -> Int -> (Lexer , Tok )
172
210
scanConstructor lastDot ! i
173
211
| c == ' .' = scanIdentifier (i + d)
174
212
| isIdChar c = scanConstructor lastDot (i + d)
175
213
| otherwise = done
176
214
where
177
- done :: (Lexer , Token )
215
+ done :: (Lexer , Tok )
178
216
done = accept i \ match ->
179
217
if lastDot < 0 then
180
- Constructor match
218
+ Constructor
181
219
else
182
220
QualifiedConstructor (Unsafe. takeWord8 (lastDot - 1 ) match) (Unsafe. dropWord8 lastDot match)
183
221
184
222
Iter c d = safeIter input i
185
223
186
- scanIdentifier :: Int -> (Lexer , Token )
224
+ scanIdentifier :: Int -> (Lexer , Tok )
187
225
scanIdentifier ! i
188
226
| isLower c = accept (findEndOfId i) \ match ->
189
227
let
@@ -203,7 +241,7 @@ qualifiedName Lexer{..} = scanConstructor -1 0
203
241
where
204
242
Iter c d = safeIter input i
205
243
206
- accept :: Int -> (Text -> TokenType ) -> (Lexer , Token )
244
+ accept :: Int -> (Text -> TokenType ) -> (Lexer , Tok )
207
245
accept n f =
208
246
let
209
247
match = Unsafe. takeWord8 n input
@@ -212,7 +250,7 @@ qualifiedName Lexer{..} = scanConstructor -1 0
212
250
current = advanceText current match
213
251
, input = rest
214
252
}
215
- in (new, Token (f match) (SrcSpan current new. current))
253
+ in (new, Tok (f match) (SrcSpan current new. current))
216
254
217
255
218
256
isIdChar :: Char -> Bool
@@ -233,12 +271,12 @@ operators = ":!#$%&*+./<=>?@\\^|-~"
233
271
symbols :: [Char ]
234
272
symbols = " (),;[]{}"
235
273
236
- advanceChar :: SrcLoc -> Char -> SrcLoc
237
- advanceChar (SrcLoc o l c) ch
238
- | ch == ' \n ' = SrcLoc (o + 1 ) (l + 1 ) 1
239
- | otherwise = SrcLoc (o + 1 ) l (c + 1 )
274
+ advanceChar :: Location -> Char -> Location
275
+ advanceChar (Location offset o l c) ch
276
+ | ch == ' \n ' = Location (offset + utf8Length ch) (o + 1 ) (l + 1 ) 1
277
+ | otherwise = Location (offset + utf8Length ch) (o + 1 ) l (c + 1 )
240
278
241
- advanceText :: SrcLoc -> Text -> SrcLoc
279
+ advanceText :: Location -> Text -> Location
242
280
advanceText = T. foldl' advanceChar
243
281
244
282
safeIter :: Text -> Int -> Iter
0 commit comments