Skip to content

Commit 2a65d2c

Browse files
committed
Add byte offset
1 parent 9673d85 commit 2a65d2c

File tree

2 files changed

+181
-81
lines changed

2 files changed

+181
-81
lines changed

solid-pp/src/Solid/PP/NewLexer.hs

Lines changed: 91 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2,61 +2,78 @@
22
{-# LANGUAGE MultiWayIf #-}
33
{-# LANGUAGE RecordWildCards #-}
44

5+
{-# LANGUAGE NoFieldSelectors #-}
6+
{-# LANGUAGE DuplicateRecordFields #-}
57
{-# LANGUAGE OverloadedStrings #-}
68
{-# LANGUAGE OverloadedRecordDot #-}
79
{-# LANGUAGE BlockArguments #-}
810

911
module Solid.PP.NewLexer where
1012

11-
import Prelude hiding (mod, takeWhile)
13+
import Prelude hiding (span, mod, takeWhile)
1214

1315
import Data.Char
1416
import Data.Functor
1517
import Data.Text (Text)
1618
import Data.Text.Internal (Text(..))
1719
import Data.Text.Unsafe (Iter(..))
20+
import Data.Text.Internal.Encoding.Utf8
1821
import qualified Data.Text.Unsafe as Unsafe
1922
import qualified Data.Text as T
2023

21-
data SrcLoc = SrcLoc
22-
{ offset :: !Int
23-
, line :: !Int
24-
, column :: !Int
25-
} deriving (Show, Eq)
24+
data Location = Location {
25+
offset :: !Int
26+
, charOffset :: !Int
27+
, line :: !Int
28+
, column :: !Int
29+
} deriving (Show, Eq) -- FIXME: Eq should not be used in production code; only compare offset instead
30+
31+
data SrcSpan = SrcSpan {
32+
start :: Location
33+
, end :: Location
34+
} deriving (Show, Eq) -- FIXME: Eq should not be used in production code; only compare offset instead
35+
36+
data Tok = Tok {
37+
tokenType :: TokenType
38+
, span :: SrcSpan
39+
} deriving (Show, Eq)
40+
41+
textSpan :: Text -> Tok -> Text
42+
textSpan input token = textSpan__ input token.span
2643

27-
data SrcSpan = SrcSpan
28-
{ start :: SrcLoc
29-
, end :: SrcLoc
30-
} deriving (Show, Eq)
44+
textSpan__ :: Text -> SrcSpan -> Text
45+
textSpan__ input span = textSpan_ input start end
46+
where
47+
start = span.start.offset
48+
end = span.end.offset
3149

32-
data Token = Token
33-
{ tokType :: TokenType
34-
, tokSpan :: SrcSpan
35-
} deriving (Show, Eq)
50+
textSpan_ :: Text -> Int -> Int -> Text
51+
textSpan_ (Text arr _ _) start end = Text arr start (end - start)
3652

3753
data TokenType =
38-
TokKeyword Text
39-
| Identifier Text
54+
Keyword
55+
| Identifier
4056
| QualifiedIdentifier Text Text
41-
| Constructor Text
57+
| Constructor
4258
| QualifiedConstructor Text Text
4359
| IncompleteQualifiedName Text
44-
| TokOperator Text
45-
| Integer Text
46-
| TokSymbol Char
47-
| TokComment Text
60+
| Operator Text
61+
| Integer
62+
| String
63+
| Symbol Char
64+
| Comment
4865
| EndOfFile
4966
deriving (Show, Eq)
5067

5168
data Lexer = Lexer {
52-
current :: SrcLoc
69+
current :: Location
5370
, input :: Text
5471
} deriving (Show, Eq)
5572

56-
data WithSrcSpan a = WithSrcSpan
57-
{ span :: SrcSpan
58-
, value :: a
59-
} deriving (Show, Eq)
73+
data WithSrcSpan a = WithSrcSpan {
74+
span :: SrcSpan
75+
, value :: a
76+
} deriving (Show, Eq)
6077

6178
newtype LexerM a = LexerM { unLexerM :: Lexer -> (Lexer, a) }
6279

@@ -76,10 +93,6 @@ instance Monad LexerM where
7693
LexerM mb = f a
7794
in mb s'
7895

79-
-- running
80-
runLexer :: LexerM a -> Text -> a
81-
runLexer (LexerM m) txt = snd (m (Lexer (SrcLoc 0 1 1) txt))
82-
8396
-- primitives
8497
get :: LexerM Lexer
8598
get = LexerM \s -> (s, s)
@@ -120,20 +133,21 @@ peekChar = do
120133
return $ if T.null lexer.input then '\0' else T.head lexer.input
121134

122135
-- Lexer driver
123-
tokenize :: Text -> [Token]
124-
tokenize input = loop (Lexer (SrcLoc 0 1 1) input)
136+
tokenize :: Text -> [Tok]
137+
tokenize input@(Text _ off _) = loop (Lexer (Location off 0 1 1) input)
125138
where
126-
loop lexer = case unLexerM lexOne lexer of
127-
(_, Token EndOfFile _) -> []
139+
loop :: Lexer -> [Tok]
140+
loop lexer = case lexOne.unLexerM lexer of
141+
(_, Tok EndOfFile _) -> []
128142
(new, token) -> token : loop new
129143

130-
lexOne :: LexerM Token
144+
lexOne :: LexerM Tok
131145
lexOne = do
132146
lexer <- get
133147
mc <- peekChar
134148
case mc of
135149
c
136-
| c == '\0' -> return (Token EndOfFile $ SrcSpan lexer.current lexer.current)
150+
| c == '\0' -> return (Tok EndOfFile $ SrcSpan lexer.current lexer.current)
137151
{-
138152
| T.isPrefixOf "--" <$> (input <$> get) -> do
139153
comment <- takeUntil (== '\n')
@@ -145,45 +159,69 @@ lexOne = do
145159

146160
| isLower c || c == '_' -> do
147161
word <- takeWhile isIdChar
162+
{-
148163
let typ = if word.value `elem` keywords
149164
then TokKeyword word.value
150165
else Identifier word.value
151166
pure $ Token typ word.span
167+
-}
168+
pure $ Tok Identifier word.span
152169

153-
| isUpper c -> LexerM qualifiedName
170+
| isUpper c -> do
171+
word <- takeWhile isIdChar
172+
pure $ Tok Constructor word.span
173+
-- LexerM qualifiedName
154174

155175
| isDigit c -> do
156176
num <- takeWhile isDigit
157-
pure $ Token (Integer num.value) num.span
177+
pure $ Tok Integer num.span
178+
179+
| c == '"' -> do
180+
string
181+
new <- get
182+
pure $ Tok String (SrcSpan lexer.current new.current)
183+
158184
| c `elem` operators -> do
159185
op <- takeWhile (`elem` operators)
160-
pure $ Token (TokOperator op.value) op.span
186+
pure $ Tok (Operator op.value) op.span
161187
| c `elem` symbols -> do
162188
sym <- consumeChar
163-
pure $ Token (TokSymbol sym.value) sym.span
189+
pure $ Tok (Symbol sym.value) sym.span
164190
| otherwise -> do
165191
ch <- consumeChar
166-
pure $ Token (TokComment $ T.pack [ch.value]) ch.span -- FIXME
192+
pure $ Tok Comment ch.span -- FIXME
167193

168-
qualifiedName :: Lexer -> (Lexer, Token)
194+
string :: LexerM (WithSrcSpan Char)
195+
string = loop
196+
where
197+
loop = do
198+
_ <- consumeChar
199+
_ <- takeUntil (\ c -> c == '"' || c == '\\')
200+
c <- peekChar
201+
if
202+
| c == '"' -> consumeChar
203+
| c == '\\' -> consumeChar >> loop
204+
| otherwise -> undefined -- partial string - eof
205+
206+
qualifiedName :: Lexer -> (Lexer, Tok)
169207
qualifiedName Lexer{..} = scanConstructor -1 0
170208
where
171-
scanConstructor :: Int -> Int -> (Lexer, Token)
209+
scanConstructor :: Int -> Int -> (Lexer, Tok)
172210
scanConstructor lastDot !i
173211
| c == '.' = scanIdentifier (i + d)
174212
| isIdChar c = scanConstructor lastDot (i + d)
175213
| otherwise = done
176214
where
177-
done :: (Lexer, Token)
215+
done :: (Lexer, Tok)
178216
done = accept i \ match ->
179217
if lastDot < 0 then
180-
Constructor match
218+
Constructor
181219
else
182220
QualifiedConstructor (Unsafe.takeWord8 (lastDot - 1) match) (Unsafe.dropWord8 lastDot match)
183221

184222
Iter c d = safeIter input i
185223

186-
scanIdentifier :: Int -> (Lexer, Token)
224+
scanIdentifier :: Int -> (Lexer, Tok)
187225
scanIdentifier !i
188226
| isLower c = accept (findEndOfId i) \ match ->
189227
let
@@ -203,7 +241,7 @@ qualifiedName Lexer{..} = scanConstructor -1 0
203241
where
204242
Iter c d = safeIter input i
205243

206-
accept :: Int -> (Text -> TokenType) -> (Lexer, Token)
244+
accept :: Int -> (Text -> TokenType) -> (Lexer, Tok)
207245
accept n f =
208246
let
209247
match = Unsafe.takeWord8 n input
@@ -212,7 +250,7 @@ qualifiedName Lexer{..} = scanConstructor -1 0
212250
current = advanceText current match
213251
, input = rest
214252
}
215-
in (new, Token (f match) (SrcSpan current new.current))
253+
in (new, Tok (f match) (SrcSpan current new.current))
216254

217255

218256
isIdChar :: Char -> Bool
@@ -233,12 +271,12 @@ operators = ":!#$%&*+./<=>?@\\^|-~"
233271
symbols :: [Char]
234272
symbols = "(),;[]{}"
235273

236-
advanceChar :: SrcLoc -> Char -> SrcLoc
237-
advanceChar (SrcLoc o l c) ch
238-
| ch == '\n' = SrcLoc (o + 1) (l + 1) 1
239-
| otherwise = SrcLoc (o + 1) l (c + 1)
274+
advanceChar :: Location -> Char -> Location
275+
advanceChar (Location offset o l c) ch
276+
| ch == '\n' = Location (offset + utf8Length ch) (o + 1) (l + 1) 1
277+
| otherwise = Location (offset + utf8Length ch) (o + 1) l (c + 1)
240278

241-
advanceText :: SrcLoc -> Text -> SrcLoc
279+
advanceText :: Location -> Text -> Location
242280
advanceText = T.foldl' advanceChar
243281

244282
safeIter :: Text -> Int -> Iter

0 commit comments

Comments
 (0)