Skip to content

Commit 2acc8ef

Browse files
committed
Work in progress
1 parent 306e64e commit 2acc8ef

File tree

4 files changed

+272
-141
lines changed

4 files changed

+272
-141
lines changed

solid-pp/src/Solid/PP/Lexer/Fast.hs

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
module Solid.PP.Lexer.Fast where
2+
3+
import Data.Word
4+
import Data.Bits
5+
6+
-- |
7+
-- O(1) check whether a 'Word8' is one of:
8+
-- [33,35,36,37,38,42,43,45,46,47,58,60,61,62,63,64,92,94,124,126]
9+
isSymbol :: Word8 -> Bool
10+
isSymbol !c = (bitset `unsafeShiftR` i .&. 1) /= 0
11+
where
12+
!i = fromIntegral (c .&. 0b111111) :: Int
13+
14+
!bitset = case c `unsafeShiftR` 6 of
15+
0 -> 0xf400ec7a00000000 :: Word64 -- [33,35,36,37,38,42,43,45,46,47,58,60,61,62,63]
16+
1 -> 0x5000000050000001 :: Word64 -- [64,92,94,124,126]
17+
_ -> 0

solid-pp/src/Solid/PP/NewLexer.hs

+106-93
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ module Solid.PP.NewLexer where
1212

1313
import Prelude hiding (span, mod, takeWhile)
1414

15-
import Data.Char
15+
import Data.Char hiding (isSymbol)
1616
import Data.Functor
1717
import Data.Text (Text)
1818
import Data.Text.Internal (Text(..))
@@ -28,20 +28,29 @@ data Location = Location {
2828
, column :: !Int
2929
} deriving (Show, Eq) -- FIXME: Eq should not be used in production code; only compare offset instead
3030

31-
data SrcSpan = SrcSpan {
31+
adjustOffset :: Int -> Location -> Location
32+
adjustOffset n Location{..} = Location {
33+
offset = offset + n
34+
, charOffset = charOffset + n
35+
, line
36+
, column = column + n
37+
}
38+
{-# INLINE adjustOffset #-}
39+
40+
data Span = Span {
3241
start :: Location
3342
, end :: Location
3443
} deriving (Show, Eq) -- FIXME: Eq should not be used in production code; only compare offset instead
3544

36-
data Tok = Tok {
45+
data Token = Token {
3746
tokenType :: TokenType
38-
, span :: SrcSpan
47+
, span :: Span
3948
} deriving (Show, Eq)
4049

41-
textSpan :: Text -> Tok -> Text
50+
textSpan :: Text -> Token -> Text
4251
textSpan input token = textSpan__ input token.span
4352

44-
textSpan__ :: Text -> SrcSpan -> Text
53+
textSpan__ :: Text -> Span -> Text
4554
textSpan__ input span = textSpan_ input start end
4655
where
4756
start = span.start.offset
@@ -51,27 +60,59 @@ textSpan_ :: Text -> Int -> Int -> Text
5160
textSpan_ (Text arr _ _) start end = Text arr start (end - start)
5261

5362
data TokenType =
54-
Keyword
63+
-- Keyword
64+
Constructor
5565
| Identifier
56-
| QualifiedIdentifier Text Text
57-
| Constructor
58-
| QualifiedConstructor Text Text
59-
| IncompleteQualifiedName Text
60-
| Operator Text
66+
| Symbol Text
6167
| Integer
6268
| String
63-
| Symbol Char
69+
| UnterminatedString
70+
| Special Char
6471
| Comment
6572
| EndOfFile
73+
74+
-- synthetic tokens
75+
| QualifiedIdentifier
76+
| QualifiedConstructor
77+
| IncompleteQualifiedName
78+
| Projection
6679
deriving (Show, Eq)
6780

81+
union :: Span -> Span -> Span
82+
union start end = Span start.start end.end
83+
84+
synthesize :: [Token] -> [Token]
85+
synthesize = loop
86+
where
87+
loop :: [Token] -> [Token]
88+
loop = \ case
89+
[] -> []
90+
Token (Symbol ".") start : Token Identifier end : rest | start.end.offset == end.start.offset -> Token Projection (Span start.start end.end) : loop rest
91+
Token Constructor start : Token (Symbol ".") end : rest | start.end.offset == end.start.offset -> qualifiedName (union start end) rest
92+
token : rest -> token : loop rest
93+
94+
qualifiedName :: Span -> [Token] -> [Token]
95+
qualifiedName start = \ case
96+
Token Constructor name : Token (Symbol ".") end : rest | name.end.offset == end.start.offset -> qualifiedName span rest
97+
where
98+
span = union start end
99+
100+
Token t end : rest | start.end.offset == end.start.offset -> case t of
101+
Constructor -> accept QualifiedConstructor
102+
Identifier -> accept QualifiedIdentifier
103+
_ -> undefined
104+
where
105+
accept t = Token t (Span start.start end.end) : loop rest
106+
tokens@(_ : _) -> Token IncompleteQualifiedName start : loop tokens
107+
[] -> [Token IncompleteQualifiedName start]
108+
68109
data Lexer = Lexer {
69110
current :: Location
70111
, input :: Text
71112
} deriving (Show, Eq)
72113

73114
data WithSrcSpan a = WithSrcSpan {
74-
span :: SrcSpan
115+
span :: Span
75116
, value :: a
76117
} deriving (Show, Eq)
77118

@@ -111,7 +152,7 @@ takeWhile p = do
111152
let (match, rest) = T.span p lexer.input
112153
end = advanceText lexer.current match
113154
put $ Lexer end rest
114-
pure $ WithSrcSpan (SrcSpan lexer.current end) match
155+
pure $ WithSrcSpan (Span lexer.current end) match
115156

116157
takeUntil :: (Char -> Bool) -> LexerM (WithSrcSpan Text)
117158
takeUntil p = takeWhile (not . p)
@@ -125,29 +166,32 @@ consumeChar = do
125166
let start = lexer.current
126167
end = advanceChar start c
127168
put $ Lexer end rest
128-
pure $ WithSrcSpan (SrcSpan start end) c
169+
pure $ WithSrcSpan (Span start end) c
170+
171+
consumeChar_ :: LexerM ()
172+
consumeChar_ = void consumeChar
129173

130174
peekChar :: LexerM Char
131175
peekChar = do
132176
lexer <- get
133177
return $ if T.null lexer.input then '\0' else T.head lexer.input
134178

135179
-- Lexer driver
136-
tokenize :: Text -> [Tok]
180+
tokenize :: Text -> [Token]
137181
tokenize input@(Text _ off _) = loop (Lexer (Location off 0 1 1) input)
138182
where
139-
loop :: Lexer -> [Tok]
183+
loop :: Lexer -> [Token]
140184
loop lexer = case lexOne.unLexerM lexer of
141-
(_, Tok EndOfFile _) -> []
185+
(_, Token EndOfFile _) -> []
142186
(new, token) -> token : loop new
143187

144-
lexOne :: LexerM Tok
188+
lexOne :: LexerM Token
145189
lexOne = do
146190
lexer <- get
147191
mc <- peekChar
148192
case mc of
149193
c
150-
| c == '\0' -> return (Tok EndOfFile $ SrcSpan lexer.current lexer.current)
194+
| c == '\0' -> return (Token EndOfFile $ Span lexer.current lexer.current)
151195
{-
152196
| T.isPrefixOf "--" <$> (input <$> get) -> do
153197
comment <- takeUntil (== '\n')
@@ -165,93 +209,55 @@ lexOne = do
165209
else Identifier word.value
166210
pure $ Token typ word.span
167211
-}
168-
pure $ Tok Identifier word.span
212+
pure $ Token Identifier word.span
169213

170214
| isUpper c -> do
171215
word <- takeWhile isIdChar
172-
pure $ Tok Constructor word.span
173-
-- LexerM qualifiedName
216+
pure $ Token Constructor word.span
174217

175218
| isDigit c -> do
176219
num <- takeWhile isDigit
177-
pure $ Tok Integer num.span
220+
pure $ Token Integer num.span
178221

179222
| c == '"' -> do
180-
string
223+
t <- string
181224
new <- get
182-
pure $ Tok String (SrcSpan lexer.current new.current)
225+
pure $ Token t (Span lexer.current new.current)
226+
227+
| isSymbol c -> do
228+
op <- takeWhile isSymbol
229+
case op.value of
230+
"--" -> do
231+
232+
-- FIXME: improve performance
233+
--
234+
-- 1. don't need to update column
235+
-- 2. skip \n and increase line
236+
ignore <- takeWhile (/= '\n')
237+
238+
lexOne
239+
_ -> pure $ Token (Symbol op.value) op.span
183240

184-
| c `elem` operators -> do
185-
op <- takeWhile (`elem` operators)
186-
pure $ Tok (Operator op.value) op.span
187-
| c `elem` symbols -> do
241+
| c `elem` special -> do
188242
sym <- consumeChar
189-
pure $ Tok (Symbol sym.value) sym.span
243+
pure $ Token (Special sym.value) sym.span
190244
| otherwise -> do
191245
ch <- consumeChar
192-
pure $ Tok Comment ch.span -- FIXME
246+
pure $ Token Comment ch.span -- FIXME
193247

194-
string :: LexerM (WithSrcSpan Char)
248+
string :: LexerM TokenType
195249
string = loop
196250
where
251+
loop :: LexerM TokenType
197252
loop = do
198-
_ <- consumeChar
199-
_ <- takeUntil (\ c -> c == '"' || c == '\\')
253+
consumeChar_
254+
_ <- takeUntil (\ c -> c == '"' || c == '\\' || c == '\n')
200255
c <- peekChar
201256
if
202-
| c == '"' -> consumeChar
203-
| c == '\\' -> consumeChar >> loop
204-
| otherwise -> undefined -- partial string - eof
205-
206-
qualifiedName :: Lexer -> (Lexer, Tok)
207-
qualifiedName Lexer{..} = scanConstructor -1 0
208-
where
209-
scanConstructor :: Int -> Int -> (Lexer, Tok)
210-
scanConstructor lastDot !i
211-
| c == '.' = scanIdentifier (i + d)
212-
| isIdChar c = scanConstructor lastDot (i + d)
213-
| otherwise = done
214-
where
215-
done :: (Lexer, Tok)
216-
done = accept i \ match ->
217-
if lastDot < 0 then
218-
Constructor
219-
else
220-
QualifiedConstructor (Unsafe.takeWord8 (lastDot - 1) match) (Unsafe.dropWord8 lastDot match)
221-
222-
Iter c d = safeIter input i
223-
224-
scanIdentifier :: Int -> (Lexer, Tok)
225-
scanIdentifier !i
226-
| isLower c = accept (findEndOfId i) \ match ->
227-
let
228-
mod = Unsafe.takeWord8 (i - 1) match
229-
name = Unsafe.dropWord8 i match
230-
tok = QualifiedIdentifier mod name
231-
in tok
232-
| isIdChar c = scanConstructor i (i + d)
233-
| otherwise = accept i IncompleteQualifiedName
234-
where
235-
Iter c d = safeIter input i
236-
237-
findEndOfId :: Int -> Int
238-
findEndOfId !i
239-
| isIdChar c = findEndOfId (i + d)
240-
| otherwise = i
241-
where
242-
Iter c d = safeIter input i
243-
244-
accept :: Int -> (Text -> TokenType) -> (Lexer, Tok)
245-
accept n f =
246-
let
247-
match = Unsafe.takeWord8 n input
248-
rest = Unsafe.dropWord8 n input
249-
new = Lexer {
250-
current = advanceText current match
251-
, input = rest
252-
}
253-
in (new, Tok (f match) (SrcSpan current new.current))
254-
257+
| c == '"' -> consumeChar_ >> pure String
258+
| c == '\\' -> consumeChar_ >> loop
259+
| c == '\n' -> pure UnterminatedString
260+
| otherwise -> pure UnterminatedString
255261

256262
isIdChar :: Char -> Bool
257263
isIdChar c = isAlphaNum c || c == '_'
@@ -265,11 +271,18 @@ keywords =
265271
, "module", "newtype", "of", "then", "type", "where", "forall"
266272
]
267273

268-
operators :: [Char]
269-
operators = ":!#$%&*+./<=>?@\\^|-~"
270-
271274
symbols :: [Char]
272-
symbols = "(),;[]{}"
275+
symbols = ":!#$%&*+./<=>?@\\^|-~"
276+
277+
278+
-- .. | : | :: | = | \ | | | <- | -> | @ | ~ | =>
279+
reservedop = ["..", ":", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
280+
281+
isSymbol :: Char -> Bool
282+
isSymbol = (`elem` symbols)
283+
284+
special :: [Char]
285+
special = "(),;[]{}"
273286

274287
advanceChar :: Location -> Char -> Location
275288
advanceChar (Location offset o l c) ch
+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
module Solid.PP.Lexer.FastSpec (spec) where
2+
3+
import Test.Hspec
4+
import Control.Monad
5+
import Data.ByteString.Internal (c2w, w2c)
6+
import Data.List
7+
8+
import Solid.PP.Lexer.Fast as Fast
9+
10+
spec :: Spec
11+
spec = do
12+
let symbols = map c2w $ sort ":!#$%&*+./<=>?@\\^|-~"
13+
describe "isSymbol" $ do
14+
forM_ symbols $ \ c -> do
15+
it [w2c c] $ do
16+
Fast.isSymbol c `shouldBe` True
17+
18+
forM_ ([minBound .. maxBound] \\ symbols) $ \ c -> do
19+
it (show c) $ do
20+
Fast.isSymbol c `shouldBe` False

0 commit comments

Comments
 (0)