Work in progress

sol · sol · commit 2acc8efcfde4 · 2025-05-03T13:47:19.000+07:00
diff --git a/solid-pp/src/Solid/PP/Lexer/Fast.hs b/solid-pp/src/Solid/PP/Lexer/Fast.hs
@@ -0,0 +1,17 @@
+module Solid.PP.Lexer.Fast where
+
+import Data.Word
+import Data.Bits
+
+-- |
+-- Constant-time test for whether a byte is one of the ASCII symbol characters, i.e. one of
+-- [33,35,36,37,38,42,43,45,46,47,58,60,61,62,63,64,92,94,124,126].
+isSymbol :: Word8 -> Bool
+isSymbol !byte = (mask `unsafeShiftR` bitIndex) .&. 1 == 1
+  where
+    !bitIndex = fromIntegral (byte .&. 63) :: Int -- position of the byte within its 64-value page
+
+    !mask = case byte `unsafeShiftR` 6 of -- one 64-bit membership mask per 64-value page
+      0 -> 0xf400ec7a00000000 :: Word64 -- [33,35,36,37,38,42,43,45,46,47,58,60,61,62,63]
+      1 -> 0x5000000050000001 :: Word64 -- [64,92,94,124,126]
+      _ -> 0
diff --git a/solid-pp/src/Solid/PP/NewLexer.hs b/solid-pp/src/Solid/PP/NewLexer.hs
@@ -12,7 +12,7 @@ module Solid.PP.NewLexer where
 
 import Prelude hiding (span, mod, takeWhile)
 
-import Data.Char
+import Data.Char hiding (isSymbol)
 import Data.Functor
 import Data.Text (Text)
 import Data.Text.Internal (Text(..))
@@ -28,20 +28,29 @@ data Location = Location {
 , column :: !Int
 } deriving (Show, Eq) -- FIXME: Eq should not be used in production code; only compare offset instead
 
-data SrcSpan = SrcSpan {
+-- | Shift @offset@, @charOffset@ and @column@ of a 'Location' by @n@; @line@ is unchanged.
+adjustOffset :: Int -> Location -> Location
+adjustOffset n loc = loc {
+  offset = loc.offset + n
+, charOffset = loc.charOffset + n
+, column = loc.column + n
+}
+{-# INLINE adjustOffset #-}
+
+data Span = Span {
   start :: Location
 , end   :: Location
 } deriving (Show, Eq) -- FIXME: Eq should not be used in production code; only compare offset instead
 
-data Tok = Tok {
+data Token = Token {
   tokenType :: TokenType
-, span :: SrcSpan
+, span :: Span
 } deriving (Show, Eq)
 
-textSpan :: Text -> Tok -> Text
+textSpan :: Text -> Token -> Text
 textSpan input token = textSpan__ input token.span
 
-textSpan__ :: Text -> SrcSpan -> Text
+textSpan__ :: Text -> Span -> Text
 textSpan__ input span = textSpan_ input start end
   where
     start = span.start.offset
@@ -51,27 +60,59 @@ textSpan_ :: Text -> Int -> Int -> Text
 textSpan_ (Text arr _ _) start end = Text arr start (end - start)
 
 data TokenType =
-    Keyword
+    -- Keyword
+    Constructor
   | Identifier
-  | QualifiedIdentifier Text Text
-  | Constructor
-  | QualifiedConstructor Text Text
-  | IncompleteQualifiedName Text
-  | Operator Text
+  | Symbol Text
   | Integer
   | String
-  | Symbol Char
+  | UnterminatedString
+  | Special Char
   | Comment
   | EndOfFile
+
+  -- synthetic tokens
+  | QualifiedIdentifier
+  | QualifiedConstructor
+  | IncompleteQualifiedName
+  | Projection
   deriving (Show, Eq)
 
+union :: Span -> Span -> Span -- runs from the start of the first span to the end of the second
+union first second = Span { start = first.start, end = second.end }
+
+-- | Merge adjacent lexer tokens into synthetic ones: @.name@ becomes a 'Projection' and dotted
+-- constructor chains become qualified names; a chain ending in a dangling dot (or followed by
+-- anything other than an adjacent name) yields 'IncompleteQualifiedName'.
+synthesize :: [Token] -> [Token]
+synthesize = loop
+  where
+    loop :: [Token] -> [Token]
+    loop = \ case
+      [] -> []
+      Token (Symbol ".") dot : Token Identifier name : rest | adjacent dot name -> Token Projection (union dot name) : loop rest
+      Token Constructor name : Token (Symbol ".") dot : rest | adjacent name dot -> qualifiedName (union name dot) rest
+      token : rest -> token : loop rest
+
+    -- @prefix@ spans the module path consumed so far, including its trailing dot.
+    qualifiedName :: Span -> [Token] -> [Token]
+    qualifiedName prefix = \ case
+      Token Constructor name : Token (Symbol ".") dot : rest | adjacent prefix name && adjacent name dot -> qualifiedName (union prefix dot) rest
+      Token Constructor name : rest | adjacent prefix name -> Token QualifiedConstructor (union prefix name) : loop rest
+      Token Identifier name : rest | adjacent prefix name -> Token QualifiedIdentifier (union prefix name) : loop rest
+      tokens -> Token IncompleteQualifiedName prefix : loop tokens -- not a name, or separated by whitespace
+
+    -- Two spans are adjacent when nothing (not even whitespace) lies between them.
+    adjacent :: Span -> Span -> Bool
+    adjacent a b = a.end.offset == b.start.offset
+
 data Lexer = Lexer {
   current :: Location
 , input   :: Text
 } deriving (Show, Eq)
 
 data WithSrcSpan a = WithSrcSpan {
-  span  :: SrcSpan
+  span  :: Span
 , value :: a
 } deriving (Show, Eq)
 
@@ -111,7 +152,7 @@ takeWhile p = do
   let (match, rest) = T.span p lexer.input
       end = advanceText lexer.current match
   put $ Lexer end rest
-  pure $ WithSrcSpan (SrcSpan lexer.current end) match
+  pure $ WithSrcSpan (Span lexer.current end) match
 
 takeUntil :: (Char -> Bool) -> LexerM (WithSrcSpan Text)
 takeUntil p = takeWhile (not . p)
@@ -125,29 +166,32 @@ consumeChar = do
       let start = lexer.current
           end = advanceChar start c
       put $ Lexer end rest
-      pure $ WithSrcSpan (SrcSpan start end) c
+      pure $ WithSrcSpan (Span start end) c
+
+consumeChar_ :: LexerM () -- like 'consumeChar', but throws the consumed character and its span away
+consumeChar_ = () <$ consumeChar
 
 peekChar :: LexerM Char
 peekChar = do
   lexer <- get
   return $ if T.null lexer.input then '\0' else T.head lexer.input
 
 -- Lexer driver
-tokenize :: Text -> [Tok]
+tokenize :: Text -> [Token]
 tokenize input@(Text _ off _) = loop (Lexer (Location off 0 1 1) input)
   where
-    loop :: Lexer -> [Tok]
+    loop :: Lexer -> [Token]
     loop lexer = case lexOne.unLexerM lexer of
-      (_, Tok EndOfFile _) -> []
+      (_, Token EndOfFile _) -> []
       (new, token) -> token : loop new
 
-lexOne :: LexerM Tok
+lexOne :: LexerM Token
 lexOne = do
   lexer <- get
   mc <- peekChar
   case mc of
     c
-      | c == '\0' -> return (Tok EndOfFile $ SrcSpan lexer.current lexer.current)
+      | c == '\0' -> return (Token EndOfFile $ Span lexer.current lexer.current)
     {-
       | T.isPrefixOf "--" <$> (input <$> get) -> do
           comment <- takeUntil (== '\n')
@@ -165,93 +209,55 @@ lexOne = do
                       else Identifier word.value
           pure $ Token typ word.span
           -}
-          pure $ Tok Identifier word.span
+          pure $ Token Identifier word.span
 
       | isUpper c -> do
           word <- takeWhile isIdChar
-          pure $ Tok Constructor word.span
-          -- LexerM qualifiedName
+          pure $ Token Constructor word.span
 
       | isDigit c -> do
           num <- takeWhile isDigit
-          pure $ Tok Integer num.span
+          pure $ Token Integer num.span
 
       | c == '"' -> do
-          string
+          t <- string
           new <- get
-          pure $ Tok String (SrcSpan lexer.current new.current)
+          pure $ Token t (Span lexer.current new.current)
+
+      | isSymbol c -> do
+          op <- takeWhile isSymbol
+          case op.value of
+            "--" -> do
+
+              -- FIXME: improve performance
+              --
+              -- 1. don't need to update column
+              -- 2. skip \n and increase line
+              ignore <- takeWhile (/= '\n')
+
+              lexOne
+            _ -> pure $ Token (Symbol op.value) op.span
 
-      | c `elem` operators -> do
-          op <- takeWhile (`elem` operators)
-          pure $ Tok (Operator op.value) op.span
-      | c `elem` symbols -> do
+      | c `elem` special -> do
           sym <- consumeChar
-          pure $ Tok (Symbol sym.value) sym.span
+          pure $ Token (Special sym.value) sym.span
       | otherwise -> do
           ch <- consumeChar
-          pure $ Tok Comment ch.span -- FIXME
+          pure $ Token Comment ch.span -- FIXME
 
-string :: LexerM (WithSrcSpan Char)
+string :: LexerM TokenType
 string = loop
   where
+    loop :: LexerM TokenType
     loop = do
-      _ <- consumeChar
-      _ <- takeUntil (\ c -> c == '"' || c == '\\')
+      consumeChar_
+      _ <- takeUntil (\ c -> c == '"' || c == '\\' || c == '\n')
       c <- peekChar
       if
-        | c == '"' -> consumeChar
-        | c == '\\' -> consumeChar >> loop
-        | otherwise -> undefined -- partial string - eof
-
-qualifiedName :: Lexer -> (Lexer, Tok)
-qualifiedName Lexer{..} = scanConstructor -1 0
-  where
-    scanConstructor :: Int -> Int -> (Lexer, Tok)
-    scanConstructor lastDot !i
-      | c == '.' = scanIdentifier (i + d)
-      | isIdChar c = scanConstructor lastDot (i + d)
-      | otherwise = done
-      where
-        done :: (Lexer, Tok)
-        done = accept i \ match ->
-          if lastDot < 0 then
-            Constructor
-          else
-            QualifiedConstructor (Unsafe.takeWord8 (lastDot - 1) match) (Unsafe.dropWord8 lastDot match)
-
-        Iter c d = safeIter input i
-
-    scanIdentifier :: Int -> (Lexer, Tok)
-    scanIdentifier !i
-      | isLower c = accept (findEndOfId i) \ match ->
-          let
-            mod = Unsafe.takeWord8 (i - 1) match
-            name = Unsafe.dropWord8 i match
-            tok = QualifiedIdentifier mod name
-          in tok
-      | isIdChar c = scanConstructor i (i + d)
-      | otherwise = accept i IncompleteQualifiedName
-      where
-        Iter c d = safeIter input i
-
-    findEndOfId :: Int -> Int
-    findEndOfId !i
-      | isIdChar c = findEndOfId (i + d)
-      | otherwise = i
-      where
-        Iter c d = safeIter input i
-
-    accept :: Int -> (Text -> TokenType) -> (Lexer, Tok)
-    accept n f =
-      let
-        match = Unsafe.takeWord8 n input
-        rest = Unsafe.dropWord8 n input
-        new = Lexer {
-          current = advanceText current match
-        , input = rest
-        }
-      in (new, Tok (f match) (SrcSpan current new.current))
-
+        | c == '"'  -> consumeChar_ >> pure String
+        | c == '\\' -> consumeChar_ >> loop
+        | c == '\n' -> pure UnterminatedString
+        | otherwise -> pure UnterminatedString
 
 isIdChar :: Char -> Bool
 isIdChar c = isAlphaNum c || c == '_'
@@ -265,11 +271,18 @@ keywords =
   , "module", "newtype", "of", "then", "type", "where", "forall"
   ]
 
-operators :: [Char]
-operators = ":!#$%&*+./<=>?@\\^|-~"
-
 symbols :: [Char]
-symbols = "(),;[]{}"
+symbols = ":!#$%&*+./<=>?@\\^|-~"
+
+
+reservedop :: [Text] -- Haskell 2010 reservedop: .. | : | :: | = | \ | | | <- | -> | @ | ~ | =>
+reservedop = ["..", ":", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
+
+isSymbol :: Char -> Bool -- True for exactly the characters listed in 'symbols'
+isSymbol c = c `elem` symbols
+
+special :: [Char] -- single-character punctuation, each lexed as its own 'Special' token
+special = ['(', ')', ',', ';', '[', ']', '{', '}']
 
 advanceChar :: Location -> Char -> Location
 advanceChar (Location offset o l c) ch
diff --git a/solid-pp/test/Solid/PP/Lexer/FastSpec.hs b/solid-pp/test/Solid/PP/Lexer/FastSpec.hs
@@ -0,0 +1,20 @@
+module Solid.PP.Lexer.FastSpec (spec) where
+
+import           Test.Hspec
+import           Control.Monad
+import           Data.ByteString.Internal (c2w, w2c)
+import           Data.List
+
+import           Solid.PP.Lexer.Fast as Fast
+
+spec :: Spec
+spec = describe "isSymbol" $ do
+  -- every symbol byte must be accepted
+  forM_ symbols $ \ c ->
+    it [w2c c] $ Fast.isSymbol c `shouldBe` True
+
+  -- every other byte value must be rejected
+  forM_ (filter (`notElem` symbols) [minBound .. maxBound]) $ \ c ->
+    it (show c) $ Fast.isSymbol c `shouldBe` False
+  where
+    symbols = map c2w $ sort ":!#$%&*+./<=>?@\\^|-~"
diff --git a/solid-pp/test/Solid/PP/NewLexerSpec.hs b/solid-pp/test/Solid/PP/NewLexerSpec.hs