Avoid per-byte loop in cstring{,Utf8} builders

hs-viktor · hs-viktor · commit 266d6da74e30 · 2023-01-12T23:30:06.000-05:00
Copy chunks of the input to the output buffer with 'memcpy', up to the shorter
of the available buffer space and the "null-free" portion of the remaining
string.  For the UTF8 version, encoded NUL bytes are located via strstr(3).
diff --git a/Data/ByteString/Builder/Prim.hs b/Data/ByteString/Builder/Prim.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
+{-# LANGUAGE BangPatterns, CPP, ScopedTypeVariables #-}
 {-# LANGUAGE MagicHash, UnboxedTuples, PatternGuards #-}
 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
 {-# LANGUAGE Trustworthy #-}
@@ -469,6 +469,7 @@ import           Data.ByteString.Builder.Prim.ASCII
 import           Foreign
 import           Foreign.C.Types
 import           Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
+import           GHC.Int (Int (..))
 import           GHC.Word (Word8 (..))
 import           GHC.Exts
 import           GHC.IO
@@ -672,50 +673,87 @@ primMapLazyByteStringBounded w =
 --
 -- @since 0.11.0.0
 cstring :: Addr# -> Builder
-cstring =
-    \addr0 -> builder $ step addr0
-  where
-    step :: Addr# -> BuildStep r -> BuildStep r
-    step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
-      | W8# ch == 0 = k br
-      | op0 == ope =
-          return $ bufferFull 1 op0 (step addr k)
-      | otherwise = do
-          IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
-                       s' -> (# s', () #)
-          let br' = BufferRange (op0 `plusPtr` 1) ope
-          step (addr `plusAddr#` 1#) k br'
-      where
-        !ch = indexWord8OffAddr# addr 0#
+cstring = \addr0 -> builder $ \k br -> do
+#if __GLASGOW_HASKELL__ >= 811
+    let len = cstringLength# addr0
+#else
+    (I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0)
+#endif
+    cstring_step addr0 len k br
+{-# INLINE cstring #-}
+
+cstring_step :: Addr# -> Int# -> BuildStep r -> BuildStep r
+cstring_step !addr !len !k br@(BufferRange op0 ope)
+    -- String is empty, process the continuation
+  | (I# len) == 0 = k br
+    -- Buffer is full, allocate some more...  We ask for just one more
+    -- byte, but the builder allocation strategy will in practice give
+    -- us more space, which we'll consume in a single step.
+  | op0 == ope =
+       return $ bufferFull 1 op0 (cstring_step addr len k)
+    -- Copy as much of the string as fits into the available buffer space.
+    -- If the string is long enough, we may have asked for less than its
+    -- full length, filling the buffer with the rest will go into the next
+    -- builder step.
+  | otherwise = do
+      let !avail@(I# avail#) = min (I# len) (ope `minusPtr` op0)
+          br' = BufferRange (op0 `plusPtr` avail) ope
+          addr' = addr `plusAddr#` avail#
+          len' = len -# avail#
+      S.memcpy op0 (Ptr addr) avail
+      cstring_step addr' len' k br'
 
 -- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
 -- Null characters can be encoded as @0xc0 0x80@.
 --
 -- @since 0.11.0.0
 cstringUtf8 :: Addr# -> Builder
-cstringUtf8 =
-    \addr0 -> builder $ step addr0
-  where
-    step :: Addr# -> BuildStep r -> BuildStep r
-    step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
-      | W8# ch == 0 = k br
-      | op0 == ope =
-          return $ bufferFull 1 op0 (step addr k)
-        -- NULL is encoded as 0xc0 0x80
-      | W8# ch == 0xc0
-      , W8# (indexWord8OffAddr# addr 1#) == 0x80 = do
-          let !(W8# nullByte#) = 0
-          IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
-                       s' -> (# s', () #)
-          let br' = BufferRange (op0 `plusPtr` 1) ope
-          step (addr `plusAddr#` 2#) k br'
-      | otherwise = do
-          IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
-                       s' -> (# s', () #)
-          let br' = BufferRange (op0 `plusPtr` 1) ope
-          step (addr `plusAddr#` 1#) k br'
-      where
-        !ch = indexWord8OffAddr# addr 0#
+cstringUtf8 = \addr0 -> builder $ \k br -> do
+#if __GLASGOW_HASKELL__ >= 811
+    let len = cstringLength# addr0
+#else
+    (I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0)
+#endif
+    nullAt <- S.c_strstr (Ptr addr0) (Ptr "\xc0\x80"#)
+    cstringUtf8_step addr0 len nullAt k br
+{-# INLINE cstringUtf8 #-}
+
+cstringUtf8_step :: Addr# -> Int# -> Ptr Word8 -> BuildStep r -> BuildStep r
+cstringUtf8_step !addr !len !nullAt !k br@(BufferRange op0@(Ptr op0#) ope)
+    -- String is empty, process the continuation
+  | (I# len) == 0 = k br
+    -- Contains no encoded nulls, use simpler 'cstring' code
+  | nullPtr == nullAt =
+      cstring_step addr len k br
+    -- Buffer is full, allocate some more...  We ask for just one more
+    -- byte, but the builder allocation strategy will in practice give
+    -- us more space, which we'll consume in a single step.
+  | op0 == ope =
+       return $ bufferFull 1 op0 (cstringUtf8_step addr len nullAt k)
+    -- We're at the encoded null-byte, append a '\0' to the buffer and
+    -- continue with the rest of the input string, after locating the
+    -- next encoded null-byte, if any.
+  | (Ptr addr) == nullAt = do
+      let !(W8# nullByte#) = 0
+      IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
+                   s' -> (# s', () #)
+      let br' = BufferRange (op0 `plusPtr` 1) ope
+          addr' = addr `plusAddr#` 2#
+          len' = len -# 2#
+      nullAt' <- S.c_strstr (Ptr addr') (Ptr "\xc0\x80"#)
+      cstringUtf8_step addr' len' nullAt' k br'
+    -- Copy as much of the null-free portion of the string as fits into the
+    -- available buffer space. If the string is long enough, we may have asked
+    -- for less than its full length, filling the buffer with the rest will go
+    -- into the next builder step.
+  | otherwise = do
+      let !nullFree = nullAt `minusPtr` (Ptr addr)
+          !avail@(I# avail#) = min nullFree (ope `minusPtr` op0)
+          br' = BufferRange (op0 `plusPtr` avail) ope
+          addr' = addr `plusAddr#` avail#
+          len' = len -# avail#
+      S.memcpy op0 (Ptr addr) avail
+      cstringUtf8_step addr' len' nullAt k br'
 
 ------------------------------------------------------------------------------
 -- Char8 encoding
diff --git a/Data/ByteString/Internal.hs b/Data/ByteString/Internal.hs
@@ -65,6 +65,7 @@ module Data.ByteString.Internal (
 
         -- * Standard C Functions
         c_strlen,
+        c_strstr,
         c_free_finalizer,
 
         memchr,
diff --git a/Data/ByteString/Internal/Type.hs b/Data/ByteString/Internal/Type.hs
@@ -82,6 +82,7 @@ module Data.ByteString.Internal.Type (
 
         -- * Standard C Functions
         c_strlen,
+        c_strstr,
         c_free_finalizer,
 
         memchr,
@@ -1001,6 +1002,9 @@ accursedUnutterablePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
 foreign import ccall unsafe "string.h strlen" c_strlen
     :: CString -> IO CSize
 
+foreign import ccall unsafe "string.h strstr" c_strstr
+    :: CString -> CString -> IO (Ptr Word8)
+
 foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer
     :: FunPtr (Ptr Word8 -> IO ())
 
diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs
@@ -33,8 +33,11 @@ testCString = testProperty "cstring" $
 
 testCStringUtf8 :: TestTree
 testCStringUtf8 = testProperty "cstringUtf8" $
-    toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
-      LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"
+    toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world\xc0\x80!"#) ==
+      LC.pack "hello" `L.append` L.singleton 0x00
+                      `L.append` LC.pack "world"
+                      `L.append` L.singleton 0x00
+                      `L.append` LC.singleton '!'
 
 ------------------------------------------------------------------------------
 -- Binary