Track more state when reading file to avoid unnecessary IO.

mchav · mchav · commit 3a395b02c638 · 2025-06-23T12:23:14.000-07:00
diff --git a/src/DataFrame/IO/CSV.hs b/src/DataFrame/IO/CSV.hs
@@ -23,7 +23,7 @@ import qualified Data.Vector.Mutable as VM
 import qualified Data.Vector.Unboxed.Mutable as VUM
 
 import Control.Applicative ((<$>), (<|>), (<*>), (<*), (*>), many)
-import Control.Monad (forM_, zipWithM_, unless, void, replicateM_)
+import Control.Monad (forM_, zipWithM_, unless, when, void, replicateM_)
 import Data.Attoparsec.Text
 import Data.Char
 import DataFrame.Internal.Column (Column(..), freezeColumn', writeColumn, columnLength)
@@ -49,33 +49,39 @@ data ReadOptions = ReadOptions {
     hasHeader :: Bool,
     inferTypes :: Bool,
     safeRead :: Bool,
-    rowRange :: Maybe (Int, Int),  -- (start, length)
-    seekPos :: Maybe Integer
+    rowRange :: !(Maybe (Int, Int)),  -- (start, length)
+    seekPos :: !(Maybe Integer),
+    totalRows :: !(Maybe Int),
+    leftOver :: !T.Text,
+    rowsRead :: !Int
 }
 
 -- | By default we assume the file has a header, we infer the types on read
 -- and we convert any rows with nullish objects into Maybe (safeRead).
 defaultOptions :: ReadOptions
-defaultOptions = ReadOptions { hasHeader = True, inferTypes = True, safeRead = True, rowRange = Nothing, seekPos = Nothing }
+defaultOptions = ReadOptions { hasHeader = True, inferTypes = True, safeRead = True, rowRange = Nothing, seekPos = Nothing, totalRows = Nothing, leftOver = "", rowsRead = 0 }
 
 -- | Reads a CSV file from the given path.
 -- Note this file stores intermediate temporary files
 -- while converting the CSV from a row to a columnar format.
 readCsv :: String -> IO DataFrame
-readCsv = readSeparated ',' defaultOptions
+readCsv path = fst <$> readSeparated ',' defaultOptions path
 
 -- | Reads a tab separated file from the given path.
 -- Note this file stores intermediate temporary files
 -- while converting the CSV from a row to a columnar format.
 readTsv :: String -> IO DataFrame
-readTsv = readSeparated '\t' defaultOptions
+readTsv path = fst <$> readSeparated '\t' defaultOptions path
 
 -- | Reads a character separated file into a dataframe using mutable vectors.
-readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
+readSeparated :: Char -> ReadOptions -> String -> IO (DataFrame, (Integer, T.Text, Int))
 readSeparated c opts path = do
-    (begin, len) <- case rowRange opts of
-            Nothing           -> countRows c path >>= \totalRows -> return (0, if hasHeader opts then totalRows - 1 else totalRows)
-            Just (start, len) -> return (start, len)
+    totalRows <- case totalRows opts of
+        Nothing -> countRows c path >>= \total -> if hasHeader opts then return (total - 1) else return total
+        Just n -> if hasHeader opts then return (n - 1) else return n
+    let (begin, len) = case rowRange opts of
+            Nothing           -> (0, totalRows)
+            Just (start, len) -> (start, min len (totalRows - rowsRead opts))
     withFile path ReadMode $ \handle -> do
         firstRow <- map T.strip . parseSep c <$> TIO.hGetLine handle
         let columnNames = if hasHeader opts
@@ -84,17 +90,18 @@ readSeparated c opts path = do
         -- If there was no header rewind the file cursor.
         unless (hasHeader opts) $ hSeek handle AbsoluteSeek 0
 
-        -- skip columns till `begin`
-        _ <- replicateM_ begin (TIO.hGetLine handle >> return () )
+        -- Resume at a saved byte offset when one is given; otherwise skip `begin` rows.
+        maybe (replicateM_ begin (void (TIO.hGetLine handle))) (hSeek handle AbsoluteSeek) (seekPos opts)
 
         -- Initialize mutable vectors for each column
         let numColumns = length columnNames
-        let numRows = len 
+        let numRows = len
         -- Use this row to infer the types of the rest of the column.
         -- TODO: this isn't robust but in so far as this is a guess anyway
         -- it's probably fine. But we should probably sample n rows and pick
         -- the most likely type from the sample.
-        dataRow <- map T.strip . parseSep c <$> TIO.hGetLine handle
+        -- Parse the first data row, seeding the parser with text left over from a prior read.
+        (!dataRow, !remainder) <- readSingleLine c (leftOver opts) handle
 
         -- This array will track the indices of all null values for each column.
         -- If any exist then the column will be an optional type.
@@ -104,18 +111,19 @@ readSeparated c opts path = do
         getInitialDataVectors numRows mutableCols dataRow
 
         -- Read rows into the mutable vectors
-        fillColumns numRows c mutableCols nullIndices handle
+        (!unconsumed, !r) <- fillColumns numRows c mutableCols nullIndices remainder handle
 
         -- Freeze the mutable vectors into immutable ones
         nulls' <- V.unsafeFreeze nullIndices
         cols <- V.mapM (freezeColumn mutableCols nulls' opts) (V.generate numColumns id)
+        pos <- hTell handle
 
-        return $ DataFrame {
+        return (DataFrame {
                 columns = cols,
                 freeIndices = [],
                 columnIndices = M.fromList (zip columnNames [0..]),
                 dataframeDimensions = (maybe 0 columnLength (cols V.! 0), V.length cols)
-            }
+            }, (pos, unconsumed, r + 1))
 {-# INLINE readSeparated #-}
 
 getInitialDataVectors :: Int -> VM.IOVector Column -> [T.Text] -> IO ()
@@ -138,10 +146,22 @@ inferValueType s = let
             Nothing -> "Other"
 {-# INLINE inferValueType #-}
 
+-- | Parse a single row, starting from the @unused@ text and pulling further
+-- chunks from the handle as needed. Returns the row and the unconsumed input.
+readSingleLine :: Char -> T.Text -> Handle -> IO ([T.Text], T.Text)
+readSingleLine c unused handle = parseWith (TIO.hGetChunk handle) (parseRow c) unused >>= \case
+    Done rest fields -> pure (fields, rest)
+    Partial _ -> fail "Partial handler is called"
+    Fail _ ctx er -> do
+        erpos <- hTell handle
+        fail $ "Failed to parse CSV file around " <> show erpos <> " byte; due: "
+            <> show er <> "; context: " <> show ctx
+
 -- | Reads rows from the handle and stores values in mutable vectors.
-fillColumns :: Int -> Char -> VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Handle -> IO ()
-fillColumns n c mutableCols nullIndices handle = do
-    input <- newIORef (mempty :: T.Text)
+fillColumns :: Int -> Char -> VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> T.Text -> Handle -> IO (T.Text, Int)
+fillColumns n c mutableCols nullIndices unused handle = do
+    input <- newIORef unused
+    rowsRead <- newIORef (0 :: Int)
     forM_ [1..(n - 1)] $ \i -> do
         isEOF <- hIsEOF handle
         input' <- readIORef input
@@ -155,7 +175,11 @@ fillColumns n c mutableCols nullIndices handle = do
                   fail "Partial handler is called"
                 Done (unconsumed :: T.Text) (row :: [T.Text]) -> do
                   writeIORef input unconsumed
+                  modifyIORef' rowsRead (+1)
                   zipWithM_ (writeValue mutableCols nullIndices i) [0..] row
+    l <- readIORef input
+    r <- readIORef rowsRead
+    pure (l, r)
 {-# INLINE fillColumns #-}
 
 -- | Writes a value into the appropriate column, resizing the vector if necessary.
diff --git a/src/DataFrame/Internal/Column.hs b/src/DataFrame/Internal/Column.hs
@@ -4,7 +4,7 @@
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE RankNTypes #-}
 {-# LANGUAGE ScopedTypeVariables #-}
-{-# LANGUAGE StrictData #-}
+{-# LANGUAGE Strict #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE FlexibleInstances #-}
diff --git a/src/DataFrame/Internal/DataFrame.hs b/src/DataFrame/Internal/DataFrame.hs
@@ -4,7 +4,7 @@
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE GADTs #-}
-{-# LANGUAGE StrictData #-}
+{-# LANGUAGE Strict #-}
 {-# LANGUAGE FlexibleContexts #-}
 module DataFrame.Internal.DataFrame where
 
diff --git a/src/DataFrame/Lazy/Internal/DataFrame.hs b/src/DataFrame/Lazy/Internal/DataFrame.hs
@@ -3,19 +3,24 @@
 {-# LANGUAGE InstanceSigs #-}
 {-# LANGUAGE ExistentialQuantification #-}
 {-# LANGUAGE AllowAmbiguousTypes #-}
+{-# LANGUAGE Strict #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE NumericUnderscores #-}
 module DataFrame.Lazy.Internal.DataFrame where
 
-import           Control.Monad (forM_)
+import           Control.Monad (forM, foldM)
 import           Data.IORef
 import           Data.Kind
+import qualified Data.List as L
 import qualified Data.Map as M
 import qualified Data.Text as T
 import qualified Data.Vector as V
 import qualified DataFrame.Internal.DataFrame as D
 import qualified DataFrame.Internal.Column as C
 import qualified DataFrame.Internal.Expression as E
 import qualified DataFrame.Operations.Core as D
+import           DataFrame.Operations.Merge
 import qualified DataFrame.Operations.Subset as D
 import qualified DataFrame.Operations.Transformations as D
 import qualified DataFrame.IO.CSV as D
@@ -37,7 +42,7 @@ data InputType = ICSV deriving Show
 data LazyDataFrame = LazyDataFrame
   { inputPath        :: FilePath
   , inputType        :: InputType
-  , operations          :: [LazyOperation]
+  , operations       :: [LazyOperation]
   , batchSize        :: Int
   } deriving Show
 
@@ -49,27 +54,33 @@ eval (Filter expr) = D.filterWhere expr
 runDataFrame :: forall a . (C.Columnable a) => LazyDataFrame -> IO D.DataFrame
 runDataFrame df = do
   let path = inputPath df
-  -- totalRows <- D.countRows ',' path
-  let batches = batchRanges 1000000 (batchSize df)
-  _ <- forM_ batches $ \ (start, end) -> do
-    -- TODO: implement specific read operations for batching that returns a seek instead of re-reading everything.
-    sdf <- D.readSeparated ',' (D.defaultOptions { D.rowRange = Just (start, (batchSize df)) }) path
-    let rdf = foldl' (\d op -> eval op d) sdf (operations df)
-    if fst (D.dimensions rdf) == 0 then return () else print rdf 
-  return (D.empty)
+  totalRows <- D.countRows ',' path
+  let batches = batchRanges totalRows (batchSize df)
+  (df', _) <- foldM (\(!accDf, (!pos, !unused, !r)) (!start, !end) -> do
+    mapM_ putStr ["Scanning: ", show start, " to ", show end, " rows out of ", show totalRows, "\n"] 
+
+    (!sdf, (!pos', !unconsumed, !rowsRead)) <- D.readSeparated ',' (
+      D.defaultOptions { D.rowRange = Just (start, batchSize df)
+                       , D.totalRows = Just totalRows
+                       , D.seekPos = pos
+                       , D.rowsRead = r
+                       , D.leftOver = unused}) path
+    let !rdf = L.foldl' (flip eval) sdf (operations df)
+    return (accDf <> rdf, (Just pos', unconsumed, rowsRead + r)) ) (D.empty, (Nothing, "", 0)) batches
+  return df'
 
 batchRanges :: Int -> Int -> [(Int, Int)]
 batchRanges n inc = go n [0,inc..n]
-  where 
+  where
     go _ []         = []
     go n [x]        = [(x, n)]
     go n (f:s:rest) =(f, s) : go n (s:rest)
 
 scanCsv :: T.Text -> LazyDataFrame
-scanCsv path = LazyDataFrame (T.unpack path) ICSV [] 1024
+scanCsv path = LazyDataFrame (T.unpack path) ICSV [] 512_000
 
 addOperation :: LazyOperation -> LazyDataFrame -> LazyDataFrame
-addOperation op df = df { operations = (operations df) ++ [op] } 
+addOperation op df = df { operations = operations df ++ [op] }
 
 derive :: C.Columnable a => T.Text -> E.Expr a -> LazyDataFrame -> LazyDataFrame
 derive name expr = addOperation (Derive name expr)
diff --git a/src/DataFrame/Operations/Core.hs b/src/DataFrame/Operations/Core.hs
@@ -6,6 +6,7 @@
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE Strict #-}
 module DataFrame.Operations.Core where
 
 import qualified Data.List as L
diff --git a/src/DataFrame/Operations/Merge.hs b/src/DataFrame/Operations/Merge.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE InstanceSigs #-}
+{-# LANGUAGE Strict #-}
 module DataFrame.Operations.Merge where
 
 import qualified Data.List as L
@@ -11,9 +12,13 @@ import qualified DataFrame.Operations.Core as D
 instance Semigroup D.DataFrame where
     (<>) :: D.DataFrame -> D.DataFrame -> D.DataFrame
     (<>) a b = let
-            columnsInBOnly = filter (\c -> not (c `elem` (D.columnNames b))) (D.columnNames b)
+            columnsInBOnly = filter (\c -> c `notElem` D.columnNames a) (D.columnNames b)
             columnsInA = D.columnNames a
-            addColumns a' b' df name = let
+            addColumns a' b' df name
+                | fst (D.dimensions a') == 0 && fst (D.dimensions b') == 0 = df
+                | fst (D.dimensions a') == 0 = D.insertColumn' name (D.getColumn name b') df
+                | fst (D.dimensions b') == 0 = D.insertColumn' name (D.getColumn name a') df
+                | otherwise = let
                         numColumnsA = (fst $ D.dimensions a')
                         numColumnsB = (fst $ D.dimensions b')
                         numColumns = max numColumnsA numColumnsB
@@ -26,4 +31,9 @@ instance Semigroup D.DataFrame where
                         Just b'' -> case optA of
                             Nothing  -> D.insertColumn' name (Just (D.leftExpandColumn numColumnsA b'')) df
                             Just a'' -> D.insertColumn' name (D.concatColumns a'' b'') df
-        in foldl' (addColumns a b) D.empty (L.union (D.columnNames a) (D.columnNames b))
+        in L.foldl' (addColumns a b) D.empty (D.columnNames a `L.union` D.columnNames b)
+
+instance Monoid D.DataFrame where
+  mempty = D.empty
+
+
diff --git a/src/DataFrame/Operations/Subset.hs b/src/DataFrame/Operations/Subset.hs
@@ -99,10 +99,10 @@ filterBy = flip filter
 filterWhere :: Expr Bool -> DataFrame -> DataFrame
 filterWhere expr df = let
     (TColumn col) = interpret @Bool df expr
-    (Just indexes) = ifoldlColumn (\s i satisfied -> if satisfied then S.insert i s else s) S.empty col
+    (Just indexes) = VU.convert . V.map (fromMaybe 0) . V.filter isJust . toVector @(Maybe Int) <$> itransform (\i satisfied -> if satisfied then Just i else Nothing) col
     c' = snd $ dataframeDimensions df
-    pick idxs col = atIndices idxs <$> col
-  in df {columns = V.map (pick indexes) (columns df), dataframeDimensions = (S.size indexes, c')}
+    pick idxs col = atIndicesStable idxs <$> col
+  in df {columns = V.map (pick indexes) (columns df), dataframeDimensions = (VU.length indexes, c')}
 
 
 -- | O(k) removes all rows with `Nothing` in a given column from the dataframe.
diff --git a/src/DataFrame/Operations/Transformations.hs b/src/DataFrame/Operations/Transformations.hs
@@ -3,6 +3,8 @@
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE Strict #-}
+{-# LANGUAGE StrictData #-}
 module DataFrame.Operations.Transformations where
 
 import qualified Data.List as L