@@ -23,7 +23,7 @@ import qualified Data.Vector.Mutable as VM
2323import qualified Data.Vector.Unboxed.Mutable as VUM
2424
2525import Control.Applicative ((<$>) , (<|>) , (<*>) , (<*) , (*>) , many )
26- import Control.Monad (forM_ , zipWithM_ , unless , void , replicateM_ )
26+ import Control.Monad (forM_ , zipWithM_ , unless , when , void , replicateM_ )
2727import Data.Attoparsec.Text
2828import Data.Char
2929import DataFrame.Internal.Column (Column (.. ), freezeColumn' , writeColumn , columnLength )
@@ -49,33 +49,39 @@ data ReadOptions = ReadOptions {
4949 hasHeader :: Bool ,
5050 inferTypes :: Bool ,
5151 safeRead :: Bool ,
52- rowRange :: Maybe (Int , Int ), -- (start, length)
53- seekPos :: Maybe Integer
52+ rowRange :: ! (Maybe (Int , Int )), -- (start, length)
53+ seekPos :: ! (Maybe Integer ),
54+ totalRows :: ! (Maybe Int ),
55+ leftOver :: ! T. Text ,
56+ rowsRead :: ! Int
5457}
5558
5659-- | By default we assume the file has a header, we infer the types on read
5760-- and we convert any rows with nullish objects into Maybe (safeRead).
5861defaultOptions :: ReadOptions
59- defaultOptions = ReadOptions { hasHeader = True , inferTypes = True , safeRead = True , rowRange = Nothing , seekPos = Nothing }
62+ defaultOptions = ReadOptions { hasHeader = True , inferTypes = True , safeRead = True , rowRange = Nothing , seekPos = Nothing , totalRows = Nothing , leftOver = " " , rowsRead = 0 }
6063
6164-- | Reads a CSV file from the given path.
6265-- Note this file stores intermediate temporary files
6366-- while converting the CSV from a row to a columnar format.
6467readCsv :: String -> IO DataFrame
65- readCsv = readSeparated ' ,' defaultOptions
68+ readCsv path = fst <$> readSeparated ' ,' defaultOptions path
6669
6770-- | Reads a tab separated file from the given path.
6871-- Note this file stores intermediate temporary files
6972-- while converting the CSV from a row to a columnar format.
7073readTsv :: String -> IO DataFrame
71- readTsv = readSeparated ' \t ' defaultOptions
74+ readTsv path = fst <$> readSeparated ' \t ' defaultOptions path
7275
7376-- | Reads a character separated file into a dataframe using mutable vectors.
74- readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
77+ readSeparated :: Char -> ReadOptions -> String -> IO ( DataFrame , ( Integer , T. Text , Int ))
7578readSeparated c opts path = do
76- (begin, len) <- case rowRange opts of
77- Nothing -> countRows c path >>= \ totalRows -> return (0 , if hasHeader opts then totalRows - 1 else totalRows)
78- Just (start, len) -> return (start, len)
79+ totalRows <- case totalRows opts of
80+ Nothing -> countRows c path >>= \ total -> if hasHeader opts then return (total - 1 ) else return total
81+ Just n -> if hasHeader opts then return (n - 1 ) else return n
82+ let (begin, len) = case rowRange opts of
83+ Nothing -> (0 , totalRows)
84+ Just (start, len) -> (start, min len (totalRows - rowsRead opts))
7985 withFile path ReadMode $ \ handle -> do
8086 firstRow <- map T. strip . parseSep c <$> TIO. hGetLine handle
8187 let columnNames = if hasHeader opts
@@ -84,17 +90,18 @@ readSeparated c opts path = do
8490 -- If there was no header rewind the file cursor.
8591 unless (hasHeader opts) $ hSeek handle AbsoluteSeek 0
8692
87- -- skip columns till `begin`
88- _ <- replicateM_ begin ( TIO. hGetLine handle >> return () )
93+ currPos <- hTell handle
94+ when (isJust $ seekPos opts) $ hSeek handle AbsoluteSeek (fromMaybe currPos (seekPos opts) )
8995
9096 -- Initialize mutable vectors for each column
9197 let numColumns = length columnNames
92- let numRows = len
98+ let numRows = len
9399 -- Use this row to infer the types of the rest of the column.
94100 -- TODO: this isn't robust but in so far as this is a guess anyway
95101 -- it's probably fine. But we should probably sample n rows and pick
96102 -- the most likely type from the sample.
97- dataRow <- map T. strip . parseSep c <$> TIO. hGetLine handle
103+ -- dataRow <- map T.strip . parseSep c . (<>) (leftOver opts) <$> TIO.hGetLine handle
104+ (! dataRow, ! remainder) <- readSingleLine c (leftOver opts) handle
98105
99106 -- This array will track the indices of all null values for each column.
100107 -- If any exist then the column will be an optional type.
@@ -104,18 +111,19 @@ readSeparated c opts path = do
104111 getInitialDataVectors numRows mutableCols dataRow
105112
106113 -- Read rows into the mutable vectors
107- fillColumns numRows c mutableCols nullIndices handle
114+ ( ! unconsumed, ! r) <- fillColumns numRows c mutableCols nullIndices remainder handle
108115
109116 -- Freeze the mutable vectors into immutable ones
110117 nulls' <- V. unsafeFreeze nullIndices
111118 cols <- V. mapM (freezeColumn mutableCols nulls' opts) (V. generate numColumns id )
119+ pos <- hTell handle
112120
113- return $ DataFrame {
121+ return ( DataFrame {
114122 columns = cols,
115123 freeIndices = [] ,
116124 columnIndices = M. fromList (zip columnNames [0 .. ]),
117125 dataframeDimensions = (maybe 0 columnLength (cols V. ! 0 ), V. length cols)
118- }
126+ }, (pos, unconsumed, r + 1 ))
119127{-# INLINE readSeparated #-}
120128
121129getInitialDataVectors :: Int -> VM. IOVector Column -> [T. Text ] -> IO ()
@@ -138,10 +146,22 @@ inferValueType s = let
138146 Nothing -> " Other"
139147{-# INLINE inferValueType #-}
140148
149+ readSingleLine :: Char -> T. Text -> Handle -> IO ([T. Text ], T. Text )
150+ readSingleLine c unused handle = parseWith (TIO. hGetChunk handle) (parseRow c) unused >>= \ case
151+ Fail unconsumed ctx er -> do
152+ erpos <- hTell handle
153+ fail $ " Failed to parse CSV file around " <> show erpos <> " byte; due: "
154+ <> show er <> " ; context: " <> show ctx
155+ Partial c -> do
156+ fail " Partial handler is called"
157+ Done (unconsumed :: T. Text ) (row :: [T. Text ]) -> do
158+ return (row, unconsumed)
159+
141160-- | Reads rows from the handle and stores values in mutable vectors.
142- fillColumns :: Int -> Char -> VM. IOVector Column -> VM. IOVector [(Int , T. Text )] -> Handle -> IO ()
143- fillColumns n c mutableCols nullIndices handle = do
144- input <- newIORef (mempty :: T. Text )
161+ fillColumns :: Int -> Char -> VM. IOVector Column -> VM. IOVector [(Int , T. Text )] -> T. Text -> Handle -> IO (T. Text , Int )
162+ fillColumns n c mutableCols nullIndices unused handle = do
163+ input <- newIORef unused
164+ rowsRead <- newIORef (0 :: Int )
145165 forM_ [1 .. (n - 1 )] $ \ i -> do
146166 isEOF <- hIsEOF handle
147167 input' <- readIORef input
@@ -155,7 +175,11 @@ fillColumns n c mutableCols nullIndices handle = do
155175 fail " Partial handler is called"
156176 Done (unconsumed :: T. Text ) (row :: [T. Text ]) -> do
157177 writeIORef input unconsumed
178+ modifyIORef rowsRead (+ 1 )
158179 zipWithM_ (writeValue mutableCols nullIndices i) [0 .. ] row
180+ l <- readIORef input
181+ r <- readIORef rowsRead
182+ pure (l, r)
159183{-# INLINE fillColumns #-}
160184
161185-- | Writes a value into the appropriate column, resizing the vector if necessary.
0 commit comments