Skip to content

Commit b3f1b4e

Browse files
authored
Lazy dataframe for computations larger than memory (#35)
Implement and expose functionality for doing basic operations on lazy dataframes.
1 parent c3afd7b commit b3f1b4e

13 files changed

Lines changed: 508 additions & 10 deletions

File tree

dataframe.cabal

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ source-repository head
2222
location: https://github.com/mchav/dataframe
2323

2424
library
25-
exposed-modules: DataFrame
25+
exposed-modules: DataFrame,
26+
DataFrame.Lazy
2627
other-modules: DataFrame.Internal.Types,
2728
DataFrame.Internal.Expression,
2829
DataFrame.Internal.Parsing,
@@ -33,20 +34,24 @@ library
3334
DataFrame.Internal.Row,
3435
DataFrame.Errors,
3536
DataFrame.Operations.Core,
37+
DataFrame.Operations.Merge,
3638
DataFrame.Operations.Subset,
3739
DataFrame.Operations.Sorting,
3840
DataFrame.Operations.Statistics,
3941
DataFrame.Operations.Transformations,
4042
DataFrame.Operations.Typing,
4143
DataFrame.Operations.Aggregation,
4244
DataFrame.Display.Terminal.Plot,
43-
DataFrame.IO.CSV
45+
DataFrame.IO.CSV,
46+
DataFrame.Lazy.IO.CSV,
47+
DataFrame.Lazy.Internal.DataFrame
4448
build-depends: base >= 4.17.2.0 && < 4.22,
4549
array ^>= 0.5,
4650
attoparsec >= 0.12 && <= 0.14.4,
4751
bytestring >= 0.11 && <= 0.12.2.0,
4852
containers >= 0.6.7 && < 0.8,
4953
directory >= 1.3.0.0 && <= 1.3.9.0,
54+
filepath >= 1.0.0.0 && <= 1.5.4.0,
5055
hashable >= 1.2 && <= 1.5.0.0,
5156
statistics >= 0.16.2.1 && <= 0.16.3.0,
5257
text >= 2.0 && <= 2.1.2,
@@ -76,7 +81,8 @@ executable dataframe
7681
DataFrame.Operations.Typing,
7782
DataFrame.Operations.Aggregation,
7883
DataFrame.Display.Terminal.Plot,
79-
DataFrame.IO.CSV
84+
DataFrame.IO.CSV,
85+
DataFrame.Lazy.IO.CSV
8086
build-depends: base >= 4.17.2.0 && < 4.22,
8187
array ^>= 0.5,
8288
attoparsec >= 0.12 && <= 0.14.4,

src/DataFrame/IO.hs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module DataFrame.IO where
2+
3+
-- | Input formats; 'CSV' is the only constructor defined here.
data InputTypes = CSV deriving Show

src/DataFrame/Internal/Column.hs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
{-# LANGUAGE OverloadedStrings #-}
55
{-# LANGUAGE RankNTypes #-}
66
{-# LANGUAGE ScopedTypeVariables #-}
7-
{-# LANGUAGE StrictData #-}
7+
{-# LANGUAGE Strict #-}
88
{-# LANGUAGE TypeApplications #-}
99
{-# LANGUAGE FlexibleContexts #-}
1010
{-# LANGUAGE FlexibleInstances #-}
@@ -521,6 +521,31 @@ expandColumn n (UnboxedColumn col) = OptionalColumn $ VB.map Just (VU.convert co
521521
expandColumn n (GroupedBoxedColumn col) = GroupedBoxedColumn $ col <> VB.replicate n VB.empty
522522
expandColumn n (GroupedUnboxedColumn col) = GroupedUnboxedColumn $ col <> VB.replicate n VU.empty
523523

524+
-- | Pad a column on the left with @n@ empty entries.
--
-- Optional columns get @n@ 'Nothing' values prepended. Boxed and unboxed
-- columns are turned into an 'OptionalColumn': every existing value is wrapped
-- in 'Just' and @n@ 'Nothing' values are prepended. Grouped columns keep their
-- constructor and get @n@ empty vectors prepended.
leftExpandColumn :: Int -> Column -> Column
525+
leftExpandColumn n (OptionalColumn col) = OptionalColumn $ VB.replicate n Nothing <> col
526+
leftExpandColumn n (BoxedColumn col) = OptionalColumn $ VB.replicate n Nothing <> VB.map Just col
527+
leftExpandColumn n (UnboxedColumn col) = OptionalColumn $ VB.replicate n Nothing <> VB.map Just (VU.convert col)
528+
leftExpandColumn n (GroupedBoxedColumn col) = GroupedBoxedColumn $ VB.replicate n VB.empty <> col
529+
leftExpandColumn n (GroupedUnboxedColumn col) = GroupedUnboxedColumn $ VB.replicate n VU.empty <> col
530+
531+
-- | Append the second column to the first.
--
-- Returns 'Just' the concatenation (@left <> right@) only when both columns
-- use the same constructor and 'testEquality' on the 'typeOf' of their
-- underlying vectors succeeds. Any other combination, including two different
-- constructors, yields 'Nothing'.
concatColumns :: Column -> Column -> Maybe Column
532+
concatColumns (OptionalColumn left) (OptionalColumn right) = case testEquality (typeOf left) (typeOf right) of
533+
Nothing -> Nothing
534+
Just Refl -> Just (OptionalColumn $ left <> right)
535+
concatColumns (BoxedColumn left) (BoxedColumn right) = case testEquality (typeOf left) (typeOf right) of
536+
Nothing -> Nothing
537+
Just Refl -> Just (BoxedColumn $ left <> right)
538+
concatColumns (UnboxedColumn left) (UnboxedColumn right) = case testEquality (typeOf left) (typeOf right) of
539+
Nothing -> Nothing
540+
Just Refl -> Just (UnboxedColumn $ left <> right)
541+
concatColumns (GroupedBoxedColumn left) (GroupedBoxedColumn right) = case testEquality (typeOf left) (typeOf right) of
542+
Nothing -> Nothing
543+
Just Refl -> Just (GroupedBoxedColumn $ left <> right)
544+
concatColumns (GroupedUnboxedColumn left) (GroupedUnboxedColumn right) = case testEquality (typeOf left) (typeOf right) of
545+
Nothing -> Nothing
546+
Just Refl -> Just (GroupedUnboxedColumn $ left <> right)
547+
-- Columns built with different constructors cannot be concatenated.
concatColumns _ _ = Nothing
548+
524549
toVector :: forall a . Columnable a => Column -> VB.Vector a
525550
toVector column@(OptionalColumn (col :: VB.Vector b)) =
526551
case testEquality (typeRep @a) (typeRep @b) of

src/DataFrame/Internal/DataFrame.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
{-# LANGUAGE ScopedTypeVariables #-}
55
{-# LANGUAGE TypeApplications #-}
66
{-# LANGUAGE GADTs #-}
7-
{-# LANGUAGE StrictData #-}
7+
{-# LANGUAGE Strict #-}
88
{-# LANGUAGE FlexibleContexts #-}
99
module DataFrame.Internal.DataFrame where
1010

src/DataFrame/Internal/Expression.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ data Expr a where
3232
Apply :: (Columnable a, Columnable b) => T.Text -> (b -> a) -> Expr b -> Expr a
3333
BinOp :: (Columnable c, Columnable b, Columnable a) => T.Text -> (c -> b -> a) -> Expr c -> Expr b -> Expr a
3434

35-
interpret :: forall a b . (Columnable a) => DataFrame -> Expr a -> TypedColumn a
35+
interpret :: forall a . (Columnable a) => DataFrame -> Expr a -> TypedColumn a
3636
interpret df (Lit value) = TColumn $ toColumn' $ V.replicate (fst $ dataframeDimensions df) value
3737
interpret df (Col name) = case getColumn name df of
3838
Nothing -> throw $ ColumnNotFoundException name "" (map fst $ M.toList $ columnIndices df)

src/DataFrame/Internal/Types.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@ import Data.Word ( Word8, Word16, Word32, Word64 )
1919
import Type.Reflection (TypeRep, typeOf, typeRep)
2020
import Data.Type.Equality (TestEquality(..))
2121

22-
type Columnable' a = (Typeable a, Show a, Ord a, Eq a)
22+
-- | Constraint synonym bundling the classes required of a type @a@:
-- 'Typeable', 'Show', 'Ord', 'Eq' and 'Read'.
type Columnable' a = (Typeable a, Show a, Ord a, Eq a, Read a)

src/DataFrame/Lazy.hs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module DataFrame.Lazy (module DataFrame.Lazy.Internal.DataFrame) where
2+
3+
import DataFrame.Lazy.Internal.DataFrame

0 commit comments

Comments
 (0)