feat: derive read schema from record for CSV.

mchav · mchav · commit c9bc13d0fb54 · 2026-05-12T23:54:30.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 ### New features
 * New `DataFrame.Typed.TH.deriveSchemaFromType` Template Haskell splice generates a typed schema synonym and a `HasSchema` instance from a Haskell record ADT. Pair with `DataFrame.fromRecords` / `DataFrame.toRecords` (or `DataFrame.Typed.fromRecordsTyped` / `toRecordsTyped`) to convert between `[Order]` and `DataFrame`/`TypedDataFrame OrderSchema`. Field names are translated `camelCase → snake_case` by default; the transform is configurable via `SchemaOptions`.
 * New `DataFrame.Typed.Generic` exposes `SchemaOf`/`SchemaOfRaw` plus `genericToColumns` / `genericFromColumns`, so users who prefer `GHC.Generics` over a TH splice can derive the same schema and row bridge.
+* New `DataFrame.Internal.Schema.deriveSchema` Template Haskell splice generates, from a record ADT, both a runtime `Schema` value (`orderSchema :: Schema`, suitable for `readCsvWithSchema` / `readCsvWithOpts`) and one `Expr` accessor per field (`orderCustomerId :: Expr Int`, etc.), so expression-DSL code can refer to columns by typed name without writing `col @T "snake_case_name"` at each call site. Re-exported from `DataFrame`.
 
 ### Refactor
 * The untyped Template Haskell splices (`declareColumns`, `declareColumnsFromCsvFile`, `declareColumnsFromCsvWithOpts`, `declareColumnsFromParquetFile`, `declareColumnsWithPrefix`, `declareColumnsWithPrefix'`) have moved from `DataFrame.Functions` to a new `DataFrame.TH` module (re-exported from `DataFrame`). Update imports accordingly; the bundled `dataframe.ghci` already points to the new module.
diff --git a/README.md b/README.md
@@ -305,6 +305,33 @@ Field names are translated `camelCase → snake_case` by default; override
 the translation with `deriveSchemaFromTypeWith
 defaultSchemaOptions{nameTransform = id}` (or any `String -> String`).
 
+If all you need is a runtime `Schema` to drive `readCsvWithSchema` (no
+typed-dataframe machinery), there's a companion splice, `deriveSchema`, in
+`DataFrame.Internal.Schema` (re-exported from `DataFrame`):
+
+```haskell
+$(D.deriveSchema ''Order)
+-- emits:
+--   orderSchema     :: Schema
+--   orderSchema     = makeSchema [("order_id", schemaType @Int64), ...]
+--   orderOrderId    :: Expr Int64
+--   orderOrderId    = col "order_id"
+--   orderRegion     :: Expr Text
+--   orderRegion     = col "region"
+--   orderAmount     :: Expr Double
+--   orderAmount     = col "amount"
+
+orders :: IO D.DataFrame
+orders = do
+    df <- D.readCsvWithSchema orderSchema "orders.csv"
+    pure (D.filter orderAmount (> 100) df)
+```
+
+Each record field gets a typed accessor named `<lower-first TyConName><UpperFirst FieldName>`,
+so `data Order = Order { customerId :: Int }` yields `orderCustomerId :: Expr Int`, defined as `col "customer_id"`.
+That's the same shape that `$(D.declareColumns df)` produces from a runtime
+`DataFrame`, but driven off the ADT instead of an existing frame.
+
 If you'd rather not depend on Template Haskell, the same schema is
 available via `GHC.Generics`:
 
diff --git a/src/DataFrame.hs b/src/DataFrame.hs
@@ -260,6 +260,7 @@ import DataFrame.IO.CSV as CSV (
     fromCsvBytes,
     readCsv,
     readCsvWithOpts,
+    readCsvWithSchema,
     readSeparated,
     readTsv,
     writeCsv,
@@ -309,6 +310,7 @@ import DataFrame.Internal.Row as Row (
     toRowVector,
  )
 import DataFrame.Internal.Schema as Schema (
+    deriveSchema,
     makeSchema,
     schemaType,
  )
diff --git a/src/DataFrame/Internal/Schema.hs b/src/DataFrame/Internal/Schema.hs
@@ -4,18 +4,23 @@
 {-# LANGUAGE InstanceSigs #-}
 {-# LANGUAGE RankNTypes #-}
 {-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TemplateHaskellQuotes #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE TypeFamilies #-}
 
 module DataFrame.Internal.Schema where
 
+import Data.Char (isUpper, toLower, toUpper)
 import qualified Data.Map as M
 import qualified Data.Proxy as P
 import qualified Data.Text as T
 
 import Data.Maybe (isJust)
 import Data.Type.Equality (TestEquality (..))
 import DataFrame.Internal.Column (Columnable)
+import DataFrame.Internal.Expression (Expr)
+import DataFrame.Operators (col)
+import Language.Haskell.TH
 import Type.Reflection (typeRep)
 
 -- | A runtime tag for a column’s element type.
@@ -108,3 +113,130 @@ True
 -}
 makeSchema :: [(T.Text, SchemaType)] -> Schema
 makeSchema = Schema . M.fromList
+
+{- | Auto-generate a runtime 'Schema' (and per-column @'Expr'@ accessors)
+from a record ADT.
+
+The splice reifies the record, applies @camelCase -> snake_case@ to each
+record-selector name, and emits:
+
+* a top-level @\<lower-first TyConName\>Schema :: 'Schema'@ binding suitable
+  for passing to 'DataFrame.IO.CSV.readCsvWithSchema' /
+  'DataFrame.IO.CSV.readCsvWithOpts'.
+* one @\<lower-first TyConName\>\<UpperFirst FieldName\> :: 'Expr' /ty/@ binding
+  per field, so you can refer to columns in expression DSL code by name
+  without writing @col \@/ty/ "snake_case_name"@ at every call site.
+
+@
+data Order = Order { customerId :: Int, region :: Text, amount :: Double }
+
+\$(deriveSchema ''Order)
+-- expands to:
+-- orderSchema :: Schema
+-- orderSchema = makeSchema
+--     [ ("customer_id", schemaType \@Int)
+--     , ("region",      schemaType \@Text)
+--     , ("amount",      schemaType \@Double)
+--     ]
+-- orderCustomerId :: Expr Int
+-- orderCustomerId = col "customer_id"
+-- orderRegion :: Expr Text
+-- orderRegion = col "region"
+-- orderAmount :: Expr Double
+-- orderAmount = col "amount"
+
+main = do
+    df <- D.readCsvWithSchema orderSchema "orders.csv"
+    let bigOrders = D.filterWhere (orderAmount .>. 100) df
+    ...
+@
+
+The data type must have exactly one record constructor; sum types or
+positional constructors fail the splice with a descriptive error. Field
+types must satisfy @('Columnable' a, 'Read' a)@ — the same constraints
+'schemaType' already requires.
+
+The splice also fails, with an explicit message, when a field's accessor
+name would coincide with the schema binding (a selector named @schema@ on
+@Order@ would yield a second @orderSchema@); without the check that case
+only shows up as a duplicate-declaration error at the splice site.
+-}
+deriveSchema :: Name -> DecsQ
+deriveSchema tyName = do
+    info <- reify tyName
+    fields <- extractRecordFields tyName info
+    let prefix = lowerFirst (nameBase tyName)
+        schemaName = mkName (prefix ++ "Schema")
+        -- One triple per field:
+        -- (snake_case column name, accessor binding name, field type).
+        entries =
+            [ ( camelToSnake fieldBase
+              , mkName (prefix ++ upperFirst fieldBase)
+              , fTy
+              )
+            | (fName, _bang, fTy) <- fields
+            , let fieldBase = nameBase fName
+            ]
+        -- Expression @T.pack "<name>"@, shared by the schema entries and
+        -- the accessors so both refer to the column by the same text.
+        textLit s = AppE (VarE 'T.pack) (LitE (StringL s))
+        schemaEntry (colName, _, fTy) =
+            TupE
+                [ Just (textLit colName)
+                , Just (AppTypeE (VarE 'schemaType) fTy)
+                ]
+        schemaDecls =
+            [ SigD schemaName (ConT ''Schema)
+            , ValD
+                (VarP schemaName)
+                ( NormalB
+                    (AppE (VarE 'makeSchema) (ListE (map schemaEntry entries)))
+                )
+                []
+            ]
+        accessorDecls =
+            concat
+                [ [ SigD accName (AppT (ConT ''Expr) fTy)
+                  , ValD
+                        (VarP accName)
+                        (NormalB (AppE (VarE 'col) (textLit colName)))
+                        []
+                  ]
+                | (colName, accName, fTy) <- entries
+                ]
+        -- True when some accessor would be declared under the same name
+        -- as the schema binding itself.
+        clashesWithSchema =
+            any (\(_, accName, _) -> accName == schemaName) entries
+    if clashesWithSchema
+        then
+            fail $
+                "deriveSchema: a field of "
+                    ++ show tyName
+                    ++ " would generate an accessor named "
+                    ++ nameBase schemaName
+                    ++ ", which clashes with the generated schema binding"
+        else pure (schemaDecls ++ accessorDecls)
+
+-- | Pull the field list out of a reified type: accepts a @data@ declaration
+-- with exactly one record constructor or a record-syntax @newtype@, and
+-- fails the splice with a @deriveSchema: ...@ message for anything else.
+extractRecordFields :: Name -> Info -> Q [VarBangType]
+extractRecordFields tyName info = case info of
+    TyConI (DataD _ _ _ _ [RecC _ fs] _) -> pure fs
+    TyConI (NewtypeD _ _ _ _ (RecC _ fs) _) -> pure fs
+    -- Any other @data@ shape: sum type, positional constructor, GADT, ...
+    TyConI (DataD _ n _ _ _ _) ->
+        reject (show n ++ " must have exactly one record constructor")
+    TyConI (NewtypeD _ n _ _ _ _) ->
+        reject (show n ++ " newtype must use record syntax")
+    TyConI other ->
+        reject ("unsupported declaration: " ++ show other)
+    -- Not a type constructor at all (e.g. a function or class name).
+    _ ->
+        reject (show tyName ++ " is not a data/newtype declaration")
+  where
+    -- Every failure shares the same prefix so users can trace it back to
+    -- the splice that raised it.
+    reject msg = fail ("deriveSchema: " ++ msg)
+
+-- Local @camelCase -> snake_case@ conversion. The first character is
+-- lowercased as-is; every later uppercase character is replaced by @\'_\'@
+-- followed by its lowercase form, so runs of capitals split one letter at
+-- a time (@"userID"@ becomes @"user_i_d"@). Duplicated from
+-- 'DataFrame.Typed.TH.camelToSnake' to keep this module free of any
+-- @DataFrame.Typed.*@ imports.
+camelToSnake :: String -> String
+camelToSnake [] = []
+camelToSnake (c : cs) = toLower c : concatMap snake cs
+  where
+    snake x
+        | isUpper x = ['_', toLower x]
+        | otherwise = [x]
+
+-- Lowercase the first character of a string; the rest is left untouched
+-- and the empty string maps to itself.
+lowerFirst :: String -> String
+lowerFirst str = map toLower (take 1 str) ++ drop 1 str
+
+-- Uppercase the first character of a string; the rest is left untouched
+-- and the empty string maps to itself.
+upperFirst :: String -> String
+upperFirst str = map toUpper (take 1 str) ++ drop 1 str
diff --git a/tests/Operations/Record.hs b/tests/Operations/Record.hs
@@ -15,13 +15,15 @@ module Operations.Record where
 
 import Data.Int (Int64)
 import qualified Data.Map.Strict as M
-import Data.Proxy (Proxy (..))
 import qualified Data.Text as T
+import qualified Data.Text.IO as TIO
 import GHC.Generics (Generic)
 
 import qualified DataFrame as D
+import qualified DataFrame.Functions as F
 import qualified DataFrame.Internal.Column as DI
 import qualified DataFrame.Internal.Schema as IS
+import DataFrame.Operators
 import DataFrame.Typed (Schema)
 import qualified DataFrame.Typed as DT
 
@@ -36,6 +38,7 @@ data Order = Order
     deriving (Show, Eq)
 
 $(DT.deriveSchemaFromType ''Order)
+$(D.deriveSchema ''Order)
 
 -- Nullable fields (Maybe Text -> RNullableBoxed; Maybe Int -> RNullableUnboxed).
 data User = User
@@ -46,6 +49,7 @@ data User = User
     deriving (Show, Eq)
 
 $(DT.deriveSchemaFromType ''User)
+$(D.deriveSchema ''User)
 
 -- Identity-cased: keep the record selector names verbatim.
 data Account = Account
@@ -85,6 +89,7 @@ data Wide = Wide
     deriving (Show, Eq)
 
 $(DT.deriveSchemaFromType ''Wide)
+$(D.deriveSchema ''Wide)
 
 -- Generics opt-in: derive the schema via Generic, not TH.
 data Foo = Foo
@@ -220,6 +225,117 @@ genericColumnNames = TestCase $ do
         ["foo_id", "foo_name", "foo_value"]
         (D.columnNames df)
 
+-- | Checks the key set of the spliced @orderSchema@ and the 'IS.schemaType'
+-- tag stored under each key.
+deriveSchemaSplice :: Test
+deriveSchemaSplice = TestCase $ do
+    let columns = IS.elements orderSchema
+        expectType (label, key, ty) =
+            assertEqual label (Just ty) (M.lookup key columns)
+    -- 'M.keys' returns keys in ascending order, hence the sorted list.
+    assertEqual
+        "orderSchema column names"
+        ["amount", "order_id", "region"]
+        (M.keys columns)
+    mapM_
+        expectType
+        [ ("order_id is Int64", "order_id", IS.schemaType @Int64)
+        , ("region is Text", "region", IS.schemaType @T.Text)
+        , ("amount is Double", "amount", IS.schemaType @Double)
+        ]
+
+-- | Checks that the spliced @userSchema@ records @Maybe@-wrapped field
+-- types as-is (@Maybe Text@, @Maybe Int@) next to the plain @Int64@ key.
+deriveSchemaNullable :: Test
+deriveSchemaNullable = TestCase $ do
+    let columns = IS.elements userSchema
+        expectType (label, key, ty) =
+            assertEqual label (Just ty) (M.lookup key columns)
+    -- 'M.keys' returns keys in ascending order, hence the sorted list.
+    assertEqual
+        "userSchema column names"
+        ["user_age", "user_id", "user_name"]
+        (M.keys columns)
+    mapM_
+        expectType
+        [ ("user_id is Int64", "user_id", IS.schemaType @Int64)
+        , ("user_name is Maybe Text", "user_name", IS.schemaType @(Maybe T.Text))
+        , ("user_age is Maybe Int", "user_age", IS.schemaType @(Maybe Int))
+        ]
+
+-- | Checks that the spliced @wideSchema@ holds eight entries and that the
+-- first and last of them (@f1@, @f8@) are tagged as @Int@.
+deriveSchemaWide :: Test
+deriveSchemaWide = TestCase $ do
+    let columns = IS.elements wideSchema
+        expectInt (label, key) =
+            assertEqual label (Just (IS.schemaType @Int)) (M.lookup key columns)
+    assertEqual "wideSchema has 8 keys" 8 (M.size columns)
+    mapM_
+        expectInt
+        [ ("f1 is Int", "f1")
+        , ("f8 is Int", "f8")
+        ]
+
+-- | Writes a three-row CSV to disk, reads it back through
+-- 'D.readCsvWithSchema' with the spliced @orderSchema@, and checks both the
+-- resulting column names and the rows decoded by 'D.toRecords'.
+deriveSchemaReadsCsv :: Test
+deriveSchemaReadsCsv = TestCase $ do
+    -- NOTE(review): fixed path under /tmp that is never removed; presumably
+    -- fine on POSIX CI runners — confirm for other targets.
+    let path = "/tmp/dataframe_test_deriveSchema.csv"
+        rows =
+            [ "order_id,region,amount"
+            , "1,us,10.0"
+            , "2,eu,20.5"
+            , "3,ap,30.0"
+            ]
+    TIO.writeFile path (T.unlines rows)
+    df <- D.readCsvWithSchema orderSchema path
+    assertEqual
+        "deriveSchema-driven readCsvWithSchema column names"
+        ["order_id", "region", "amount"]
+        (D.columnNames df)
+    -- A 'Left' carries the conversion error text and fails the test with it.
+    either
+        (assertFailure . T.unpack)
+        ( assertEqual
+            "deriveSchema-driven CSV parses back to records"
+            [Order 1 "us" 10.0, Order 2 "eu" 20.5, Order 3 "ap" 30.0]
+        )
+        (D.toRecords df :: Either T.Text [Order])
+
+-- | Uses the spliced accessors @orderAmount@ and @orderRegion@ inside a
+-- 'D.filterWhere' predicate and checks which records survive.
+deriveSchemaAccessorFilter :: Test
+deriveSchemaAccessorFilter = TestCase $ do
+    let df =
+            D.fromRecords
+                [ Order 1 "us" 20.0
+                , Order 2 "eu" 20.5
+                , Order 3 "ap" 30.0
+                , Order 4 "us" 25.0
+                ]
+        -- Keep rows with amount > 15.0 whose region is "us".
+        big =
+            D.filterWhere
+                (orderAmount .>. F.lit @Double 15.0 .&&. orderRegion .==. F.lit @T.Text "us")
+                df
+    case D.toRecords big :: Either T.Text [Order] of
+        Left e -> assertFailure (T.unpack e)
+        Right xs ->
+            assertEqual
+                -- Label names the function actually exercised above.
+                "accessor drives D.filterWhere (amount > 15.0 && region == \"us\")"
+                [Order 1 "us" 20.0, Order 4 "us" 25.0]
+                xs
+
+-- | Adds the spliced @orderAmount@ accessor to itself inside 'D.derive' and
+-- checks the values of the resulting @double_amount@ column.
+deriveSchemaAccessorDerive :: Test
+deriveSchemaAccessorDerive =
+    TestCase $
+        assertEqual
+            "accessor composes in derive expression"
+            [20.0, 40.0]
+            doubled
+  where
+    source =
+        D.fromRecords
+            [ Order 1 "us" 10.0
+            , Order 2 "eu" 20.0
+            ]
+    derived = D.derive "double_amount" (orderAmount + orderAmount) source
+    doubled = D.columnAsList (D.col @Double "double_amount") derived
+
 tests :: [Test]
 tests =
     [ TestLabel "basicTypedRoundTrip" basicTypedRoundTrip
@@ -233,4 +349,10 @@ tests =
     , TestLabel "wideRoundTrip" wideRoundTrip
     , TestLabel "genericRoundTrip" genericRoundTrip
     , TestLabel "genericColumnNames" genericColumnNames
+    , TestLabel "deriveSchemaSplice" deriveSchemaSplice
+    , TestLabel "deriveSchemaNullable" deriveSchemaNullable
+    , TestLabel "deriveSchemaWide" deriveSchemaWide
+    , TestLabel "deriveSchemaReadsCsv" deriveSchemaReadsCsv
+    , TestLabel "deriveSchemaAccessorFilter" deriveSchemaAccessorFilter
+    , TestLabel "deriveSchemaAccessorDerive" deriveSchemaAccessorDerive
     ]