1- {-# LANGUAGE OverloadedStrings #-}
1+ {-# LANGUAGE OverloadedStrings #-}
2+ {-# LANGUAGE TypeApplications #-}
3+ {-# LANGUAGE ScopedTypeVariables #-}
4+ {-# LANGUAGE NumericUnderscores #-}
25module Main where
36
7+ import Control.Monad (when )
8+ import Data.Maybe
9+ import qualified Data.Map as M
10+ import qualified Data.Text as T
411import qualified DataFrame as D
12+ import qualified DataFrame.Functions as F
13+ import qualified Data.Vector.Unboxed as VU
14+ import qualified Data.Vector as V
15+ import Torch
16+
17+ import DataFrame ((|>) )
518
-- | Fit a linear regression to the housing CSV and print the first ten
-- actual vs. predicted house values.
--
-- Steps: read the CSV and encode the ocean-proximity category as a number,
-- impute missing @total_bedrooms@ values with the column mean, min-max scale
-- the feature columns, train a 'Linear' layer with gradient descent for
-- 100,000 steps (logging the loss every 10,000), then attach the predictions
-- to the original frame.
--
-- NOTE(review): every string literal below is reproduced exactly as it
-- appears in the reviewed source, including the space after the opening
-- quote. That spacing looks like a rendering artifact -- confirm the file
-- path and column names against the real CSV header before running.
main :: IO ()
main = do
    {- Feature ingestion and engineering -}
    df <- D.apply encodeProximity " ocean_proximity" <$> D.readCsv " ./data/housing.csv"
    -- total_bedrooms has nulls, so we:
    --   * drop the nulls with filterJust,
    --   * compute the mean of what remains,
    --   * impute that mean back into the full column.
    -- This could probably be a utility function.
    let meanTotalBedrooms =
            fromMaybe 0 $
                df
                    |> D.filterJust " total_bedrooms"
                    |> D.mean " total_bedrooms"
        -- The label column is excluded before scaling so it is neither
        -- normalized nor fed to the model as a feature.
        imputed =
            df
                |> D.impute " total_bedrooms" meanTotalBedrooms
                |> D.exclude [" median_house_value"]
                |> normalizeFeatures
        (r, c) = D.dimensions imputed
        features = reshape [r, c] $ asTensor (flattenFeatures imputed)
        labels =
            asTensor
                ( (VU.map realToFrac . VU.convert)
                    (D.columnAsVector @Double " median_house_value" df) ::
                    VU.Vector Float
                )

    {- Train the model -}
    putStrLn " Training linear regression model..."
    -- The layer width is taken from the feature matrix itself (c) rather than
    -- re-derived from the raw frame, so it cannot drift from what is fed in.
    untrained <- sample $ LinearSpec{in_features = c, out_features = 1}
    trained <- foldLoop untrained 100_000 $ \state i -> do
        let labels' = model state features
            loss = mseLoss labels labels'
        when (i `mod` 10_000 == 0) $
            putStrLn $ " Iteration: " ++ show i ++ " | Loss: " ++ show loss
        (state', _) <- runStep state GD loss 0.1
        pure state'

    {- Show predictions -}
    let predictions =
            D.insertUnboxedVector
                " predicted_house_value"
                (asValue @(VU.Vector Float) (model trained features))
                df
    print $ D.select [" median_house_value", " predicted_house_value"] predictions |> D.take 10
  where
    -- Numeric code for an ocean-proximity category. An unknown category is
    -- still fatal, but the error names the offending value instead of the
    -- anonymous failure that a bare map index would produce.
    encodeProximity :: T.Text -> Double
    encodeProximity op =
        fromMaybe
            (error ("main: unknown ocean_proximity category: " ++ show op))
            (M.lookup op oceanProximity)
51+
-- | Min-max scale every column of the frame in place.
--
-- Each column named by 'D.columnNames' is replaced (under the same name) by
-- @(x - min) / (max - min)@, where @min@ and @max@ are taken over that column.
-- Every column is read as 'Double'.
--
-- NOTE(review): a column whose values are all equal makes the denominator
-- zero; what the expression evaluator yields in that case is not visible
-- here -- confirm before feeding constant columns through this function.
normalizeFeatures :: D.DataFrame -> D.DataFrame
normalizeFeatures df = df |> D.fold rescale (D.columnNames df)
  where
    -- Overwrite one column with its min-max scaled version.
    rescale name = D.derive name ((col - lo) / (hi - lo))
      where
        col = F.col @Double name
        lo = F.minimum col
        hi = F.maximum col
58+
959
10- print $ D. describeColumns parsed
-- | Forward pass of the regression model: apply the linear layer to the
-- input tensor and pass the result through 'squeezeAll'.
--
-- NOTE(review): the squeeze presumably drops the size-1 output dimension so
-- predictions line up with the 1-D label tensor -- confirm against the
-- layer's @out_features@.
model :: Linear -> Tensor -> Tensor
model state = squeezeAll . linear state
1162
12- print $ D. take 5 parsed
-- | Numeric code assigned to each ocean-proximity category.
--
-- NOTE(review): the keys are reproduced exactly as they appear in the
-- reviewed source, including the space after the opening quote; that spacing
-- may be a rendering artifact -- confirm against the values in the CSV.
oceanProximity :: M.Map T.Text Double
oceanProximity =
    M.fromList
        [ (" ISLAND", 0)
        , (" NEAR OCEAN", 1)
        , (" NEAR BAY", 2)
        , (" <1H OCEAN", 3)
        , (" INLAND", 4)
        ]
1365
14- D. plotHistograms D. PlotAll D. VerticalHistogram parsed
-- | Flatten the frame's matrix form into one contiguous unboxed vector.
--
-- The vectors returned by 'D.toMatrix' are concatenated in the order they
-- appear. A single 'VU.concat' allocates the result once, whereas folding
-- with @VU.++@ re-copies the whole accumulator for every appended vector
-- (quadratic in the total number of elements).
--
-- NOTE(review): the caller reshapes the result to @[rows, cols]@, so
-- 'D.toMatrix' presumably yields one vector per row -- confirm.
flattenFeatures :: D.DataFrame -> VU.Vector Float
flattenFeatures df = VU.concat (V.toList (D.toMatrix df))
0 commit comments