Skip to content

Commit a0c26da

Browse files
committed
FPM-566 support alt_df in AltData infilling
1 parent 82fea79 commit a0c26da

File tree

2 files changed

+50
-7
lines changed

2 files changed

+50
-7
lines changed

src/time_stream/infill.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -364,37 +364,39 @@ class AltData(InfillMethod):
364364

365365
name = "alt_data"
366366

367-
def __init__(self, alt_data_column: str, correction_factor: float = 1.0):
367+
def __init__(self, alt_data_column: str, correction_factor: float = 1.0, alt_df: pl.DataFrame | None = None):
368368
"""Initialize the alternative data infill method.
369369
370370
Args:
371371
alt_data_column: The name of the column providing the alternative data.
372372
correction_factor: An optional correction factor to apply to the alternative data.
373+
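alt_df: An optional DataFrame containing the alternative data. If None, the alternative data column must already be present in the DataFrame being infilled.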
373374
"""
374375
self.alt_data_column = alt_data_column
375376
self.correction_factor = correction_factor
377+
self.alt_df = alt_df
376378

377-
def _fill(self, df: pl.DataFrame, infill_column: str, alt_df: Optional[pl.DataFrame]) -> pl.DataFrame:
379+
def _fill(self, df: pl.DataFrame, infill_column: str) -> pl.DataFrame:
378380
"""Fill missing values using data from the alternative column.
379381
380382
Args:
381383
df: The DataFrame to infill.
382384
infill_column: The column to infill.
383-
alt_df: The DataFrame containing the alternative data.
384385
385386
Returns:
386387
pl.DataFrame with infilled values.
387388
"""
388-
if alt_df is None:
389+
breakpoint()
390+
if self.alt_df is None:
389391
check_columns_in_dataframe(df, [self.alt_data_column])
390392
else:
391-
check_columns_in_dataframe(alt_df, ['time', self.alt_data_column])
393+
check_columns_in_dataframe(self.alt_df, ['time', self.alt_data_column])
392394

393395
if self.alt_data_column in df.columns:
394396
raise ValueError(f"Column {self.alt_data_column} already exists in the main dataframe.")
395397

396398
df = df.join(
397-
alt_df.select(['time', self.alt_data_column]),
399+
self.alt_df.select(['time', self.alt_data_column]),
398400
on='time',
399401
how="left",
400402
suffix="_alt"
@@ -407,7 +409,7 @@ def _fill(self, df: pl.DataFrame, infill_column: str, alt_df: Optional[pl.DataFr
407409
.alias(self._infilled_column_name(infill_column))
408410
)
409411

410-
if alt_df is not None:
412+
if self.alt_df is not None:
411413
infilled = infilled.drop(self.alt_data_column)
412414

413415
return infilled

tests/time_stream/test_infill.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,3 +492,44 @@ def test_alt_data_infill_restricting_date_range(self) -> None:
492492
)
493493
expected_df = self.df.with_columns(pl.Series("values", [1.0, 20.0, 3.0, None, 5.0]))
494494
assert_frame_equal(result_df, expected_df, check_column_order=False)
495+
496+
def test_alt_data_infill_with_alt_data_provided(self) -> None:
497+
"""Test infilling from a provided alternative DataFrame."""
498+
alt_df = pl.DataFrame(
499+
{
500+
"timestamp": self.df["timestamp"],
501+
"alt_values_df": [11.0, 22.0, 33.0, 44.0, 55.0],
502+
}
503+
)
504+
infiller = AltData(alt_data_column="alt_values_df", alt_df=alt_df)
505+
result_df = infiller.apply(self.tf.df, self.tf.time_name, self.tf.periodicity, "values")
506+
expected_df = self.df.with_columns(pl.Series("values", [1.0, 22.0, 3.0, 44.0, 5.0]))
507+
assert_frame_equal(result_df, expected_df, check_column_order=False)
508+
509+
def test_alt_data_infill_with_alt_data_missing_time_column(self) -> None:
510+
"""Test error when provided alt_data is missing the time column."""
511+
alt_df = pl.DataFrame({"alt_values_df": [11.0, 22.0, 33.0, 44.0, 55.0]})
512+
infiller = AltData(alt_data_column="alt_values", alt_df=alt_df)
513+
with self.assertRaises(ColumnNotFoundError):
514+
infiller.apply(self.tf.df, self.tf.time_name, self.tf.periodicity, "values")
515+
516+
def test_alt_data_infill_with_alt_data_missing_data_column(self) -> None:
517+
"""Test error when provided alt_data is missing the data column."""
518+
alt_df = pl.DataFrame({"time": self.df["timestamp"]})
519+
infiller = AltData(alt_data_column="non_existent_column", alt_df=alt_df)
520+
with self.assertRaises(ColumnNotFoundError):
521+
infiller.apply(self.tf.df, self.tf.time_name, self.tf.periodicity, "values")
522+
523+
def test_alt_data_infill_with_alt_data_and_column_in_main_df(self) -> None:
524+
"""Test that a ValueError is raised when the alt data column name already exists in the main df."""
525+
alt_df = pl.DataFrame(
526+
{
527+
"timestamp": self.df["timestamp"],
528+
"alt_values": [11.0, 22.0, 33.0, 44.0, 55.0],
529+
}
530+
)
531+
infiller = AltData(alt_data_column="alt_values", alt_df=alt_df)
532+
533+
with self.assertRaises(ValueError):
534+
infiller.apply(self.tf.df, self.tf.time_name, self.tf.periodicity, "values")
535+

0 commit comments

Comments
 (0)