Skip to content

Commit d92ed1f

Browse files
Merge branch 'main' into pct-null
2 parents ac98682 + 0c1a35a commit d92ed1f

File tree

6 files changed

+618
-91
lines changed

6 files changed

+618
-91
lines changed

docs/_quarto.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,10 +239,12 @@ quartodoc:
239239
The *YAML* group contains functions that allow for the use of YAML to orchestrate validation
240240
workflows. The `yaml_interrogate()` function can be used to run a validation workflow from
241241
YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
242-
passes its own validity checks.
242+
passes its own validity checks. The `yaml_to_python()` function converts YAML configuration
243+
to equivalent Python code.
243244
contents:
244245
- name: yaml_interrogate
245246
- name: validate_yaml
247+
- name: yaml_to_python
246248
- title: Utility Functions
247249
desc: >
248250
The *Utility Functions* group contains functions that are useful for accessing metadata about

docs/demos/check-for-freshness/index.qmd

Lines changed: 72 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,61 @@
11
---
2-
pagetitle: "Examples: Validating Datetime Values and Data Freshness"
2+
pagetitle: "Examples: Validating Data Freshness"
33
notebook-links: false
44
page-navigation: false
55
toc: false
66
html-table-processing: none
77
---
88

9-
### Validating Datetime Values and Data Freshness
9+
### Validating Data Freshness
1010

11-
Use date/datetime-based validations to ensure your data is current and within expected time ranges.
11+
Use date/datetime-based validations to ensure your data is current and recent. This is critical for applications that depend on timely data updates.
1212

1313
```{python}
1414
#| echo: false
1515
1616
import pointblank as pb
1717
import polars as pl
18-
from datetime import date, timedelta
18+
from datetime import date, datetime, timedelta
1919
20-
# Supposing it is 2023-12-31, there should be data available in the last 3 days
21-
cutoff_date = date(2023, 12, 31) - timedelta(days=3)
20+
# Create sample data with mixed freshness levels
21+
freshness_data = pl.DataFrame({
22+
"data_timestamp": [
23+
datetime(2023, 12, 28, 10, 30), # 3 days ago from Dec 31
24+
datetime(2023, 12, 29, 14, 15), # 2 days ago
25+
datetime(2023, 12, 30, 9, 45), # 1 day ago
26+
datetime(2023, 12, 31, 16, 20), # Today
27+
],
28+
"sensor_id": ["TEMP_01", "TEMP_02", "TEMP_01", "TEMP_03"],
29+
"reading": [22.5, 21.8, 23.1, 22.9],
30+
"quality_score": [0.95, 0.88, 0.92, 0.97]
31+
})
32+
33+
# Assuming today is 2023-12-31, check for data freshness
34+
current_date = date(2023, 12, 31)
35+
freshness_cutoff = current_date - timedelta(days=2) # Data should be within 2 days
2236
2337
validation = (
24-
pb.Validate(
25-
data=pb.load_dataset(dataset="global_sales", tbl_type="polars")
26-
)
38+
pb.Validate(freshness_data)
2739
.specially(
28-
expr=lambda df: df.filter(pl.col("timestamp") >= cutoff_date).height > 0,
29-
brief="Recent data availability check: there is data in the last 3 days."
40+
expr=lambda df: df.filter(
41+
pl.col("data_timestamp").dt.date() >= freshness_cutoff
42+
).height > 0,
43+
brief=f"Recent data available (within 2 days of {current_date})"
3044
)
31-
.col_vals_between(
32-
columns="timestamp",
33-
left="2021-01-01",
34-
right="2023-12-31",
35-
brief="Date range validation."
45+
.col_vals_ge(
46+
columns="data_timestamp",
47+
value=current_date - timedelta(days=7), # Within last week
48+
brief="All data points are from the last week"
49+
)
50+
.specially(
51+
expr=lambda df: (
52+
df.select(pl.col("data_timestamp").max()).item().date() >= current_date
53+
),
54+
brief="Most recent data is from today"
3655
)
3756
.col_vals_not_null(
38-
columns="timestamp",
39-
brief="No missing timestamps."
57+
columns="data_timestamp",
58+
brief="No missing timestamps"
4059
)
4160
.interrogate()
4261
)
@@ -47,28 +66,47 @@ validation
4766
```python
4867
import pointblank as pb
4968
import polars as pl
50-
from datetime import date, timedelta
69+
from datetime import date, datetime, timedelta
5170

52-
# Supposing it is 2023-12-31, there should be data available in the last 3 days
53-
cutoff_date = date(2023, 12, 31) - timedelta(days=3)
71+
# Create sample data with mixed freshness levels
72+
freshness_data = pl.DataFrame({
73+
"data_timestamp": [
74+
datetime(2023, 12, 28, 10, 30), # 3 days ago from Dec 31
75+
datetime(2023, 12, 29, 14, 15), # 2 days ago
76+
datetime(2023, 12, 30, 9, 45), # 1 day ago
77+
datetime(2023, 12, 31, 16, 20), # Today
78+
],
79+
"sensor_id": ["TEMP_01", "TEMP_02", "TEMP_01", "TEMP_03"],
80+
"reading": [22.5, 21.8, 23.1, 22.9],
81+
"quality_score": [0.95, 0.88, 0.92, 0.97]
82+
})
83+
84+
# Assuming today is 2023-12-31, check for data freshness
85+
current_date = date(2023, 12, 31)
86+
freshness_cutoff = current_date - timedelta(days=2) # Data should be within 2 days
5487

5588
validation = (
56-
pb.Validate(
57-
data=pb.load_dataset(dataset="global_sales", tbl_type="polars")
58-
)
89+
pb.Validate(freshness_data)
5990
.specially(
60-
expr=lambda df: df.filter(pl.col("timestamp") >= cutoff_date).height > 0,
61-
brief="Recent data availability check: there is data in the last 3 days."
91+
expr=lambda df: df.filter(
92+
pl.col("data_timestamp").dt.date() >= freshness_cutoff
93+
).height > 0,
94+
brief=f"Recent data available (within 2 days of {current_date})"
6295
)
63-
.col_vals_between(
64-
columns="timestamp",
65-
left="2021-01-01",
66-
right="2023-12-31",
67-
brief="Date range validation."
96+
.col_vals_ge(
97+
columns="data_timestamp",
98+
value=current_date - timedelta(days=7), # Within last week
99+
brief="All data points are from the last week"
100+
)
101+
.specially(
102+
expr=lambda df: (
103+
df.select(pl.col("data_timestamp").max()).item().date() >= current_date
104+
),
105+
brief="Most recent data is from today"
68106
)
69107
.col_vals_not_null(
70-
columns="timestamp",
71-
brief="No missing timestamps."
108+
columns="data_timestamp",
109+
brief="No missing timestamps"
72110
)
73111
.interrogate()
74112
)
@@ -81,7 +119,7 @@ validation
81119

82120
```{python}
83121
# | echo: false
84-
pb.preview(pb.load_dataset(dataset="global_sales", tbl_type="polars"))
122+
pb.preview(freshness_data)
85123
```
86124

87125
</details>

docs/demos/datetime-validations/index.qmd

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
---
2+
pagetitle: "Examples: Date and Datetime Validations"
3+
notebook-links: false
4+
page-navigation: false
5+
toc: false
6+
html-table-processing: none
7+
---
8+
9+
### Date and Datetime Validations
10+
11+
**pointblank** provides comprehensive support for validating date and datetime values, including timezone-aware comparisons. This ensures temporal data quality in applications that handle time-sensitive information.
12+
13+
```{python}
14+
#| echo: false
15+
16+
import pointblank as pb
17+
import polars as pl
18+
from datetime import date, datetime
19+
import pytz
20+
21+
# Create sample data with various temporal data types
22+
temporal_data = pl.DataFrame({
23+
"order_date": [
24+
date(2023, 1, 15),
25+
date(2023, 6, 10),
26+
date(2023, 12, 5),
27+
date(2024, 3, 20)
28+
],
29+
"created_at": [
30+
datetime(2023, 1, 15, 9, 30, 0),
31+
datetime(2023, 6, 10, 14, 45, 30),
32+
datetime(2023, 12, 5, 8, 15, 0),
33+
datetime(2024, 3, 20, 17, 22, 45)
34+
],
35+
"event_time_tz": [
36+
datetime(2023, 1, 15, 9, 0, tzinfo=pytz.timezone("America/New_York")),
37+
datetime(2023, 6, 10, 12, 30, tzinfo=pytz.timezone("America/New_York")),
38+
datetime(2023, 12, 5, 15, 45, tzinfo=pytz.timezone("America/New_York")),
39+
datetime(2024, 3, 20, 18, 15, tzinfo=pytz.timezone("America/New_York"))
40+
],
41+
"order_id": [1001, 1002, 1003, 1004],
42+
"amount": [150.0, 275.5, 89.99, 420.00]
43+
})
44+
45+
validation = (
46+
pb.Validate(temporal_data)
47+
.col_vals_ge(
48+
columns="order_date",
49+
value=date(2023, 1, 1),
50+
brief="Orders are from 2023 or later"
51+
)
52+
.col_vals_between(
53+
columns="created_at",
54+
left=datetime(2023, 1, 1, 0, 0, 0),
55+
right=datetime(2024, 12, 31, 23, 59, 59),
56+
brief="Creation timestamps within expected range"
57+
)
58+
.col_vals_ge(
59+
columns="event_time_tz",
60+
value=datetime(2023, 1, 1, 8, 0, tzinfo=pytz.timezone("America/New_York")),
61+
brief="Timezone-aware events after 8 AM Eastern"
62+
)
63+
.col_schema_match(
64+
pb.Schema(
65+
columns=[
66+
("order_date", "Date"),
67+
("created_at", "Datetime(time_unit='us', time_zone=None)"),
68+
("event_time_tz", "Datetime(time_unit='us', time_zone='America/New_York')"),
69+
("order_id", "Int64"),
70+
("amount", "Float64")
71+
]
72+
),
73+
brief="Schema includes proper date/datetime types"
74+
)
75+
.interrogate()
76+
)
77+
78+
validation
79+
```
80+
81+
```python
82+
import pointblank as pb
83+
import polars as pl
84+
from datetime import date, datetime
85+
import pytz
86+
87+
# Create sample data with various temporal data types
88+
temporal_data = pl.DataFrame({
89+
"order_date": [
90+
date(2023, 1, 15),
91+
date(2023, 6, 10),
92+
date(2023, 12, 5),
93+
date(2024, 3, 20)
94+
],
95+
"created_at": [
96+
datetime(2023, 1, 15, 9, 30, 0),
97+
datetime(2023, 6, 10, 14, 45, 30),
98+
datetime(2023, 12, 5, 8, 15, 0),
99+
datetime(2024, 3, 20, 17, 22, 45)
100+
],
101+
"event_time_tz": [
102+
datetime(2023, 1, 15, 9, 0, tzinfo=pytz.timezone("America/New_York")),
103+
datetime(2023, 6, 10, 12, 30, tzinfo=pytz.timezone("America/New_York")),
104+
datetime(2023, 12, 5, 15, 45, tzinfo=pytz.timezone("America/New_York")),
105+
datetime(2024, 3, 20, 18, 15, tzinfo=pytz.timezone("America/New_York"))
106+
],
107+
"order_id": [1001, 1002, 1003, 1004],
108+
"amount": [150.0, 275.5, 89.99, 420.00]
109+
})
110+
111+
validation = (
112+
pb.Validate(temporal_data)
113+
.col_vals_ge(
114+
columns="order_date",
115+
value=date(2023, 1, 1),
116+
brief="Orders are from 2023 or later"
117+
)
118+
.col_vals_between(
119+
columns="created_at",
120+
left=datetime(2023, 1, 1, 0, 0, 0),
121+
right=datetime(2024, 12, 31, 23, 59, 59),
122+
brief="Creation timestamps within expected range"
123+
)
124+
.col_vals_ge(
125+
columns="event_time_tz",
126+
value=datetime(2023, 1, 1, 8, 0, tzinfo=pytz.timezone("America/New_York")),
127+
brief="Timezone-aware events after 8 AM Eastern"
128+
)
129+
.col_schema_match(
130+
pb.Schema(
131+
columns=[
132+
("order_date", "Date"),
133+
("created_at", "Datetime(time_unit='us', time_zone=None)"),
134+
("event_time_tz", "Datetime(time_unit='us', time_zone='America/New_York')"),
135+
("order_id", "Int64"),
136+
("amount", "Float64")
137+
]
138+
),
139+
brief="Schema includes proper date/datetime types"
140+
)
141+
.interrogate()
142+
)
143+
144+
validation
145+
```
146+
147+
<details>
148+
<summary>Preview of Input Table</summary>
149+
150+
```{python}
151+
# | echo: false
152+
pb.preview(temporal_data)
153+
```
154+
155+
</details>

docs/demos/index.qmd

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,11 @@ For far more specialized validations, modify the table with the `pre=` argument
153153
[Verifying Row and Column Counts](./check-row-column-counts/index.qmd)<br>
154154
Check the dimensions of the table with the `*_count_match()` validation methods.
155155

156-
[Validating Datetime Values and Data Freshness](./check-for-freshness/index.qmd)<br>
157-
Use date-based validations to ensure your data is current and within expected time ranges.
156+
[Validating Data Freshness](./check-for-freshness/index.qmd)<br>
157+
Use date-based validations to ensure your data is current and recent.
158+
159+
[Date and Datetime Validations](./datetime-validations/index.qmd)<br>
160+
Comprehensive examples of date, datetime, and timezone-aware datetime comparisons.
158161

159162
[Custom Validation with `specially()`](./custom-validation-specially/index.qmd)<br>
160163
Create bespoke validations using `specially()` to implement domain-specific business rules.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ dev = [
108108
"pytest-snapshot",
109109
"pytest-xdist>=3.6.1",
110110
"pytest-xdist>=3.6.1",
111+
"pytz>=2025.2",
111112
"quartodoc>=0.8.1; python_version >= '3.9'",
112113
"ruff>=0.9.9",
113114
"shiny>=1.4.0",

0 commit comments

Comments
 (0)