From 058c7470cee5ee90b67b25063fba24a3fb05c8cc Mon Sep 17 00:00:00 2001 From: Luis Fuentes Date: Thu, 12 Mar 2026 17:07:20 -0500 Subject: [PATCH 1/2] feat(data-platform): complete phase 2.1 hardening --- .env.example | 5 +- .github/workflows/ci.yml | 42 ++++++++++++++++ .gitignore | 3 ++ README.md | 19 ++++++-- data/README.md | 4 +- docs/business_metrics.md | 95 +++++++++++++++++++++++++++++++++++++ docs/data_dictionary.md | 2 +- pyproject.toml | 20 ++++++++ requirements.lock | Bin 0 -> 2232 bytes requirements.txt | 5 ++ scripts/db_utils.py | 38 +++++++++++---- scripts/load_raw_duckdb.py | 60 ++++++++++++----------- 12 files changed, 248 insertions(+), 45 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 docs/business_metrics.md create mode 100644 requirements.lock diff --git a/.env.example b/.env.example index e797c39..e31b52a 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,8 @@ ENVIRONMENT=local -# DuckDB (MVP) -DUCKDB_PATH=./data/warehouse/ecommerce.duckdb +# Warehouse +WAREHOUSE_PATH=data/warehouse/ecommerce.duckdb +WAREHOUSE_TEST_PATH=data/warehouse/ecommerce_test.duckdb # API API_HOST=0.0.0.0 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..32e6b02 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,42 @@ +name: ci + +on: + pull_request: + push: + branches: + - main + +jobs: + validate: + runs-on: ubuntu-latest + env: + WAREHOUSE_PATH: data/warehouse/ecommerce.duckdb + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + cache-dependency-path: requirements.lock + + - name: Install dependencies + run: pip install -r requirements.lock + + - name: Download raw dataset + run: python scripts/download_dataset.py + + - name: Ingest raw data into DuckDB + run: python scripts/ingest_raw.py + + - name: Run pytest + run: pytest -q + + - name: Install dbt packages + 
run: dbt deps --project-dir ./dbt --profiles-dir ./dbt + + - name: Run dbt build + run: dbt build --project-dir ./dbt --profiles-dir ./dbt diff --git a/.gitignore b/.gitignore index 09e4b94..a43ebd7 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,6 @@ dbt/.user.yml # OS .DS_Store + +# Personal planning notes (not part of project deliverables) +docs/cv_phase2_plan_upgrade.md diff --git a/README.md b/README.md index e6ecef6..76e7ee4 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Production-like, end-to-end **Data Engineering** project: ingest -> warehouse -> ## Architecture -**CSV dataset** -> **Ingestion (Python)** -> **DuckDB (`data/warehouse/ecommerce.duckdb`)** -> **dbt models** -> **FastAPI endpoints** +**CSV dataset** -> **Ingestion (Python)** -> **DuckDB (`WAREHOUSE_PATH`, default: `data/warehouse/ecommerce.duckdb`)** -> **dbt models** -> **FastAPI endpoints** Quality gates: - `pytest` (pipeline/unit checks) @@ -75,7 +75,13 @@ source .venv/bin/activate # Windows (PowerShell) .venv\Scripts\Activate.ps1 -pip install -r requirements.txt +pip install -r requirements.lock + +# Optional: override warehouse path (default already works) +# Linux/Mac +export WAREHOUSE_PATH=data/warehouse/ecommerce.duckdb +# Windows (PowerShell) +$env:WAREHOUSE_PATH="data/warehouse/ecommerce.duckdb" ``` ### Run Phase 1 (download -> ingest -> validate) @@ -92,7 +98,7 @@ pytest -q ``` Expected: -- `data/warehouse/ecommerce.duckdb` created +- warehouse file created at `WAREHOUSE_PATH` (default: `data/warehouse/ecommerce.duckdb`) - 9 tables under the `raw` schema - tests pass @@ -137,7 +143,11 @@ Important: ### DuckDB CLI ```bash -duckdb data/warehouse/ecommerce.duckdb +# Linux/Mac +duckdb "${WAREHOUSE_PATH:-data/warehouse/ecommerce.duckdb}" + +# Windows (PowerShell) +duckdb $env:WAREHOUSE_PATH ``` ```sql @@ -197,6 +207,7 @@ dataops-ecommerce-platform/ - **Data directory**: `data/README.md` (local layout + verification) - **Data dictionary**: `docs/data_dictionary.md` (raw 
schema field-level docs) +- **Business metrics**: `docs/business_metrics.md` (GMV, AOV, cancel_rate, late_delivery_rate) - **dbt docs (local)**: run `dbt docs generate` and `dbt docs serve` inside `dbt/` --- diff --git a/data/README.md b/data/README.md index 04b4369..b8187d3 100644 --- a/data/README.md +++ b/data/README.md @@ -5,7 +5,7 @@ This directory documents the project's local data layout. ## Structure - `data/raw/`: source CSV files downloaded from Kaggle (excluded from Git via `.gitignore`) -- `data/warehouse/ecommerce.duckdb`: local DuckDB analytical warehouse file (excluded from Git via `.gitignore`) +- `WAREHOUSE_PATH` (default: `data/warehouse/ecommerce.duckdb`): local DuckDB analytical warehouse file (excluded from Git via `.gitignore`) - Raw files are not tracked because they are: - Large (1.5M+ rows combined) - Reproducible (can be re-downloaded) @@ -20,7 +20,7 @@ This directory documents the project's local data layout. ## Local verification (recommended) -After ingestion, validate the date range directly from the warehouse (`data/warehouse/ecommerce.duckdb`): +After ingestion, validate the date range directly from the warehouse file configured by `WAREHOUSE_PATH` (default: `data/warehouse/ecommerce.duckdb`): ```sql SELECT diff --git a/docs/business_metrics.md b/docs/business_metrics.md new file mode 100644 index 0000000..55acefc --- /dev/null +++ b/docs/business_metrics.md @@ -0,0 +1,95 @@ +# Business Metrics Definitions + +This document defines the core business metrics used in the project for consistent KPI tracking. + +## Scope + +- Primary fact tables: `marts.fact_orders`, `marts.fact_order_items` +- Default time grain: `order_purchase_date` (daily) +- Currency: BRL + +## Metrics + +### GMV + +- Definition: Gross Merchandise Value of sold items. +- Formula: `SUM(item_total)` +- Grain: Daily (`order_purchase_date`), aggregable to week/month. +- Filters: + - Include only orders with `is_canceled = false`. 
+ - Recommended for "realized GMV": also filter `is_delivered = true`. + +SQL reference: + +```sql +select + fo.order_purchase_date, + sum(foi.item_total) as gmv +from marts.fact_order_items foi +join marts.fact_orders fo on fo.order_id = foi.order_id +where fo.is_canceled = false +group by 1; +``` + +### AOV + +- Definition: Average Order Value. +- Formula: `GMV / COUNT(DISTINCT order_id)` +- Grain: Daily (`order_purchase_date`), aggregable to week/month. +- Filters: + - Same population as GMV (non-canceled orders only). + +SQL reference: + +```sql +select + fo.order_purchase_date, + sum(foi.item_total) / nullif(count(distinct fo.order_id), 0) as aov +from marts.fact_order_items foi +join marts.fact_orders fo on fo.order_id = foi.order_id +where fo.is_canceled = false +group by 1; +``` + +### cancel_rate + +- Definition: Share of canceled orders over total orders. +- Formula: `COUNT_IF(is_canceled) / COUNT(order_id)` +- Grain: Daily (`order_purchase_date`), aggregable to week/month. +- Filters: + - Include all orders in denominator. + +SQL reference: + +```sql +select + order_purchase_date, + avg(case when is_canceled then 1.0 else 0.0 end) as cancel_rate +from marts.fact_orders +group by 1; +``` + +### late_delivery_rate + +- Definition: Share of delivered orders that arrived after the estimated delivery date. +- Formula: `COUNT_IF(is_late_delivery AND is_delivered) / COUNT_IF(is_delivered)` +- Grain: Daily (`order_purchase_date`), aggregable to week/month. +- Filters: + - Denominator should include delivered orders only. + - Exclude canceled/non-delivered orders from denominator. + +SQL reference: + +```sql +select + order_purchase_date, + sum(case when is_late_delivery and is_delivered then 1 else 0 end) * 1.0 + / nullif(sum(case when is_delivered then 1 else 0 end), 0) as late_delivery_rate +from marts.fact_orders +group by 1; +``` + +## Notes + +- Keep metric filters identical across dashboards and API endpoints. 
+- For monthly reporting, roll up from the daily grain by summing the additive components (e.g. GMV, order counts) and recomputing each ratio, rather than averaging daily ratios or mixing granularities. diff --git a/docs/data_dictionary.md b/docs/data_dictionary.md index 2a4e97c..dc0f5c3 100644 --- a/docs/data_dictionary.md +++ b/docs/data_dictionary.md @@ -1,6 +1,6 @@ # Data Dictionary — Raw Layer (`raw` schema) -Field-level documentation for the **raw ingestion layer** stored in `warehouse/ecommerce.duckdb` under the `raw` schema. +Field-level documentation for the **raw ingestion layer** stored in the DuckDB warehouse configured by `WAREHOUSE_PATH` (default: `data/warehouse/ecommerce.duckdb`) under the `raw` schema. ## Scope diff --git a/pyproject.toml b/pyproject.toml index e69de29..beb1f0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "dataops-ecommerce-platform" +version = "0.2.0" +description = "DataOps e-commerce ELT pipeline with DuckDB + dbt." 
+readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "duckdb>=1.4,<2.0", + "requests>=2.32,<3.0", + "pytest>=9.0,<10.0", + "dbt-core>=1.8,<1.9", + "dbt-duckdb>=1.8,<1.9", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 0000000000000000000000000000000000000000..410337a7188a601b295e0751a7bb951747725ee5 GIT binary patch literal 2232 zcmZvdOK%!c42AEyQhy3n81m|(i*Bn#iqutCQDA@&9-%X!wE6LE&*y`4L5)Vkps{^y zAD`>_`{$v|%R||fY5AgSSU#1{x`*Xs`Jne3UC*-Gqb&pd|6R87?!*5~zE`@>4*omY zyMwJM7qXncS;s;dD{>W)jouH-^_z7fOQ+~XndzpMMyO4>3_ozFXi^@9LWZe)2T@&D z)_S*6)N5c6J5m&xGYadZECZLE*xlnq{jGF&!T^V8x+3~Aa<+Oq)0O;SX%3cgRPj*o z+3G(%n3-hK%(Uv9ddard^{mPhZz!~6@Yt^tTKZ7))k)5_?4u(hj=fGRo0eb7OK%fB z=@MW1gV)pA;1mWn@`SB{e)&&Nvp_hWIxswj11WmIP0^BX9N^D_yysFO|>2R!<*?%}td#FZlOBU_yz`CRXf z;^$Jrvo5O~KEN1-YHk(X$+B1O@A^LzPV!CX_M(qoIrh;}C#>|LCeGhFcwsV}IC^0} zya<&u^rAPZw&h8s82J`N~&-|q(G6x$G4W(}CJn``EN#!s(Gzl6C^TxoBS>>d$(v?rTCkA7C z<(O-qVyC0yOt23#V|jV^SY6EYI%cs|Bz5F@%^IBL;$5G6v1tC&`#dt+`%Z}S=m$*C z!+RPfns2f)w_d+sV5!r!^-(!CVmevc%XguEGiSzdy1}uwPGkE6KkO7UWKHBwx3}4^ zb0Gsva#xmi`Z72-UlbU0FLy%zQNEVjauas*pQi!6a87t`;=HCN=y!DV-NWOj=44hX zZ==wJx$UIkR^1NtbU#Mn*Zue6$oh2$^}E^&eHCZG>Z2Fp=)~wugiSwC6>LBB&ZK!J zCqCo;ep}z(@Tt<=LWyp@ae|Ix?hl#Zgo?cfiScQoCiesDR)F_(-D|F7FJ@P|7Si}A Vs`Lp;{$FL;>H?en+r+Q2{{i*SNXGyG literal 0 HcmV?d00001 diff --git a/requirements.txt b/requirements.txt index e69de29..7940cfa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,5 @@ +duckdb==1.4.4 +requests==2.32.5 +pytest==9.0.2 +dbt-core==1.8.7 +dbt-duckdb==1.8.2 diff --git a/scripts/db_utils.py b/scripts/db_utils.py index c12a575..38bdca7 100644 --- a/scripts/db_utils.py +++ b/scripts/db_utils.py @@ -1,23 +1,43 @@ +from contextlib import contextmanager +import os from pathlib import Path from typing import Generator -from contextlib import contextmanager import duckdb -DB_PATH = Path("warehouse") / "ecommerce.duckdb" +PROJECT_ROOT = 
Path(__file__).resolve().parent.parent +DEFAULT_WAREHOUSE_PATH = Path("data") / "warehouse" / "ecommerce.duckdb" + + +def resolve_warehouse_path() -> Path: + """ + Resolve DuckDB warehouse path from environment variables. + + Priority: + 1. WAREHOUSE_PATH + 2. DUCKDB_PATH (legacy fallback) + 3. data/warehouse/ecommerce.duckdb + """ + configured = os.getenv("WAREHOUSE_PATH") or os.getenv("DUCKDB_PATH") + path = Path(configured) if configured else DEFAULT_WAREHOUSE_PATH + if not path.is_absolute(): + path = PROJECT_ROOT / path + return path -def init_warehouse() -> None: - """Asegura que exista el directorio del warehouse.""" - DB_PATH.parent.mkdir(parents=True, exist_ok=True) +def init_warehouse(db_path: Path | None = None) -> Path: + """Ensure warehouse directory exists and return resolved path.""" + resolved_path = db_path or resolve_warehouse_path() + resolved_path.parent.mkdir(parents=True, exist_ok=True) + return resolved_path @contextmanager def get_connection() -> Generator[duckdb.DuckDBPyConnection, None, None]: - """Context manager para DuckDB.""" - init_warehouse() # asegura carpeta antes de conectar - conn = duckdb.connect(str(DB_PATH)) + """Context manager for DuckDB connections.""" + db_path = init_warehouse() + conn = duckdb.connect(str(db_path)) try: yield conn finally: - conn.close() \ No newline at end of file + conn.close() diff --git a/scripts/load_raw_duckdb.py b/scripts/load_raw_duckdb.py index 19eb306..8539fe9 100644 --- a/scripts/load_raw_duckdb.py +++ b/scripts/load_raw_duckdb.py @@ -1,38 +1,44 @@ -import os -import duckdb +from pathlib import Path -DB_PATH = os.path.join("data", "warehouse", "ecommerce.duckdb") +from db_utils import get_connection, resolve_warehouse_path + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +RAW_DIR = PROJECT_ROOT / "data" / "raw" LOADS = [ - ("orders", "data/raw/olist_orders_dataset.csv"), - ("order_items", "data/raw/olist_order_items_dataset.csv"), - ("payments", 
"data/raw/olist_order_payments_dataset.csv"), - ("customers", "data/raw/olist_customers_dataset.csv"), - ("products", "data/raw/olist_products_dataset.csv"), - ("sellers", "data/raw/olist_sellers_dataset.csv"), - ("reviews", "data/raw/olist_order_reviews_dataset.csv"), - ("categories", "data/raw/product_category_name_translation.csv"), - ("geolocation", "data/raw/olist_geolocation_dataset.csv"), + ("orders", "olist_orders_dataset.csv"), + ("order_items", "olist_order_items_dataset.csv"), + ("payments", "olist_order_payments_dataset.csv"), + ("customers", "olist_customers_dataset.csv"), + ("products", "olist_products_dataset.csv"), + ("sellers", "olist_sellers_dataset.csv"), + ("reviews", "olist_order_reviews_dataset.csv"), + ("categories", "product_category_name_translation.csv"), + ("geolocation", "olist_geolocation_dataset.csv"), ] -def main(): - os.makedirs(os.path.dirname(DB_PATH), exist_ok=True) - con = duckdb.connect(DB_PATH) - con.execute("create schema if not exists raw;") - for table, csv_path in LOADS: - if not os.path.exists(csv_path): - raise FileNotFoundError(f"No existe: {csv_path}") +def main() -> None: + with get_connection() as conn: + conn.execute("create schema if not exists raw;") + + for table, filename in LOADS: + csv_path = RAW_DIR / filename + if not csv_path.exists(): + raise FileNotFoundError(f"No existe: {csv_path}") + + conn.execute( + f""" + create or replace table raw.{table} as + select * from read_csv_auto(?, header=true); + """, + [str(csv_path)], + ) + count = conn.execute(f"select count(*) from raw.{table}").fetchone()[0] + print(f"OK raw.{table}: {count:,} filas") - con.execute(f""" - create or replace table raw.{table} as - select * from read_csv_auto('{csv_path}', header=true); - """) - count = con.execute(f"select count(*) from raw.{table}").fetchone()[0] - print(f"OK raw.{table}: {count:,} filas") + print(f"Carga completada en: {resolve_warehouse_path()}") - con.close() - print("Carga completada.") if __name__ == "__main__": 
main() From f8e1480efcbcb9d37c7b52aee76e581cc7d3fd45 Mon Sep 17 00:00:00 2001 From: Luis Fuentes Date: Thu, 12 Mar 2026 17:17:36 -0500 Subject: [PATCH 2/2] fix(ci): align dbt profile name with project config --- .github/workflows/ci.yml | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 32e6b02..f79ebd6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,6 +11,7 @@ jobs: runs-on: ubuntu-latest env: WAREHOUSE_PATH: data/warehouse/ecommerce.duckdb + DBT_PROFILES_DIR: .ci_dbt_profiles steps: - name: Checkout repository @@ -35,8 +36,27 @@ jobs: - name: Run pytest run: pytest -q + - name: Create dbt profile for CI + run: | + mkdir -p "${DBT_PROFILES_DIR}" + cat > "${DBT_PROFILES_DIR}/profiles.yml" <<'YAML' + dataops_ecommerce: + target: dev + outputs: + dev: + type: duckdb + path: "{{ env_var('WAREHOUSE_PATH', 'data/warehouse/ecommerce.duckdb') }}" + schema: main + threads: 4 + extensions: + - parquet + - json + settings: + memory_limit: "4GB" + YAML + - name: Install dbt packages - run: dbt deps --project-dir ./dbt --profiles-dir ./dbt + run: dbt deps --project-dir ./dbt --profiles-dir "${DBT_PROFILES_DIR}" - name: Run dbt build - run: dbt build --project-dir ./dbt --profiles-dir ./dbt + run: dbt build --project-dir ./dbt --profiles-dir "${DBT_PROFILES_DIR}"