From 64eae57f0f5945197761f9011ffc7a45e78ad17a Mon Sep 17 00:00:00 2001 From: Tushar Choudhary <151359025+tusharchou@users.noreply.github.com> Date: Sun, 27 Jul 2025 15:57:27 +0530 Subject: [PATCH] feat(dx): Add Makefile and setup guide (#96) * feat(dx): Add Makefile and setup guide * feat: Stabilize build and enhance developer experience - Fixes dependency issues by correcting pyproject.toml and standardizing on a root requirements.txt for Read the Docs. - Adds a Makefile to streamline common development commands. - Introduces Dockerfiles for reproducible development and documentation hosting. - Implements a script to dynamically generate a list of open GitHub issues for the documentation. - Cleans up the project structure by removing obsolete Sphinx files, duplicate Makefiles, and misplaced source files. * Add docs improvements, recipes, test, and CI workflow * Remove site/ assets from version control and add to .gitignore * docs: add wiki to docs and update mkdocs navigation * Refactor: move package to root, update docs, fix navigation and developer tooling * cleaning * fix: update readthedocs.yml and docs improvements * deleted readthedocs.yml * added more docs * added more docs * added letter * feat(library): introduce github module and src layout * reset * issues * issues fix * homepage * business solutions * marketing data analysis * marketing data analysis fix * marketing data analysis final --- .github/workflows/append_pr_history.yml | 43 ++++ .github/workflows/ci.yml | 56 +++++ .github/workflows/mkdocs_link_check.yml | 37 +++ .gitignore | 56 +---- Makefile | 84 +++++++ README 2.md | 19 ++ README.md | 187 +++----------- docs/Makefile | 90 +++++-- docs/PR_HISTORY.md | 5 + docs/api.md | 5 + docs/closed_items.md | 25 ++ docs/conf.py | 63 ----- docs/contributing.md | 28 +++ docs/developer_tooling_update.md | 27 ++ docs/index.md | 30 +++ .../__init__.py => docs/pr_description.md | 0 docs/pr_reviews.md | 31 +++ docs/recipes.md | 27 ++ docs/requirements.txt | 58 ----- docs/scripts/generate_issue_list.py | 0 docs/user_issues.md | 230 ++++++++++++++++++ docs/wiki/ACTIVE_DOCS_URLS.md | 69 ++++++ docs/wiki/BRANCHES.md | 22 ++ docs/wiki/CONTENTS.md | 17 ++ docs/wiki/CONTRIBUTING.md | 5 + docs/wiki/DEVELOPMENT.md | 31 +++ docs/wiki/GITHUB.md | 64 +++++ docs/wiki/PROBLEM_STATEMENT.md | 18 ++ docs/wiki/PRODUCT_DEVELOPMENT.md | 28 +++ docs/wiki/PROJECT_OVERVIEW.md | 10 + docs/wiki/PROJECT_STRUCTURE.md | 32 +++ docs/wiki/RECIPES.md | 5 + docs/wiki/TECHNICAL_SPECIFICATIONS.md | 3 + docs/wiki/VISION.md | 29 +++ docs/wiki/business/MARKETING_DATA_ANALYSIS.md | 69 ++++++ docs/wiki/business/PROPOSAL.md | 31 +++ docs/wiki/library/BASE.md | 24 ++ docs/wiki/library/EXCEPTION.md | 164 +++++++++++++ docs/wiki/library/TEST.md | 156 ++++++++++++ docs/wiki/plan/ISSUES.md | 17 ++ docs/wiki/usecases/PHOTO_MANAGEMENT.md | 19 ++ docs/wiki/workflow/PHOTO_MANAGEMENT.md | 86 +++++++ how_to_setup.md | 70 ++++++ .../__init__.py | 0 .../catalog/__init__.py | 0 .../catalog/local/__init__.py | 0 .../catalog/local/iceberg/__init__.py | 0 .../engine/__init__.py | 0 .../etl.py | 0 .../exceptions.py | 0 .../format/__init__.py | 0 .../format/csv/__init__.py | 0 .../format/iceberg/__init__.py | 0 .../format/parquet/__init__.py | 0 .../hello_world.py | 0 .../issue/__init__.py | 0 .../logger.py | 0 .../pipeline/__init__.py | 0 .../pipeline/egression/__init__.py | 0 .../egression/csv_to_iceberg/__init__.py | 0 .../egression/iceberg_to_csv/__init__.py | 0 .../pipeline/ingestion/__init__.py | 0 
.../ingestion/bigquery_to_csv/__init__.py | 0 .../ingestion/csv_to_iceberg/__init__.py | 0 .../ingestion/parquet_to_iceberg/__init__.py | 0 .../pipeline/ingestion/pyarrow/__init__.py | 0 .../store/__init__.py | 0 .../store/source/__init__.py | 0 .../store/source/gcp/__init__.py | 0 .../store/source/gcp/bigquery/__init__.py | 0 .../store/source/json/__init__.py | 0 .../store/source/near/__init__.py | 0 .../store/source/parquet/__init__.py | 0 .../store/target/__init__.py | 0 .../store/target/iceberg/__init__.py | 0 .../tmp/warehouse/pyiceberg_catalog.db | Bin mkdocs.yml | 31 ++- poetry.lock | 35 ++- pyproject.toml | 24 +- scripts/__init__.py | 0 scripts/append_pr_history.py | 48 ++++ scripts/fetch_closed_items.py | 54 ++++ scripts/fetch_rtd_urls.py | 58 +++++ scripts/generate_issue_list.py | 71 ++++++ scripts/github_api.py | 0 src/local_data_platform/__init__.py | 0 src/local_data_platform/github/__init__.py | 101 ++++++++ src/tmp/warehouse/pyiceberg_catalog.db | Bin 0 -> 20480 bytes tests/__init__.py | 0 tests/test_github.py | 65 +++++ tests/test_json_source.py | 0 tests/test_placeholder.py | 3 + 92 files changed, 2209 insertions(+), 351 deletions(-) create mode 100644 .github/workflows/append_pr_history.yml create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/mkdocs_link_check.yml create mode 100644 Makefile create mode 100644 README 2.md create mode 100644 docs/PR_HISTORY.md create mode 100644 docs/api.md create mode 100644 docs/closed_items.md create mode 100644 docs/contributing.md create mode 100644 docs/developer_tooling_update.md create mode 100644 docs/index.md rename local-data-platform/local_data_platform/store/source/near/__init__.py => docs/pr_description.md (100%) create mode 100644 docs/pr_reviews.md create mode 100644 docs/recipes.md delete mode 100644 docs/requirements.txt create mode 100644 docs/scripts/generate_issue_list.py create mode 100644 docs/user_issues.md create mode 100644 docs/wiki/ACTIVE_DOCS_URLS.md create mode 100644 docs/wiki/BRANCHES.md create mode 100644 docs/wiki/CONTENTS.md create mode 100644 docs/wiki/CONTRIBUTING.md create mode 100644 docs/wiki/DEVELOPMENT.md create mode 100644 docs/wiki/GITHUB.md create mode 100644 docs/wiki/PROBLEM_STATEMENT.md create mode 100644 docs/wiki/PRODUCT_DEVELOPMENT.md create mode 100644 docs/wiki/PROJECT_OVERVIEW.md create mode 100644 docs/wiki/PROJECT_STRUCTURE.md create mode 100644 docs/wiki/RECIPES.md create mode 100644 docs/wiki/TECHNICAL_SPECIFICATIONS.md create mode 100644 docs/wiki/VISION.md create mode 100644 docs/wiki/business/MARKETING_DATA_ANALYSIS.md create mode 100644 docs/wiki/business/PROPOSAL.md create mode 100644 docs/wiki/library/BASE.md create mode 100644 docs/wiki/library/EXCEPTION.md create mode 100644 docs/wiki/library/TEST.md create mode 100644 docs/wiki/plan/ISSUES.md create mode 100644 docs/wiki/usecases/PHOTO_MANAGEMENT.md create mode 100644 docs/wiki/workflow/PHOTO_MANAGEMENT.md create mode 100644 how_to_setup.md rename {local-data-platform/local_data_platform => local_data_platform}/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/catalog/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/catalog/local/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/catalog/local/iceberg/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/engine/__init__.py (100%) rename {local-data-platform/local_data_platform => 
local_data_platform}/etl.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/exceptions.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/format/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/format/csv/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/format/iceberg/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/format/parquet/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/hello_world.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/issue/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/logger.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/pipeline/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/pipeline/egression/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/pipeline/egression/csv_to_iceberg/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/pipeline/egression/iceberg_to_csv/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/pipeline/ingestion/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/pipeline/ingestion/bigquery_to_csv/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/pipeline/ingestion/csv_to_iceberg/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/pipeline/ingestion/parquet_to_iceberg/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/pipeline/ingestion/pyarrow/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/store/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/store/source/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/store/source/gcp/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/store/source/gcp/bigquery/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/store/source/json/__init__.py (100%) create mode 100644 local_data_platform/store/source/near/__init__.py rename {local-data-platform/local_data_platform => local_data_platform}/store/source/parquet/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/store/target/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/store/target/iceberg/__init__.py (100%) rename {local-data-platform/local_data_platform => local_data_platform}/tmp/warehouse/pyiceberg_catalog.db (100%) create mode 100644 scripts/__init__.py create mode 100644 scripts/append_pr_history.py create mode 100644 scripts/fetch_closed_items.py create mode 100644 scripts/fetch_rtd_urls.py create mode 100644 scripts/generate_issue_list.py create mode 100644 scripts/github_api.py create mode 100644 src/local_data_platform/__init__.py create mode 100644 src/local_data_platform/github/__init__.py create mode 100644 src/tmp/warehouse/pyiceberg_catalog.db create mode 100644 tests/__init__.py create mode 100644 tests/test_github.py create mode 100644 tests/test_json_source.py create mode 100644 tests/test_placeholder.py diff 
--git a/.github/workflows/append_pr_history.yml b/.github/workflows/append_pr_history.yml new file mode 100644 index 0000000..dda7a64 --- /dev/null +++ b/.github/workflows/append_pr_history.yml @@ -0,0 +1,43 @@ +# This GitHub Actions workflow appends PR history to docs/PR_HISTORY.md after a PR is merged into main or a release branch. + +name: Append PR History + +on: + pull_request: + types: [closed] + branches: + - main + - 'release/**' + +jobs: + append-pr-history: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + permissions: + contents: write # Required to push a commit + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Ensure full git history for diff + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Append PR history + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + PR_TITLE: ${{ github.event.pull_request.title }} + PR_MERGER: ${{ github.event.pull_request.merged_by.login }} + PR_DESCRIPTION: ${{ github.event.pull_request.body }} + run: python scripts/append_pr_history.py + + - name: Commit and push PR_HISTORY.md + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add docs/PR_HISTORY.md + git commit -m "docs: update PR history for PR #${{ github.event.pull_request.number }}" || echo "No changes to commit" + git push \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..089cccf --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,56 @@ +name: CI - Lint, Test, and Docs + +on: + pull_request: + branches: [ main, fix-readthedocs, docs-sidebar-recipes-from-fix-readthedocs ] + push: + branches: [ main, fix-readthedocs, docs-sidebar-recipes-from-fix-readthedocs ] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install poetry + poetry install --with dev,docs + - name: Lint with flake8 + run: poetry run flake8 src/ tests/ + + test: + runs-on: ubuntu-latest + needs: lint + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install poetry + poetry install --with dev,docs + - name: Run tests + run: poetry run pytest tests/ + + docs: + runs-on: ubuntu-latest + needs: test + steps: + - uses: actions/checkout@v4 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install poetry + poetry install --with docs + - name: Build documentation + run: poetry run mkdocs build --strict + +# Only allow merge if all jobs succeed (enforced by branch protection rules in GitHub settings) diff --git a/.github/workflows/mkdocs_link_check.yml b/.github/workflows/mkdocs_link_check.yml new file mode 100644 index 0000000..367dbe0 --- /dev/null +++ b/.github/workflows/mkdocs_link_check.yml @@ -0,0 +1,37 @@ +# This workflow checks for broken links in the built MkDocs site using mkdocs-htmlproofer-plugin. +# It runs on every push and pull request to main and release branches. 
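The `Append PR History` workflow earlier in this patch sets the `PR_NUMBER`, `PR_TITLE`, `PR_MERGER`, and `PR_DESCRIPTION` environment variables and then runs `scripts/append_pr_history.py`, which the patch adds but whose body is not shown in this excerpt. A minimal sketch of the kind of script that step assumes: it reads those variables and appends an entry to `docs/PR_HISTORY.md`. The entry layout below is an assumption, not the actual script.

```python
# Sketch only: the real scripts/append_pr_history.py added in this patch may differ.
import os
from datetime import date
from pathlib import Path

HISTORY_FILE = Path("docs/PR_HISTORY.md")


def append_pr_entry() -> None:
    """Append one merged-PR entry using the env vars set by the workflow."""
    number = os.environ["PR_NUMBER"]
    title = os.environ["PR_TITLE"]
    merger = os.environ.get("PR_MERGER", "unknown")
    description = os.environ.get("PR_DESCRIPTION", "").strip()

    entry = (
        f"\n## PR #{number}: {title}\n\n"
        f"*Merged by @{merger} on {date.today().isoformat()}*\n\n"
        f"{description}\n"
    )
    with HISTORY_FILE.open("a", encoding="utf-8") as fh:
        fh.write(entry)


if __name__ == "__main__":
    append_pr_entry()
```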
+ +name: MkDocs Broken Link Check + +on: + push: + branches: + - main + - 'release/**' + pull_request: + branches: + - main + - 'release/**' + +jobs: + link-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + pip install -r requirements.txt || true + pip install mkdocs-htmlproofer-plugin + + - name: Build docs and check links + run: | + mkdocs build diff --git a/.gitignore b/.gitignore index 6c44f1e..59814a2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,50 +1,8 @@ -# Virtual Environment -.venv/ - -# Python cache -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -build/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# MkDocs -site/ +.idea/local-data-platform.iml +.idea/misc.xml +.idea/modules.xml +.idea/vcs.xml +.idea/workspace.xml +**/__pycache__/ docs/_build/ - -# Pytest -.pytest_cache/ -.mypy_cache/ -.coverage - -# OS / Editor specific -.DS_Store -.idea/ -*.swp - -# Temporary files -*.bak - -# Temporary directories -src/tmp/ -src/local_data_platform/tmp/ - -# Old config files -mkdocs.yaml \ No newline at end of file +site/ \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7741bba --- /dev/null +++ b/Makefile @@ -0,0 +1,84 @@ +# Makefile for the local-data-platform project + +.PHONY: help all install reinstall lint test docs serve-docs clean generate-docs verify-pymdownx + +# Default target to show help. +help: + @echo "Usage: make [target]" + @echo "" + @echo "Targets:" + @echo " all Run all quality checks (lint and test)." + @echo " help Show this help message." + @echo " install Install project dependencies using Poetry." + @echo " reinstall Force a clean re-installation of all dependencies." + @echo " lint Run flake8 linter on the project." + @echo " test Run pytest tests." + @echo " generate-docs Generate dynamic documentation content (e.g., issue lists)." + @echo " docs Build the MkDocs documentation." + @echo " serve-docs Build and serve the documentation locally on http://localhost:8000." + @echo " clean Remove temporary build files and caches." + @echo " verify-pymdownx Verify a specific documentation dependency." + +POETRY_RUN := poetry run + +# ============================================================================== +# Development Setup +# ============================================================================== + +install: + @echo "--> Installing dependencies with Poetry..." + poetry install --with dev,docs + +reinstall: + @echo "--> Removing existing virtual environment to ensure a clean state..." + @poetry env remove $$(poetry env info --path) || echo "No virtualenv found to remove, continuing..." + @echo "--> Reinstalling all dependencies from scratch..." + @$(MAKE) install + +# ============================================================================== +# Quality & Testing +# ============================================================================== + +all: lint test + @echo "--> All quality checks passed successfully." + +lint: + @echo "--> Linting with flake8..." + $(POETRY_RUN) flake8 src/ tests/ + +test: + @echo "--> Running tests with pytest..." 
+ $(POETRY_RUN) pytest tests/ + +# ============================================================================== +# Documentation +# ============================================================================== + +generate-docs: + @echo "--> Generating dynamic documentation content..." + $(POETRY_RUN) python3 docs/scripts/generate_issue_list.py + +docs: + @$(MAKE) generate-docs + @echo "--> Building documentation..." + $(POETRY_RUN) mkdocs build --strict + +serve-docs: + @$(MAKE) generate-docs + @echo "--> Serving documentation..." + $(POETRY_RUN) mkdocs serve + +# ============================================================================== +# Cleaning +# ============================================================================== + +clean: + @echo "--> Cleaning up build artifacts and caches..." + @find . -type f -name "*.py[co]" -delete + @find . -type d -name "__pycache__" -exec rm -rf {} + + @rm -rf .pytest_cache .mypy_cache build dist *.egg-info site + +# Verify if pymdownx.toc is importable within the Poetry environment +verify-pymdownx: + @echo "--> Verifying pymdownx.toc installation..." + $(POETRY_RUN) python3 -c "import pymdownx.toc" || echo "pymdownx.toc not found. Please run 'make install'." \ No newline at end of file diff --git a/README 2.md b/README 2.md new file mode 100644 index 0000000..a1e988f --- /dev/null +++ b/README 2.md @@ -0,0 +1,19 @@ +# Project Name + +A short paragraph describing what this app does and who it's for. Replace this text with a clear, one- or two-sentence summary. + +## Features +- Briefly list core features of the app +- Example: "Track tasks and reminders" +- Example: "Syncs with iCloud" + +## Requirements +- iOS/iPadOS/macOS/watchOS/visionOS: Replace with supported OS and minimum version +- Xcode 15 or later (update as appropriate) +- Swift 5.9 or later (update as appropriate) + +## Getting Started +1. Clone the repository: + ```bash + git clone .git + cd diff --git a/README.md b/README.md index cbadcd2..6db1df9 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,13 @@ -Dear User # Local Data Platform -### Explain this to me like I am five -Imagine you have a toy box where you keep all your favorite toys. -A local data platform is like that toy box, but for storing and -organizing important information instead of toys. -Just like how your toy box, -**a local data platform keeps all your data** -(like pictures, documents, and other info) **in one place -so you can easily find, use and manage it.** -It's really handy for keeping everything organized and in one spot! 🌟📦 +**local-data-platform** is a Python library to build, test, and run a complete data platform on your local machine. The core idea is to provide a "toy box for data"—a local environment where you can manage the entire data lifecycle, from ingestion to reporting, before needing to scale up to a cloud environment. -Got it? What else are you curious about? +This approach allows developers and businesses to save on cloud infrastructure costs during the initial development and testing phases, with a clear path for future scaling. > **Vision:** Local Data Platform is used as a python library to learn > and operate data lake house locally.
> **Mission:** Develop a python package which provides solutions for all stages -> of data organisation, ranging from ingestion to reporting. -> The goal is that one can build data pipeline locally, test and -> easily scale up to cloud.
->
-> **By 2025,** local-data-platform is a python package that uses open source -> tools to orchestrate a data platform operation, locally, for development -> and testing.
+> of data organisation, ranging from ingestion to reporting. The goal is that one can build data pipelines locally, test them, and easily scale up to the cloud. ## Problem Statement @@ -41,127 +26,38 @@ This will help you understand how to read the repository. \ ## Directory Structure -### local-data-platform/ `repository` -- **.github/** `hidden folder` - - ISSUE-TEMPLATE/ `samples` - - bug_report.md `Report bugs here` - - custom.md `Report ad hoc issues here` - - feature_request.md `Request a new feature here` - - pull_request_template.md `Raise a pull request on the repo` - -- **docs/** `Documentation for Read the Docs` - -- **local-data-platform** `package` - - local_data_platform `library` - - hello_world.py `module` - - hello_world `function` - - prints 'Hello, world!' `output` - -- **samples/** `tutorials` - - bigQueryTutorial.py `Demo bigQuery compatibility here` -- .gitignore `Mention files to ignore in your PR` -- .readthedocs.yaml `Configuration for Read the Docs` -- LICENSE `for legal purposes` -- lumache.py `Template used in Sphinx projects for Read the Docs` -- pyproject.toml `template configuration` -- README.md `How to understand the repo` -- README.rst `Configuration for Read the Docs` +The project follows a standard `src` layout for Python packages. Key directories include: +- **src/local_data_platform/**: The main source code for the library. +- **docs/**: MkDocs documentation sources. +- **tests/**: The Pytest test suite. ## How to test Pre-release as a User -1. Check the directory structure \ -`ls` -2. Change directory to local-data-platform -`cd local-data-platform` -2. Install the dependencies listed in your pyproject.toml file \ -`$poetry install` -2. Execute your test suite to ensure everything is working as expected \ -`poetry run pytest` -3. 
Run hello world command \ -`poetry run python hello_world.py` - - - -## Package structure -- **local-data-platform** `package` - - **dist** `Package distribution files` - - **docs** `Documentation` - - **local_data_platform** `library` - - **catalog** `Catalog your data` - - **local** `Catalog your data locally` - - **iceberg** `Catalog your data in iceberg SQL lite db` - - export.py `Export your catalog data to csv` - - **cloud** `Interact with cloud service providers` - - **gcp** `Interact with Google Cloud Platform` - - **login** `Login to GCP to get API credentials` - - **engine** `Underlying processing Tech` - - **format** `Supported formats for storage` - - **csv** `Supports Google sheets and Excel sheets` - - **iceberg** `Supports Apache Iceberg` - - **parquet** `Supports Apache Parquet` - - **issue** `Github Issues` - - **pipeline** `Data Pipeline` - - **egression** `Downstream pipelines` - - **csv_to_iceberg** `Raw to Silver Layer` - - **iceberg_to_csv** `Silver Layer to Gold Layer` - - **ingestion** `Upstream pipelines` - - **bigquery_to_csv** `Source to Raw` - - **csv_to_iceberg** `Raw to Silver Layer` - - **paraquet_to_iceberg** `Raw to Silver Layer` - - **scraper** `HTML to CSV` - - **store** `Data store` - - **source** `Source data class` - - **gcp** `GCP Storage` - - **bigquery** `GCP service` - - **json** `Local JSON file` - - **near** `NEAR Data Lake` - - **parquet** `Local Parquet file` - - **target** `Target data class` - - **iceberg** `Local Data Lake house` - - etl.py `Sample pipeline` - - exceptions.py `Known limitations` - - hello_world.py `Test Feature` - - is_function.py `Query Library Functions` - - logger.py `Library logger` - - **real_world_use_cases** `User Test Cases` - - **near_data_lake** `NEAR Coin Transactions` - - **config** `Pipeline configurations` - - **sample_queries** `NEAR Data Lake Transaction Table` - - near_transaction.json `Query List` - - egression.json `Loading data in local data lake house` - - ingestion.json `Extracting data from NEAR data lake house` - - **data** `target path` - - **near_transactions.db** `Local data lake house` - - **transactions** `iceberg table` - - **data** `table records` - - **metadata** `iceberg table metadata` - - near_transactions_catalog.db `iceberg local data catalog` - - **reports** `Production analysis` - - get_data.py `Get insights` - - put_data.py `Refresh Gold Layer` - - near_transactions.csv `Output` - - **nyc_yello_taxi_dataset** `NYC Yello Taxis Rides` - - **config** `Pipeline configurations` - - egression.json `Loading data in local data lake house` - - egression_payments.json `Loading payments report in Gold Layer` - - ingestion.json `Extracting data from local parquet file` - - **data** `target path` - - **nyc_yello_taxi_dataset.db** `Local data lake house` - - **rides** `iceberg table` - - **data** `table records` - - **metadata** `iceberg table metadata` - - nyc_yellow_taxi_dataset_catalog.db `iceberg local data catalog` - - nyc_yellow_taxi_rides.csv `Ouput` - - **reports** `Production analysis` - - export_catalog.py `Saves local iceberg catalog in CSV` - - get_data.py `Create Gold Layer` - - get_report.py `Updates Gold Layer` - - put_data.py `Refreshes Gold Layer` - - monthly_reporting.md `Report in MD` - - **tests** `PyTest Unit testing` - - test_gcp_connection.py `Testing GCP Login` +1. **Clone the repository:** + ```bash + git clone https://github.com/tusharchou/local-data-platform.git + cd local-data-platform + ``` +2. 
**Install dependencies:** + This project uses Poetry for dependency management. Use the Makefile for convenience. + ```bash + make install + ``` +3. **Run the tests:** + ```bash + make test + ``` + +## Package Modules + +The library's main modules are located in `src/local_data_platform`. Key modules include: + +* **`store`**: Handles data storage and interaction with sources. +* **`pipeline`**: Provides tools for building ETL pipelines. +* **`catalog`**: Manages data cataloging with Apache Iceberg. +* **`cloud`**: Contains components for interacting with cloud services. ## Plan | Milestone | Epic | Target Date | Delivery Date | Comment | @@ -182,22 +78,17 @@ This will help you understand how to read the repository. \ ### Releases -- [x] 0.1.0 : Done- Published Library on [PyPI](https://pypi.org/project/local-data-platform/) - -- [ ] 0.1.1 : In Progress- [Demo BigQuery compatibility](https://github.com/tusharchou/local-data-platform/milestone/2) +#### Completed +- **v0.1.0**: Initial release on [PyPI](https://pypi.org/project/local-data-platform/). +- **v0.1.1**: Implemented data ingestion and improved documentation. -- [x] 0.1.1 : Done- [Documentation: Updated README to explain clearly problem and plan of excecution](https://github.com/tusharchou/local-data-platform/issues/6) +#### Upcoming -- [ ] 0.1.2 : To-do- [Warehousing: DuckDB, Iceberg, DBT](https://github.com/tusharchou/local-data-platform/milestone/5) -- [ ] 0.1.3 : To-do- [Orchestration](https://github.com/tusharchou/local-data-platform/milestone/6) -- [ ] 0.1.4 : To-do- [Self Serving Gold Layer](https://github.com/tusharchou/local-data-platform/milestone/11) -- [ ] 0.1.5 : To-do- [Monitoring](https://github.com/tusharchou/local-data-platform/milestone/10) -- [ ] 0.1.6 : To-do- [Business Intelligence Reporting Dashboard](https://github.com/tusharchou/local-data-platform/milestone/9) -- [ ] 0.1.7 : To-do- [Data Science Insights](https://github.com/tusharchou/local-data-platform/milestone/8) -- [ ] 0.1.8 : To-do- [LLM](https://github.com/tusharchou/local-data-platform/milestone/7) -- [ ] 0.1.9 : To-do- [Launch Documentation](https://github.com/tusharchou/local-data-platform/milestone/2) -- [ ] 0.2.0 : To-do- [Cloud Integration](https://github.com/tusharchou/local-data-platform/milestone/3) -- [ ] 1.0.0 : To-do- Product +- **v0.1.2**: Warehousing with DuckDB, Iceberg, and dbt. +- **v0.1.3**: Pipeline orchestration. +- **v0.1.9**: Full documentation launch. +- **v0.2.0**: Cloud integration features. +- **v1.0.0**: Production-ready release. ### References diff --git a/docs/Makefile b/docs/Makefile index d4bb2cb..c84ded3 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,20 +1,80 @@ -# Minimal makefile for Sphinx documentation -# +# Makefile for the local-data-platform project -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = _build +.PHONY: help all install reinstall lint test docs serve-docs clean generate-docs verify-pymdownx -# Put it first so that "make" without argument is like "make help". +# Default target to show help. help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @echo "Usage: make [target]" + @echo "" + @echo "Targets:" + @echo " all Run all quality checks (lint and test)." + @echo " help Show this help message." + @echo " install Install project dependencies using Poetry." + @echo " reinstall Force a clean re-installation of all dependencies." 
+ @echo " lint Run flake8 linter on the project." + @echo " test Run pytest tests." + @echo " generate-docs Generate dynamic documentation content (e.g., issue lists)." + @echo " docs Build the MkDocs documentation." + @echo " serve-docs Build and serve the documentation locally on http://localhost:8000." + @echo " clean Remove temporary build files and caches." + @echo " verify-pymdownx Verify a specific documentation dependency." -.PHONY: help Makefile +POETRY_RUN := poetry run -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +# ============================================================================== +# Development Setup +# ============================================================================== + +install: + @echo "--> Installing dependencies with Poetry..." + poetry install --with dev,docs + +reinstall: + @echo "--> Removing existing virtual environment to ensure a clean state..." + @poetry env remove $$(poetry env info --path) || echo "No virtualenv found to remove, continuing..." + @echo "--> Reinstalling all dependencies from scratch..." + @$(MAKE) install + +# ============================================================================== +# Quality & Testing +# ============================================================================== + +all: lint test + @echo "--> All quality checks passed successfully." + +lint: + @echo "--> Linting with flake8..." + $(POETRY_RUN) flake8 src/ tests/ + +test: + @echo "--> Running tests with pytest..." + $(POETRY_RUN) pytest tests/ + +# ============================================================================== +# Documentation +# ============================================================================== + +docs: + @$(MAKE) clean + @echo "--> Building documentation..." + $(POETRY_RUN) mkdocs build --strict + +serve-docs: + @$(MAKE) clean + @echo "--> Serving documentation..." + $(POETRY_RUN) mkdocs serve + +# ============================================================================== +# Cleaning +# ============================================================================== + +clean: + @echo "--> Cleaning up build artifacts and caches..." + @find . -type f -name "*.py[co]" -delete + @find . -type d -name "__pycache__" -exec rm -rf {} + + @rm -rf .pytest_cache .mypy_cache build dist *.egg-info site + +# Verify if pymdownx.toc is importable within the Poetry environment +verify-pymdownx: + @echo "--> Verifying pymdownx.toc installation..." + $(POETRY_RUN) python3 -c "import pymdownx.toc" || echo "pymdownx.toc not found. Please run 'make install'." \ No newline at end of file diff --git a/docs/PR_HISTORY.md b/docs/PR_HISTORY.md new file mode 100644 index 0000000..e5af010 --- /dev/null +++ b/docs/PR_HISTORY.md @@ -0,0 +1,5 @@ +# Pull Request History + +This page provides a chronological history of all pull requests merged into the main or release branches. It is automatically updated by a GitHub Action after each merge. + +--- \ No newline at end of file diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 0000000..7637dc3 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,5 @@ +# API Docs + +This section provides a detailed reference for the `local-data-platform` public API. 
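The `::: local_data_platform` directive that follows is an auto-documentation hook (mkdocstrings-style syntax) that expands into the package's docstring reference when the site is built. For orientation, here is a minimal usage sketch of one entry point that reference covers, `local_data_platform.hello_world.hello_world`; the README describes it as printing a greeting, and the no-argument call below is an assumption rather than confirmed API usage.

```python
# Minimal sketch; assumes the package is installed locally (e.g. via `make install`)
# and that hello_world() takes no arguments, as the README describes.
from local_data_platform.hello_world import hello_world

hello_world()  # expected output: "Hello, world!"
```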
+ +::: local_data_platform \ No newline at end of file diff --git a/docs/closed_items.md b/docs/closed_items.md new file mode 100644 index 0000000..a1b586d --- /dev/null +++ b/docs/closed_items.md @@ -0,0 +1,25 @@ +# Recently Closed Items + +This page lists the most recently closed Pull Requests and Issues. + +## Recently Closed Pull Requests + +- **[PR #95](https://github.com/tusharchou/local-data-platform/pull/95)**: 0.1.1 fix(deps): introducing mkdocs (by @tusharchou) +- **[PR #92](https://github.com/tusharchou/local-data-platform/pull/92)**: Add Recipes Page, Sidebar Navigation, and JSON Reading Test to Documentation (by @tusharchou) +- **[PR #2](https://github.com/tusharchou/local-data-platform/pull/2)**: 0.1.1 Iceberg Python Lake House: Testing pyiceberg 0.7.1 (by @tusharchou) +- **[PR #86](https://github.com/tusharchou/local-data-platform/pull/86)**: 0.1.1 Pytest Added for BigQuery Source (by @mrutunjay-kinagi) +- **[PR #25](https://github.com/tusharchou/local-data-platform/pull/25)**: 0.1.1 19 local_data_platform.source.near.bigquery.get(query) (by @mrutunjay-kinagi) +- **[PR #85](https://github.com/tusharchou/local-data-platform/pull/85)**: Release v1.1 bug fix (by @mrutunjay-kinagi) +- **[PR #73](https://github.com/tusharchou/local-data-platform/pull/73)**: Github actions setup for release v1.1 (by @mrutunjay-kinagi) +- **[PR #84](https://github.com/tusharchou/local-data-platform/pull/84)**: Release v1.1 fix (by @mrutunjay-kinagi) +- **[PR #81](https://github.com/tusharchou/local-data-platform/pull/81)**: Update pyproject.toml (by @redpheonixx) +- **[PR #80](https://github.com/tusharchou/local-data-platform/pull/80)**: Update publish.yml (by @redpheonixx) +- **[PR #79](https://github.com/tusharchou/local-data-platform/pull/79)**: Update publish.yml (by @redpheonixx) +- **[PR #4](https://github.com/tusharchou/local-data-platform/pull/4)**: 0.1.1 Real World Use case: Fact table (by @tusharchou) +- **[PR #78](https://github.com/tusharchou/local-data-platform/pull/78)**: Update publish.yml (by @redpheonixx) +- **[PR #7](https://github.com/tusharchou/local-data-platform/pull/7)**: 0.1.1 Draft README.md (by @tusharchou) +- **[PR #61](https://github.com/tusharchou/local-data-platform/pull/61)**: Create manual.yml (by @tusharchou) + +## Recently Closed Issues + +- **[Issue #1](https://github.com/tusharchou/local-data-platform/issues/1)**: 0.1.2 Testing pyiceberg 0.8.1 feature requests (by @tusharchou) \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 89d6367..e69de29 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,63 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- Project information ----------------------------------------------------- - -project = "My Personal Data Project" -copyright = "2024, Local Data Platform core team" -author = "Local Data Platform core team" - - -# -- General configuration --------------------------------------------------- -# -- General configuration - -extensions = [ - "sphinx.ext.duration", - "sphinx.ext.doctest", - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.intersphinx", -] - -intersphinx_mapping = { - "rtd": ("https://docs.readthedocs.io/en/stable/", None), - "python": ("https://docs.python.org/3/", None), - "sphinx": ("https://www.sphinx-doc.org/en/master/", None), -} -intersphinx_disabled_domains = ["std"] - -templates_path = ["_templates"] - -# -- Options for EPUB output -epub_show_urls = "footnote" - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000..b5c789e --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,28 @@ +# Contributing to Local Data Platform + +We're thrilled that you're interested in contributing to the Local Data Platform! Your help is essential for keeping it great. + +This section provides guidelines for contributing to the project. Please take a moment to review this document in order to make the contribution process easy and effective for everyone involved. + +## How to Get Started + +If you're new to the project, the best place to start is the `how_to_setup.md` guide located in the root of the repository. This will walk you through cloning the project and setting up your local development environment. + +Once you're set up, you can explore the other pages in this section to learn how to report issues or request features. + +## Submitting Pull Requests + +We follow a standard "fork and pull" model for contributions. To submit a change, please follow these steps: + +1. **Create a Fork**: Fork the repository to your own GitHub account. +2. **Create a Branch**: Create a new branch from `main` in your fork for your changes. Please use a descriptive branch name (e.g., `feat/add-new-ingestion-source` or `fix/docs-build-error`). +3. **Make Your Changes**: Make your changes, ensuring you follow the project's coding style. +4. **Run Quality Checks**: Before committing, run all the local quality checks to ensure your changes don't introduce any issues. + ```sh + make all + ``` +5. **Commit Your Changes**: Commit your changes with a clear and descriptive commit message. We follow the Conventional Commits specification. +6. **Push to Your Fork**: Push your branch to your fork on GitHub. +7. **Open a Pull Request**: From your fork on GitHub, open a pull request to the `main` branch of the `tusharchou/local-data-platform` repository. 
+ +Your PR will be reviewed by the maintainers, and once approved, it will be merged. Thank you for your contribution! \ No newline at end of file diff --git a/docs/developer_tooling_update.md b/docs/developer_tooling_update.md new file mode 100644 index 0000000..78ca898 --- /dev/null +++ b/docs/developer_tooling_update.md @@ -0,0 +1,27 @@ +# Update on Developer Experience Improvements + +**To:** Product Designer, Product Manager + +**From:** The Development Team + +**Subject:** Upcoming PR: `feat(dx): Add Makefile and setup guide #96` + +--- + +Hello Team, + +This is a quick update on an upcoming Pull Request that significantly improves our development workflow. This PR introduces a `Makefile` and a comprehensive `how_to_setup.md` guide. + +### For the Product Designer + +This change will make it much faster and more consistent for you to set up a local development environment. If you ever need to run the project locally to test a new design or component, the process will be simplified to just a couple of straightforward commands, reducing friction and getting you up and running in minutes. + +### For the Product Manager + +This initiative is focused on improving our overall Developer Experience (DX). A better DX directly translates to faster onboarding for new team members and more efficient development cycles for the entire team. By standardizing our setup and common commands, we reduce time spent on environment-related issues and can focus more on delivering features, ultimately leading to quicker and more predictable progress. + +### Next Steps + +No action is required from you on this PR. We wanted to keep you informed about this foundational improvement that will help streamline our development process for everyone involved. + +Please feel free to reach out if you have any questions! \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..1910660 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,30 @@ +# Local Data Platform + +A modern, modular, and developer-friendly platform for local data engineering, analytics, and experimentation. + +> Want to contribute? Check out the [**Contributing Guide**](contributing.md)! + +[Explore Recipes & Examples](recipes.md){ .md-button .md-button--primary } + +[View Open Issues](user_issues.md){ .md-button } + +--- + +## 🏆 Top Issues to Contribute On + + + + +| Title | Theme | Status | Comments | Votes | +|-------|-------|--------|----------|-------| +| Example: Add BigQuery Ingestion | Ingestion | In Progress | 5 | 12 | +| Example: Improve Error Handling | Core | Under Review | 3 | 8 | +| Example: Add Parquet Export | Egression | Planned | 2 | 6 | + +--- + +## 📋 Top PRs to Review + +For project managers and senior contributors, this section highlights key pull requests that are ready for review. + +[Review Open Pull Requests](pr_reviews.md) diff --git a/local-data-platform/local_data_platform/store/source/near/__init__.py b/docs/pr_description.md similarity index 100% rename from local-data-platform/local_data_platform/store/source/near/__init__.py rename to docs/pr_description.md diff --git a/docs/pr_reviews.md b/docs/pr_reviews.md new file mode 100644 index 0000000..3084852 --- /dev/null +++ b/docs/pr_reviews.md @@ -0,0 +1,31 @@ +# Reviewing Pull Requests + +This page provides an overview of significant pull requests that are ready for review. It helps project managers and senior developers track progress and ensure quality. 
+ +--- + +## Example PR: Documentation and Testing Improvements + +This is an example of a pull request description to guide reviews. + +### Summary +This pull request enhances the documentation and testing for the `local-data-platform` project. The main improvements include: + +- Adding a `recipes.md` page to the documentation, featuring practical usage examples such as reading a JSON file and building a JSON-to-Parquet pipeline. +- Ensuring the recipes page appears in the sidebar/main navigation for easier access. +- Updating Sphinx and Markdown documentation structure for improved navigation and clarity. +- Adding a test (`tests/test_json_source.py`) to verify that the `JsonSource` class can read a JSON file as described in the documentation. +- Maintaining compatibility for documentation builds both locally and on Read the Docs. + +### How to Test +- Build the documentation locally: + ```sh + cd docs + make html + ``` + Verify that the "Recipes" page appears in the sidebar and renders correctly. + +- Run the test suite to ensure the new test passes: + ```sh + pytest tests/test_json_source.py + ``` \ No newline at end of file diff --git a/docs/recipes.md b/docs/recipes.md new file mode 100644 index 0000000..a80616c --- /dev/null +++ b/docs/recipes.md @@ -0,0 +1,27 @@ +# Recipes + +This section contains practical examples and step-by-step guides for using the Local Data Platform. + +--- + +## Reading a Local JSON File + +This recipe demonstrates how to use a `JsonSource` to read a local JSON file into a data structure. + +### Prerequisites + +- A local JSON file named `data.json` in your project directory. + +### Code Example + +```python +from local_data_platform.store import JsonSource + +def read_local_json(file_path: str): + json_source = JsonSource(path=file_path) + data = json_source.read() + print("Successfully read data:", data) + +if __name__ == "__main__": + read_local_json("data.json") +``` \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 10835d2..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,58 +0,0 @@ -# -# This file is autogenerated by pip-compile with python 3.10 -# To update, run: -# -# pip-compile docs/requirements.in -# -alabaster==0.7.12 - # via sphinx -babel==2.10.3 - # via sphinx -certifi==2024.7.4 - # via requests -charset-normalizer==2.1.0 - # via requests -docutils==0.17.1 - # via - # sphinx - # sphinx-rtd-theme -idna==3.7 - # via requests -imagesize==1.4.1 - # via sphinx -jinja2==3.1.4 - # via sphinx -markupsafe==2.1.1 - # via jinja2 -packaging==21.3 - # via sphinx -pygments==2.12.0 - # via sphinx -pyparsing==3.0.9 - # via packaging -pytz==2022.1 - # via babel -requests==2.32.2 - # via sphinx -snowballstemmer==2.2.0 - # via sphinx -sphinx==5.0.2 - # via - # -r requirements.in - # sphinx-rtd-theme -sphinx-rtd-theme==1.0.0 - # via -r requirements.in -sphinxcontrib-applehelp==1.0.2 - # via sphinx -sphinxcontrib-devhelp==1.0.2 - # via sphinx -sphinxcontrib-htmlhelp==2.0.0 - # via sphinx -sphinxcontrib-jsmath==1.0.1 - # via sphinx -sphinxcontrib-qthelp==1.0.3 - # via sphinx -sphinxcontrib-serializinghtml==1.1.5 - # via sphinx -urllib3==1.26.19 - # via requests diff --git a/docs/scripts/generate_issue_list.py b/docs/scripts/generate_issue_list.py new file mode 100644 index 0000000..e69de29 diff --git a/docs/user_issues.md b/docs/user_issues.md new file mode 100644 index 0000000..aa742fd --- /dev/null +++ b/docs/user_issues.md @@ -0,0 +1,230 @@ +# Open Issues + +This page lists all open 
issues in the repository. Use the filters below to sort by status or theme. + + +--- +tags: + - Status - Planned + - Theme - General +--- +### #94 - design of readthedocs. +*Status: Planned | Theme: General* + +> Great job on getting the docs started. I have a couple of comments on the design of readthedocs. - Rename API Reference to API Docs - Rename User Guid... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #89 - Event driven User behaviour Analysis +*Status: Planned | Theme: General* + +> **Is your feature request related to a problem? Please describe.** User behaviour analysis can be a difficult problem to solve as the application conv... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #88 - Optimistic Concurrency Iceberg +*Status: Planned | Theme: General* + +> https://github.com/apache/iceberg-python/issues/819 @redpheonixx + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #75 - A python function to pull data from Snowflake +*Status: Planned | Theme: General* + +> Extension Request. Add Snowflake to LDP. Create a python function to fetch data from Snowflake. **Describe the solution you'd like** A clear and conci... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #59 - 0.1.8 Query this catalog through prompt +*Status: Planned | Theme: General* + +> # Feature Request: Catalog Search ## 4W1Hs **Who**: any business with listings need to expose search for user to quickly find the vendor **What**: a e... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #57 - 0.1.2 Warehousing: add duckdb, dbt and iceberg packages +*Status: Planned | Theme: General* + +> **Is your feature request related to a problem? Please describe.** No, its a separate release feature 0.1.2 warehousing which requires duckdb, dbt and... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #48 - 0.1.2 Warehousing: Excel to csv support +*Status: Planned | Theme: General* + +> Hi team Thanks for fixing the last issue raised #31! I am a beginner with Python and working with restaurant data. The data is extracted as an excel f... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #47 - 0.1.2 Reoccurring Customer Churn Analysis +*Status: Planned | Theme: General* + +> # Reporting ## Churn Analyisis This often involves identifying the percentage of customers who have stopped using a product or service over a certain ... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #45 - 0.1.4 DBT Transformation layer +*Status: Planned | Theme: General* + +> - [ ] Set up environment with dbt, DuckDB installed preferably a docker compose file - [ ] Configure dbt_project.yml file to support iceberg - [ ] Con... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #44 - 0.1.3 Orchestration using cron +*Status: Planned | Theme: General* + +> Source - #27 The Problem - In the absence of an automated task scheduler like cron, essential tasks that need to run periodically, such as data pulls ... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #42 - 0.1.3 Orchestration: using airflow +*Status: Planned | Theme: General* + +> writing orchestration code with Apache Airflow, along with documentation to ensure clarity and maintainability. 
+ +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #38 - 0.1.1 Demo: Questions by end user +*Status: Planned | Theme: General* + +> Thank you so much for writing this but it will help us understand the use case if you could answer the 5 Ws: 1. What is this? 2. Why is this? 3. Who w... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #33 - 0.1.1 Documentation: Read The Docs +*Status: Planned | Theme: General* + +> Inspiration : https://py.iceberg.apache.org/#installation to-do: - [ ] .README.md project overview - [ ] read the docs tutorial - [ ] pypi release not... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #29 - 0.1.1 Align on Product Framework +*Status: Planned | Theme: General* + +> **Start** It is important to understand **what we are solving and for who.** This library needs to be robust and should adapt based on the use case we... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #27 - 0.1.9 Blog the class diagram +*Status: Planned | Theme: General* + +> Answer to be added for what design principles were used to choose the class structure? + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #23 - 0.1.2 Implement Partitioning and Version Control +*Status: Planned | Theme: General* + +> You can optimize the table for queries by partitioning it based on relevant fields such as block_timestamp or signer_account_id. This will improve que... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #19 - 0.1.1 Create a Google BigQuery Client
 +*Status: Planned | Theme: General* + +> Use Python's BigQuery API to pull the data from the Near Protocol dataset. For example, this code queries the transactions table and retrieves the dat... + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #13 - 0.2.0 DosuBot : Github maintainer +*Status: Planned | Theme: General* + +> We need to check this out. https://github.com/apps/dosubot @tusharchou + +--- + +--- +tags: + - Status - Planned + - Theme - General +--- +### #5 - 0.1.1 Setup Wiki +*Status: Planned | Theme: General* + +> How to contribute to local-data-platform 0.1.2 release diff --git a/docs/wiki/ACTIVE_DOCS_URLS.md b/docs/wiki/ACTIVE_DOCS_URLS.md new file mode 100644 index 0000000..bf88b18 --- /dev/null +++ b/docs/wiki/ACTIVE_DOCS_URLS.md @@ -0,0 +1,69 @@ +# Active Hosted Docs URLs + +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/#getting-started](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/#getting-started) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/#hello-local-data-platform](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/#hello-local-data-platform) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-1](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-1) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-10](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-10) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-12](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-12) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-13](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-13) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-2](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-2) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-4](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-4) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-5](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-5) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-6](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-6) +- 
[https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-7](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-7) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-8](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-8) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-9](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#__codelineno-0-9) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#api-reference](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#api-reference) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#core-modules](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#core-modules) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#hello-world](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#hello-world) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.etl](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.etl) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions.EngineNotFound](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions.EngineNotFound) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions.PipelineNotFound](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions.PipelineNotFound) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions.PlanNotFound](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions.PlanNotFound) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions.TableNotFound](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.exceptions.TableNotFound) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.hello_world](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.hello_world) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.hello_world.hello_world](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.hello_world.hello_world) +- 
[https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.logger](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.logger) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.pipeline](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.pipeline) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.store](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#local_data_platform.store) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#pipeline](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#pipeline) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#store](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/api/#store) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/developer_feature_requests/](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/developer_feature_requests/) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/developer_feature_requests/#how-to-suggest-a-feature](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/developer_feature_requests/#how-to-suggest-a-feature) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/developer_feature_requests/#requesting-developer-features](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/developer_feature_requests/#requesting-developer-features) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-1](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-1) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-10](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-10) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-2](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-2) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-3](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-3) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-4](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-4) +- 
[https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-5](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-5) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-6](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-6) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-7](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-7) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-8](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-8) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-9](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-0-9) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-1](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-1) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-10](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-10) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-11](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-11) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-2](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-2) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-3](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-3) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-4](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-4) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-5](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-5) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-6](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-6) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-7](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-7) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-8](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-8) +- 
[https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-9](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-1-9) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-1](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-1) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-2](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-2) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-3](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-3) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-4](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-4) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-5](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#__codelineno-2-5) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#local-data-platform-recipes](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#local-data-platform-recipes) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#recipe-json-to-parquet-data-pipeline](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#recipe-json-to-parquet-data-pipeline) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#recipe-read-a-json-file](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#recipe-read-a-json-file) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#sample-json-file](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/recipes/#sample-json-file) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/user_issues/](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/user_issues/) +- [https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/user_issues/#open-issues](https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/user_issues/#open-issues) diff --git a/docs/wiki/BRANCHES.md b/docs/wiki/BRANCHES.md new file mode 100644 index 0000000..1a65380 --- /dev/null +++ b/docs/wiki/BRANCHES.md @@ -0,0 +1,22 @@ +# Branches in local-data-platform + +| Branch Name | Purpose | +|---------------------------------------------|--------------------------------------------------------------| +| main | Main production branch, stable releases | +| develop | Development integration branch (if used) | +| docs-sidebar-recipes | Docs: Add recipes page/sidebar navigation | +| docs-sidebar-recipes-from-fix-readthedocs | Docs: Recipes/sidebar, branched from fix-readthedocs | +| feat/developer-tooling | Developer tooling, Makefile, setup guide, etc. 
| +| fix-readthedocs | Fixes for Read the Docs build | +| fix/stabilize-dependencies | Dependency stabilization | +| problem-statement | Problem statement documentation | +| brmhastra-patch-1 | User patch/feature branch | +| redpheonixx-patch-1 ... patch-5 | User patch/feature branches | +| tusharchou-patch-1 ... patch-8 | User patch/feature branches | +| v0.1.1 | Release tag branch | +| 3-012-viewing-data-through-duck-db-from-iceberg | Feature: DuckDB/Iceberg integration | +| 51-011-supported-file-formats-and-io | Feature: Supported file formats and IO | +| dependabot/pip/docs/pip-e49d2f513e | Automated dependency update | +| test-pyiceberg-0-7-1 | Testing pyiceberg version 0.7.1 | + +> This table lists the main local and remote branches and their purposes. For user/feature branches, see the branch name for context. diff --git a/docs/wiki/CONTENTS.md b/docs/wiki/CONTENTS.md new file mode 100644 index 0000000..d9dfec0 --- /dev/null +++ b/docs/wiki/CONTENTS.md @@ -0,0 +1,17 @@ +# Local Data Platform Wiki + +## Contents + +- [Project Overview](/wiki/PROJECT_OVERVIEW) +- [Problem Statement](/wiki/PROBLEM_STATEMENT) +- [Technical Specifications](/wiki/TECHNICAL_SPECIFICATIONS) +- [Development & Documentation](/wiki/DEVELOPMENT) +- [Project Structure](/wiki/PROJECT_STRUCTURE) +- [Recipes & Examples](/wiki/RECIPES) +- [Contributing](/wiki/CONTRIBUTING) +- [Active Docs URLs](/wiki/ACTIVE_DOCS_URLS) +- [License](../LICENSE) + +--- + +See each file above for details on that section. diff --git a/docs/wiki/CONTRIBUTING.md b/docs/wiki/CONTRIBUTING.md new file mode 100644 index 0000000..663a9ad --- /dev/null +++ b/docs/wiki/CONTRIBUTING.md @@ -0,0 +1,5 @@ +# Contributing + +1. Fork the repo and create your branch from `main`. +2. Ensure tests and docs build (`poetry run pytest`, `poetry run mkdocs serve`). +3. Open a pull request! diff --git a/docs/wiki/DEVELOPMENT.md b/docs/wiki/DEVELOPMENT.md new file mode 100644 index 0000000..a34043f --- /dev/null +++ b/docs/wiki/DEVELOPMENT.md @@ -0,0 +1,31 @@ +# Development & Documentation + +## Requirements + +- Python 3.8+ +- [Poetry](https://python-poetry.org/) +- [MkDocs](https://www.mkdocs.org/) (for documentation) + +## Setup + +```sh +# Install dependencies +poetry install + +# (Optional) Activate the poetry shell +poetry shell +``` + +## Running Tests + +```sh +poetry run pytest +``` + +## Building & Serving Documentation + +```sh +# Serve docs locally with MkDocs +poetry run mkdocs serve +# Then open http://127.0.0.1:8000 in your browser +``` diff --git a/docs/wiki/GITHUB.md b/docs/wiki/GITHUB.md new file mode 100644 index 0000000..fbb44bb --- /dev/null +++ b/docs/wiki/GITHUB.md @@ -0,0 +1,64 @@ +# Feature: A class to interact with the github API + +Okay, here's a prompt designed to be given to Gemini Code Assist, building on our previous discussion and leveraging its ability to generate multi-file projects and detailed code. + +--- + +**Gemini Code Assist Prompt:** + +"My project needs an object-oriented Python utility to interact with GitHub repositories, specifically for fetching issues and pull requests. I already have a shared low-level API fetching utility located at `scripts/github_api.py` which contains a `fetch_from_github(owner, repo, endpoint, params)` function that handles pagination and authentication via `GITHUB_TOKEN` environment variable. + +I need you to create a new Python package and populate it with two classes: `Repo` and `Item`. + +**Here are the requirements:** + +1. 
**Project Structure:** + * Create a new directory `local_data_platform`. + * Inside `local_data_platform`, create an empty `__init__.py` file to make it a package. + * Inside `local_data_platform`, create a new file named `github.py`. + +2. **`local_data_platform/github.py` content:** + + * **Import:** It should import `fetch_from_github` from `scripts.github_api`. + + * **`Repo` Class:** + * Represents a GitHub repository. + * `__init__(self, owner: str, name: str)`: Initializes with the repository owner (username or organization) and name. + * `__repr__(self)`: Provides a helpful string representation. + + * **`Item` Class:** + * Represents a single GitHub issue or pull request. + * `__init__(self, data: dict, repo: 'Repo')`: + * Takes a `data` dictionary (the raw JSON response for an issue/PR from GitHub API) and a `repo` object (an instance of the `Repo` class). + * Determines `self.type` as either `'issue'` or `'pull_request'` based on the presence of the `'pull_request'` key in `data`. + * Initializes common attributes directly from `data`: `number`, `title`, `html_url`, `state`, `created_at`, `updated_at`, `closed_at`, `user_login` (from `user.login`), and `labels`. + * `is_pr(self) -> bool`: Returns `True` if the item is a pull request, `False` otherwise. + * `is_issue(self) -> bool`: Returns `True` if the item is a regular issue, `False` otherwise. + * `__repr__(self)`: Provides a helpful string representation including type, number, title (truncated), state, and repo name. + * `__getattr__(self, name)`: Implement this to allow accessing any key from the underlying `_data` dictionary directly as an attribute (e.g., `item.body` or `item.assignee`). If the attribute doesn't exist in `_data`, raise an `AttributeError`. + + * **`@classmethod fetch_all_items(cls, repo: Repo, status: str = "open") -> list['Item']`**: + * This is the core fetching method. + * It should take a `Repo` object and an optional `status` string (defaulting to "open"). + * It must use the imported `fetch_from_github` function, passing `repo.owner`, `repo.name`, the endpoint `"issues"`, and `params={"state": status}`. (Note: The GitHub `/issues` endpoint returns both issues and PRs, which is suitable here). + * It should then iterate through the raw data returned by `fetch_from_github` and wrap each dictionary in an `Item` object, returning a list of `Item` instances. + * Include basic type validation for `repo` parameter. + +3. **Example Usage Script (`main.py`):** + * Create a `main.py` file in the project root (sibling to `local_data_platform` and `scripts`). + * It should demonstrate how to: + * Import `Repo` and `Item`. + * Define `REPO_OWNER` and `REPO_NAME` constants (e.g., "tusharchou", "local-data-platform"). + * Instantiate a `Repo` object. + * Call `Item.fetch_all_items` to get **open** items. + * Print the total count of open items. + * Separate the fetched items into `open_issues` and `open_prs` lists using the `is_issue()` and `is_pr()` methods. + * Print the counts for open issues and open PRs. + * Loop through and print the `number`, `title`, `type`, `state`, `user_login`, `created_at` (formatted), and `html_url` for a few (e.g., top 5) issues and PRs. + * Demonstrate fetching **closed** items as well. + * Show an example of using the `__getattr__` functionality (e.g., printing `item.body` if available). + * Include a note about setting the `GITHUB_TOKEN` environment variable for authenticated requests. 
+
+Please provide the complete code for `local_data_platform/__init__.py`, `local_data_platform/github.py`, and `main.py`."
+
+---
diff --git a/docs/wiki/PROBLEM_STATEMENT.md b/docs/wiki/PROBLEM_STATEMENT.md
new file mode 100644
index 0000000..6550aff
--- /dev/null
+++ b/docs/wiki/PROBLEM_STATEMENT.md
@@ -0,0 +1,18 @@
+# Problem Statement
+
+| Question | Answer |
+|----------|--------|
+| What? | A local data platform that can scale up to the cloud. |
+| Why? | Save costs on cloud infra and development time. |
+| When? | At the start of the product development life cycle. |
+| Where? | Local first. |
+| Who? | A business that wants a product data platform that will run locally and scale up when the time comes. |
+
+
+# Problem Statement
+
+In today's data-rich world, individuals often face significant challenges managing, analyzing, and deriving meaningful insights from their personal data. Traditional "big data" solutions are typically complex, resource-intensive, and designed for enterprise-scale problems, rendering them inaccessible or overkill for personal use cases. Furthermore, pervasive privacy concerns frequently prevent individuals from leveraging convenient cloud-based tools for sensitive personal information, leading to data silos, missed opportunities for personal growth, and an inability to fully understand their own digital footprint.
+
+The **Local Data Platform (LDP)** directly addresses this critical gap. It empowers anyone with a laptop to harness the immense power of familiar Python big data libraries (such as Pandas, Dask, Polars, and more) to solve their unique personal data problems. LDP provides a structured, user-friendly, and private environment designed for individuals to research, process, and maintain their personal datasets entirely locally. This ensures unparalleled data privacy and guarantees that your valuable information remains alive and accessible precisely when you need to research or revisit it.
+
+By fostering a vibrant, community-driven approach, LDP enables users to collaborate on solutions, share best practices, and collectively evolve the platform to tackle a wide array of personal data challenges. Crucially, this collaborative growth occurs **without ever requiring users to share their sensitive raw data**, upholding the core principle of personal privacy. The ultimate goal is to cultivate a robust, accessible, and community-supported ecosystem for comprehensive personal data mastery, with all documentation and guidance readily available on the ReadTheDocs website.
\ No newline at end of file
diff --git a/docs/wiki/PRODUCT_DEVELOPMENT.md b/docs/wiki/PRODUCT_DEVELOPMENT.md
new file mode 100644
index 0000000..056a81d
--- /dev/null
+++ b/docs/wiki/PRODUCT_DEVELOPMENT.md
@@ -0,0 +1,28 @@
+# Product Development
+
+## Problem Statement
+
+GitHub issues and PRs are the soul of a project, so we should have a utility to pull and review them while working on the project.
+
+## Data Structure
+
+### github.Repo
+
+Represents the GitHub repository.
+
+### github.Item
+
+It can be either a PR or an issue.
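+
+To make the intended shape concrete, here is a minimal sketch of the two classes, assuming the shared `fetch_from_github(owner, repo, endpoint, params)` helper from `scripts/github_api.py` described on the GitHub wiki page; the attribute list is trimmed for brevity and names may differ from the final module:
+
+```python
+# Illustrative sketch only; the final version is intended to live in local_data_platform/github.py.
+from scripts.github_api import fetch_from_github  # shared low-level helper (auth + pagination)
+
+
+class Repo:
+    """A GitHub repository identified by owner and name."""
+
+    def __init__(self, owner: str, name: str):
+        self.owner = owner
+        self.name = name
+
+    def __repr__(self) -> str:
+        return f"Repo({self.owner}/{self.name})"
+
+
+class Item:
+    """A single GitHub issue or pull request belonging to a Repo."""
+
+    def __init__(self, data: dict, repo: Repo):
+        self._data = data
+        self.repo = repo
+        # The /issues endpoint returns PRs too; they carry a 'pull_request' key.
+        self.type = "pull_request" if "pull_request" in data else "issue"
+        self.number = data["number"]
+        self.title = data["title"]
+        self.state = data["state"]
+
+    def is_pr(self) -> bool:
+        return self.type == "pull_request"
+
+    def is_issue(self) -> bool:
+        return self.type == "issue"
+
+    @classmethod
+    def fetch_all_items(cls, repo: Repo, status: str = "open") -> list["Item"]:
+        """Fetch every issue and PR of `repo` in the given state."""
+        raw = fetch_from_github(repo.owner, repo.name, "issues", params={"state": status})
+        return [cls(entry, repo) for entry in raw]
+```
+
+For example, `Item.fetch_all_items(Repo("tusharchou", "local-data-platform"))` returns a list that can then be split with `is_issue()` and `is_pr()`.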
+
+#### Usage
+
+```python
+local_data_platform.github.Item
+```
+
+#### Methods
+
+`fetch_all_items(repo, status="open")`: fetch all items (issues and PRs) for the repo with the given state.
+
diff --git a/docs/wiki/PROJECT_OVERVIEW.md b/docs/wiki/PROJECT_OVERVIEW.md
new file mode 100644
index 0000000..286fd80
--- /dev/null
+++ b/docs/wiki/PROJECT_OVERVIEW.md
@@ -0,0 +1,10 @@
+# Project Overview
+
+**local-data-platform** is a Python library to build, test, and run a complete data platform on your local machine. The core idea is to provide a "toy box for data"—a local environment where you can manage the entire data lifecycle, from ingestion to reporting, before needing to scale up to a cloud environment.
+
+This approach allows developers and businesses to save on cloud infrastructure costs during the initial development and testing phases, with a clear path for future scaling.
+
+> **Vision:** Local Data Platform is used as a Python library to learn
+> and operate a data lakehouse locally.
+> **Mission:** Develop a Python package that provides solutions for all stages
+> of data organisation, ranging from ingestion to reporting. The goal is that one can build data pipelines locally, test them, and easily scale up to the cloud.
diff --git a/docs/wiki/PROJECT_STRUCTURE.md b/docs/wiki/PROJECT_STRUCTURE.md
new file mode 100644
index 0000000..19a5b8f
--- /dev/null
+++ b/docs/wiki/PROJECT_STRUCTURE.md
@@ -0,0 +1,32 @@
+# Project Structure
+
+```
+tusharchou/local-data-platform/
+├── local_data_platform/
+│   ├── __init__.py
+│   ├── storage_base.py
+│   └── in_memory_storage.py
+├── scripts/
+│   ├── __init__.py
+│   └── github_api.py
+├── main.py
+├── mkdocs.yml                 <-- NEW FILE (MkDocs Configuration)
+├── docs/                      <-- DIRECTORY
+│   ├── index.md               <-- CHANGED (Formerly index.rst)
+│   ├── problem_statement.md   <-- CHANGED (Formerly problem_statement.rst)
+│   ├── installation.md        <-- Placeholder for new page
+│   ├── usage.md               <-- Placeholder for new page
+│   ├── api_reference.md       <-- Placeholder for new page (for mkdocstrings)
+│   ├── contributing.md        <-- Placeholder for new page
+│   └── community.md           <-- Placeholder for new page
+├── requirements-dev.txt       <-- NEW FILE (For dev/docs dependencies)
+└── README.md
+```
+
+## .gitignore
+
+The following are ignored to keep the repo clean:
+- IDE/project files (`.idea/`)
+- Python cache (`__pycache__/`)
+- Sphinx build output (`docs/_build/`)
+- MkDocs static site output (`site/`)
diff --git a/docs/wiki/RECIPES.md b/docs/wiki/RECIPES.md
new file mode 100644
index 0000000..08ea838
--- /dev/null
+++ b/docs/wiki/RECIPES.md
@@ -0,0 +1,5 @@
+# Recipes & Examples
+
+See [docs/recipes.md](../docs/recipes.md) for practical usage examples, including:
+- Reading a JSON file
+- Building a JSON-to-Parquet pipeline
diff --git a/docs/wiki/TECHNICAL_SPECIFICATIONS.md b/docs/wiki/TECHNICAL_SPECIFICATIONS.md
new file mode 100644
index 0000000..4d7b38e
--- /dev/null
+++ b/docs/wiki/TECHNICAL_SPECIFICATIONS.md
@@ -0,0 +1,3 @@
+# Technical Specifications
+
+[Introduction to pyiceberg](https://medium.com/@tushar.choudhary.de/internals-of-apache-pyiceberg-10c2302a5c8b)
diff --git a/docs/wiki/VISION.md b/docs/wiki/VISION.md
new file mode 100644
index 0000000..0e50b42
--- /dev/null
+++ b/docs/wiki/VISION.md
@@ -0,0 +1,29 @@
+# Welcome to the Local Data Platform (LDP)!
+
+**Empowering Personal Data Mastery with Python.**
+
+The Local Data Platform (LDP) is a revolutionary open-source initiative designed to put the power of "big data" Python libraries directly into the hands of individuals.
Whether you're a data enthusiast, a researcher, or simply someone looking to gain deeper insights from your personal information, LDP provides the tools and framework to do so securely and privately, right on your laptop. + +--- + +## Problem Statement + +_The full problem statement is detailed on the [Problem Statement](./PROBLEM_STATEMENT.md) page._ + +--- + +## Why Local? + +In an era where data privacy is paramount, LDP champions a local-first approach. Your data stays on your machine, under your control. This eliminates the need to upload sensitive information to third-party cloud services, giving you peace of mind while still enabling powerful analysis. + +## Key Features (Coming Soon!) + +* **Offline Capability:** Work with your data anywhere, anytime, without an internet connection. +* **Privacy by Design:** Your personal data never leaves your device unless you explicitly choose to share it. +* **Scalable Personal Analytics:** Leverage libraries like Pandas, Dask, Polars, and more for efficient processing of large datasets. +* **Community-Driven Solutions:** Collaborate with others to develop and share solutions for common personal data challenges. +* **Extensible Architecture:** Easily integrate new data sources, processing modules, and visualization tools. + +## Get Started + +Ready to take control of your personal data? Head over to our [Installation](installation.md) guide to set up LDP on your machine. \ No newline at end of file diff --git a/docs/wiki/business/MARKETING_DATA_ANALYSIS.md b/docs/wiki/business/MARKETING_DATA_ANALYSIS.md new file mode 100644 index 0000000..772b79a --- /dev/null +++ b/docs/wiki/business/MARKETING_DATA_ANALYSIS.md @@ -0,0 +1,69 @@ +# Agentic Marketing Analyser + +## Summary + +This feature proposes the integration of an AI-powered agent that delivers location-specific marketing insights to help tour and activity operators understand what’s working and what needs attention across their local digital channels. Powered by a Local Data Platform (LDP), the agent transforms raw marketing data into clear, actionable recommendations that drive bookings, visibility, and revenue. + +## Problem It Solves + +Most small to mid-sized tourism operators struggle with marketing decisions. Common pain points include: + +- "Which campaigns are working in which locations?" +- "What should I post next to drive more bookings?" +- "Why are bookings slow this week despite good reviews?" +- "Where should I focus marketing budget or effort?" + +They lack in-house marketing teams. What they need is a smart, always-on guide that understands local trends and acts like a data-driven marketing assistant. + +## Feature Summary + +The AI agent will: +- Aggregate social media, OTA, review, and campaign data. +- Analyze performance by product, region, season, and timing. +- Generate insights in natural GPT-style language. +- Recommend next best actions to improve visibility, conversions, and bookings. 
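+
+As an illustration of the kind of local-first analysis the agent would automate (a minimal sketch; the column names `location`, `channel`, `spend`, and `bookings` and the 1.5x threshold are assumptions, not a committed schema), a first pass over exported campaign data might look like this:
+
+```python
+# Hypothetical sketch: aggregate campaign results by location and flag weak channels.
+import pandas as pd
+
+# Raw export kept on the operator's own machine (local-first, no cloud upload).
+campaigns = pd.read_parquet("marketing/campaigns.parquet")
+
+by_location = (
+    campaigns.groupby(["location", "channel"], as_index=False)
+    .agg(spend=("spend", "sum"), bookings=("bookings", "sum"))
+)
+by_location["cost_per_booking"] = by_location["spend"] / by_location["bookings"].clip(lower=1)
+
+# "Needs attention": channels whose cost per booking is well above the local median.
+median_cpb = by_location.groupby("location")["cost_per_booking"].transform("median")
+flagged = by_location[by_location["cost_per_booking"] > 1.5 * median_cpb]
+print(flagged.sort_values("cost_per_booking", ascending=False).head())
+```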
+ +## Rollout Plan: 6-Month Delivery + +| Phase | Duration | Milestones | +| :--- | :--- | :--- | +| **Discovery & Planning** | 2 weeks | Source mapping, data schemas, risk planning | +| **Core Infrastructure Build** | 6 weeks | Pipeline development, LDP engine setup | +| **AI Layer + Prompt Engineering** | 6 weeks | Smart recommendation engine, insight model tests | +| **UI + Chatbot Integration** | 4 weeks | Marketing dashboard surfaces, chatbot responses | +| **Pilot & Feedback Loop** | 3 weeks | Testing in 3–5 cities with live operator data | +| **Launch & Handoff** | 2 weeks | Final training, documentation, and monitoring setup | + +## Cloud & Infra Cost (Client-Side Estimate) + +| Item | Estimated Allocation | +| :--- | :--- | +| Infra & Cloud Services (GPT-4 API, server infra, storage) | ₹1.5–2 Cr | +| Security, DevOps, Monitoring | ₹0.75 Cr | +| Contingency + Risk Buffer | ₹1.5 Cr | + +## Team Structure (10 Members) + +| Role | Count | Responsibilities | +| :--- | :--- | :--- | +| Data Engineers | 2 | API integration, pipeline design, enrichment | +| Backend Engineers | 2 | Core infra, LDP performance, uptime | +| AI/ML Engineers | 2 | GPT prompt design, training, insight generation | +| Product Manager | 1 | Roadmap, delivery, stakeholder alignment | +| Product Designer | 1 | UX/UI for dashboards, insight cards, chatbot flow | +| Frontend Engineers | 2 | Dashboard & chatbot UI integration | + +## Why This Will Work for You + +- You're sitting on a goldmine of marketing and booking data — this platform unlocks its value. +- AI makes local marketing insights human-readable and intuitive, not spreadsheet-heavy. +- You’ll never have to guess the right channel, time, or campaign again. +- Built to scale across cities, seasons, tours, and customer types. +- Seamlessly integrates with your existing product ecosystem. + +## Next Step + +> Let’s schedule a free 30-minute discovery session with your team to: +> - Walk through the use cases. +> - Review how it can be embedded in your product. +> - Map early adopters and go-to-market timeline. diff --git a/docs/wiki/business/PROPOSAL.md b/docs/wiki/business/PROPOSAL.md new file mode 100644 index 0000000..8ac8b8c --- /dev/null +++ b/docs/wiki/business/PROPOSAL.md @@ -0,0 +1,31 @@ +# Smarter Tools for Local Businesses, Designed to Save Costs + +*From pricing analytics to AI automation, we craft data-first solutions that actually work.* +> [**Talk to Us →**](#) + +--- + +## Our Solutions + +Explore our plug-and-play solutions designed for businesses in tourism, retail, food, and more. + +| Solutions | Description | Call to Action | +| :--- | :--- | :--- | +| **Agentic Marketing Analyser** | Understand user behavior and improve your product. | [View Details](#) | +| **Photo Management & Easy Sharing** | Organize and share your visual assets effortlessly. | [Explore](#) | +| **Fraud Detection** (Coming Soon) | Protect your business with our upcoming fraud detection tools. | [See Use Case](#) | + +--- + +## Why Choose Us? + +* **Customizable**: Tailored to your use case, not a bloated SaaS tool. +* **Affordable**: Pricing that respects early-stage teams. +* **Private & Secure**: 100% privacy-safe, your data stays with you. + +--- + +> ### 💡 Don’t see exactly what you need? +> Let’s co-create a tool for your business. 
+>
+> [**Talk to Us →**](#)
diff --git a/docs/wiki/library/BASE.md b/docs/wiki/library/BASE.md
new file mode 100644
index 0000000..fdfccca
--- /dev/null
+++ b/docs/wiki/library/BASE.md
@@ -0,0 +1,24 @@
+# Base
+
+The "base class" of a new Python library is a foundational component that sets the architectural standard for other classes within the library, especially when you anticipate multiple, varied implementations of a core concept.
+
+Its primary purpose is to define a common interface and/or shared functionality that derived classes will inherit or adhere to.
+
+Here's what should generally inform the design of a base class for a new Python library:
+
+## What is the Core Concept/Abstraction?
+
+What fundamental operation or entity does your library revolve around? (e.g., a Connector to different databases, a Parser for various file formats, a Strategy for different algorithms, a DataSource for different data origins). The base class should represent this abstraction.
+
+## What are the Common Operations/Interface?
+
+What actions or behaviors must any concrete implementation of this core concept provide? These will become your abstract methods.
+
+What common utility methods or shared logic can be provided directly by the base class to avoid code duplication in derived classes? These will be your concrete methods.
+
+### To Be Abstract or Not (ABCs)
+
+If you want to enforce an interface, use an Abstract Base Class (ABC) from the `abc` module. This is highly recommended when you want to ensure that any class inheriting from your base class must implement certain methods. If a concrete class fails to implement an `@abstractmethod`, Python will raise a `TypeError` upon instantiation.
+
+Example use case: a `StorageBase` class with abstract `get()` and `put()` methods. Any new storage backend (e.g., `S3Storage`, `FileSystemStorage`) must provide these methods.
+
diff --git a/docs/wiki/library/EXCEPTION.md b/docs/wiki/library/EXCEPTION.md
new file mode 100644
index 0000000..92807b6
--- /dev/null
+++ b/docs/wiki/library/EXCEPTION.md
@@ -0,0 +1,164 @@
+# Exception
+
+Exception handling is crucial for building robust, reliable, and user-friendly Python libraries. Good exception handling communicates issues clearly to the library user, helps with debugging, and prevents silent failures.
+
+Here are the best practices for exception handling in a Python library:
+
+1. **Don't Silence Exceptions (The Golden Rule):**
+
+    * **Avoid `except: pass` or `except Exception: pass`.** This is the most common and dangerous anti-pattern. It hides bugs, makes debugging impossible, and leads to unexpected behavior in user applications.
+    * **Instead, at a minimum, log the exception and re-raise it, or transform it into a more specific, higher-level exception.**
+
+2. **Be Specific with `except` Clauses:**
+
+    * Catch only the specific exceptions you expect and know how to handle.
+    * **Bad:**
+    ```python
+    try:
+        ...  # network operation
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        # Too broad: this swallows every error indiscriminately and hides the real cause.
+ ``` + * **Good:** + ```python + import requests + try: + response = requests.get("http://example.com/api") + response.raise_for_status() + except requests.exceptions.Timeout: + raise MyLibraryTimeoutError("API request timed out.") from None + except requests.exceptions.ConnectionError: + raise MyLibraryNetworkError("Could not connect to the API server.") from None + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + raise MyLibraryResourceNotFoundError("Requested resource not found.") from None + else: + # Re-raise generic HTTP errors or wrap in a generic library error + raise MyLibraryAPIError(f"API returned an error: {e.response.status_code}") from e + except Exception as e: # Catching a broader Exception at the very end as a last resort + # Log the unexpected exception details for debugging + import logging + logging.exception("An unexpected error occurred in API call") + raise MyLibraryUnknownError("An unexpected error occurred.") from e + ``` + +3. **Raise Custom Exceptions:** + + * **Why:** This is paramount for libraries. Custom exceptions provide clear, semantic meaning to errors originating from your library. Users can then specifically catch *your* library's errors without accidentally catching unrelated issues from other parts of their application or other libraries. + * **How:** Create a base exception for your library, and then derive more specific exceptions from it. + ```python + # In your library's exceptions.py (or similar) + class MyLibraryError(Exception): + """Base exception for MyLibrary.""" + pass + + class MyLibraryConnectionError(MyLibraryError): + """Raised when a connection to a service fails.""" + pass + + class MyLibraryConfigError(MyLibraryError): + """Raised when the library configuration is invalid.""" + pass + + class MyLibraryResourceNotFoundError(MyLibraryError): + """Raised when a specific resource cannot be found.""" + pass + ``` + * **Usage:** + ```python + if not os.path.exists(config_path): + raise MyLibraryConfigError(f"Configuration file not found at: {config_path}") + ``` + +4. **Provide Clear and Informative Error Messages:** + + * When raising or re-raising an exception, the message should explain *what went wrong*, *why it went wrong*, and ideally, *how the user might fix it* (if applicable). + * Include relevant context: input values, file paths, IDs, error codes from external services. + * **Bad:** `raise MyLibraryError("Something went wrong.")` + * **Good:** `raise MyLibraryConnectionError(f"Failed to connect to {url}. Please check your network connection.")` + +5. **Use Exception Chaining (`raise ... from ...`):** + + * When you catch a lower-level exception and re-raise a new, higher-level (custom) exception, use `raise NewException(...) from OriginalException`. + * This preserves the original exception's traceback, providing a full "cause" chain, which is invaluable for debugging. + * Use `from None` if you *don't* want the original exception to be implicitly chained (e.g., if it's an internal detail you've fully handled and transformed). + + + + ```python + import json + class MyLibraryParseError(MyLibraryError): pass + + try: + data = json.loads(invalid_json_string) + except json.JSONDecodeError as e: + # Chaining preserves the original JSONDecodeError traceback + raise MyLibraryParseError("Failed to parse JSON data.") from e + ``` + +6. **Use `finally` for Cleanup:** + + * The `finally` block *always* executes, regardless of whether an exception occurred in the `try` block or not. 
+ * This is ideal for releasing resources like file handles, network connections, database cursors, or locks. + + + + ```python + file_handle = None + try: + file_handle = open("my_file.txt", "w") + file_handle.write("Hello") + except IOError as e: + raise MyLibraryIOError("Could not write to file.") from e + finally: + if file_handle: + file_handle.close() # Guarantees the file is closed + ``` + +7. **Prefer `with` statements for Resource Management:** + + * For resources that support the context manager protocol (like files, locks, database connections), the `with` statement is generally preferred over `try-finally` for cleanup. It automatically handles `__enter__` and `__exit__` methods, ensuring resources are properly acquired and released even if exceptions occur. + + + + ```python + try: + with open("my_file.txt", "w") as f: + f.write("Hello") + # File is automatically closed here, even if f.write() failed + except IOError as e: + raise MyLibraryIOError("Could not write to file.") from e + ``` + +8. **Log Exceptions, Don't Print:** + + * Use Python's `logging` module instead of `print()` for debugging and operational messages. + * Logging allows users of your library to configure how and where messages are stored (console, file, syslog, etc.) and at what level of detail (DEBUG, INFO, WARNING, ERROR, CRITICAL). + * `logging.exception()` is particularly useful as it automatically includes traceback information. + + + + ```python + import logging + logger = logging.getLogger(__name__) # Or get a common library logger + + try: + # some risky operation + except SomeError as e: + logger.error(f"Failed operation due to: {e}") # Basic error message + logger.exception("Detailed traceback for debugging this failure:") # Full traceback + raise # Re-raise after logging + ``` + +9. **Design Your API Around Exceptions:** + + * Document the exceptions your public functions and methods might raise. This forms part of your library's contract with its users. Users need to know what errors to anticipate and handle. + * Avoid leaking internal implementation details via low-level exceptions. Wrap them in your library's custom exceptions. + +10. **Avoid Catching `BaseException`:** + + * `BaseException` is the root of *all* exceptions, including `SystemExit` (raised by `sys.exit()`) and `KeyboardInterrupt` (Ctrl+C). Catching `BaseException` will prevent your program from exiting cleanly or responding to interrupts. + * Generally, you should only catch `Exception` (which `SystemExit` and `KeyboardInterrupt` do *not* inherit from). + +By adhering to these best practices, you can create Python libraries that are robust, easy to debug, and provide a clear, predictable error handling experience for their users. \ No newline at end of file diff --git a/docs/wiki/library/TEST.md b/docs/wiki/library/TEST.md new file mode 100644 index 0000000..a597f63 --- /dev/null +++ b/docs/wiki/library/TEST.md @@ -0,0 +1,156 @@ +# Test + + +Testing is paramount for a Python library to ensure its correctness, reliability, maintainability, and ease of use for its consumers. A well-tested library inspires confidence and reduces the burden on its users. + +Here are the best practices for testing in a Python library: + +## I. Core Principles + +1. **Correctness:** Ensure the library behaves exactly as expected for all valid inputs and scenarios. +2. **Reliability:** Ensure the library handles edge cases, invalid inputs, and error conditions gracefully without crashing or producing incorrect results. +3. 
**Prevent Regressions:** Catch bugs introduced in new code changes that break existing functionality. +4. **Documentation:** Tests serve as executable documentation for how to use the library's public API. +5. **Maintainability:** Well-structured tests make it easier to refactor code confidently. + +## II. Types of Tests + +1. **Unit Tests:** + + * **Focus:** Test the smallest possible unit of code (a single function, method, or class) in isolation. + * **Isolation is Key:** All external dependencies (database calls, API requests, file system interactions, complex object dependencies) should be **mocked or stubbed** to ensure that only the unit under test is being validated. + * **Characteristics:** Fast, granular, easy to pinpoint failures. + * **Purpose:** Verify the correctness of individual algorithms and logic. + +2. **Integration Tests:** + + * **Focus:** Test how different units or components of your library interact with each other, or how your library interacts with external systems (e.g., a database, an external API, the file system). + * **Less Isolation:** These tests will involve actual interaction with some dependencies, though often with test-specific configurations (e.g., an in-memory database, a local mock server). + * **Characteristics:** Slower than unit tests, but provide higher confidence in the system's overall functionality. + * **Purpose:** Verify that components work together as intended. + +3. **End-to-End (E2E) Tests (Less common for pure libraries):** + + * If your library has a CLI, a web interface built on top of it, or is a full application, E2E tests would simulate real user scenarios from start to finish. For most pure libraries, integration tests often cover this scope. + +## III. Recommended Tools & Frameworks + +1. **`pytest` (Strongly Recommended):** + + * **Advantages:** Less boilerplate code, simple `assert` statements, powerful fixtures for setup/teardown, excellent plugin ecosystem (`pytest-cov` for coverage, `pytest-mock` for mocking, `pytest-xdist` for parallel execution). + * **Standard:** It's become the de-facto standard for Python testing due to its ease of use and flexibility. + +2. **`unittest` (Built-in):** + + * **Advantages:** Part of Python's standard library, no external dependencies needed. + * **Considerations:** More verbose syntax (`assertEqual`, `assertRaises`), class-based test suites. Good for simpler projects or when external dependencies are strictly forbidden. + +## IV. Best Practices for Writing Tests + +1. **Test Public APIs (Interface, not Implementation):** + + * Focus on testing the functions, classes, and methods that users of your library will directly interact with. + * Avoid testing private or internal helper functions directly unless they contain complex, isolated logic that warrants their own unit tests. If you refactor internals, these tests shouldn't break. + +2. **Test Isolation and Mocks:** + + * **Rule:** Each test should run independently of others and produce the same result every time, regardless of the order of execution. + * **Mocks:** For unit tests, use mocking libraries (`unittest.mock` or `pytest-mock`) to simulate the behavior of external dependencies or complex internal objects. This keeps tests fast and prevents failures due to external factors. 
+    * **Example (using `pytest-mock`):**
+    ```python
+    def test_fetch_data_from_api(mocker):
+        mock_response = mocker.Mock()
+        mock_response.json.return_value = {"key": "value"}
+        mocker.patch('requests.get', return_value=mock_response)  # Mock requests.get
+
+        result = my_library.fetch_data()  # Your library function that calls requests.get
+        assert result == {"key": "value"}
+    ```
+
+3. **Clear, Readable, and Self-Contained Tests:**
+
+    * **Arrange-Act-Assert (AAA) Pattern:**
+        * **Arrange:** Set up the test environment (input data, mocks, initial state).
+        * **Act:** Execute the code under test.
+        * **Assert:** Verify the outcome (return values, side effects, exceptions raised).
+    * **Meaningful Test Names:** Test function names should clearly indicate what scenario they are testing and what the expected outcome is (e.g., `test_add_two_positive_numbers_returns_sum`, `test_parse_empty_string_raises_value_error`).
+
+4. **Test Edge Cases and Error Conditions:**
+
+    * Test with: `None` values, empty strings/lists/dictionaries, boundary conditions (min/max values), invalid inputs, files that don't exist, network errors, permissions issues, etc.
+    * **Testing Exceptions:** Assert that the correct exceptions are raised under specific conditions.
+    ```python
+    import pytest
+    def test_divide_by_zero_raises_zero_division_error():
+        with pytest.raises(ZeroDivisionError, match="division by zero"):
+            1 / 0
+    ```
+
+5. **Use Fixtures Wisely (`pytest`):**
+
+    * Fixtures provide a clean way to set up preconditions for tests (e.g., creating temporary files, setting up a database connection, providing pre-initialized objects).
+    * They promote code reuse and improve readability by centralizing setup/teardown logic.
+    * **Example:**
+    ```python
+    import os
+    import pytest
+    import tempfile
+
+    @pytest.fixture
+    def temp_file_path():
+        with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp:
+            tmp.write("test content")
+            file_path = tmp.name
+        yield file_path  # Provide the path to the test
+        os.remove(file_path)  # Clean up after the test
+
+    def test_read_from_temp_file(temp_file_path):
+        with open(temp_file_path, 'r') as f:
+            content = f.read()
+        assert content == "test content"
+    ```
+
+6. **Parameterized Tests (`pytest.mark.parametrize`):**
+
+    * When you have a function that needs to be tested with multiple sets of inputs and expected outputs, parameterization reduces code duplication.
+
+    ```python
+    import pytest
+
+    @pytest.mark.parametrize("input_a, input_b, expected_sum", [
+        (1, 2, 3),
+        (0, 0, 0),
+        (-1, 5, 4),
+        (100, 200, 300)
+    ])
+    def test_add_function(input_a, input_b, expected_sum):
+        assert (input_a + input_b) == expected_sum
+    ```
+
+7. **Strive for High Test Coverage (But Don't Obsess):**
+
+    * Use tools like `pytest-cov` (or `coverage.py`) to measure test coverage. Aim for a high percentage (e.g., 80-90%+ for core logic).
+    * **Caution:** High coverage doesn't guarantee correctness; it only tells you what lines were executed. You still need good assertions and tests for various scenarios. Focus on *meaningful* coverage over just line coverage.
+
+8. **Integrate with CI/CD:**
+
+    * Automate your tests to run on every commit, push, or pull request using Continuous Integration (CI) services (e.g., GitHub Actions, GitLab CI, Jenkins). This catches regressions early.
+
+9. **Tests as Documentation:**
+
+    * Well-written tests serve as the best, always-up-to-date examples of how to use your library's features. They demonstrate expected inputs, outputs, and behaviors for various scenarios.
+ +10. **Refactor Tests:** + + * Just like production code, tests need to be maintained and refactored. Keep them clean, readable, and efficient. Avoid excessive complexity in tests themselves. + +## V. What to Avoid + + * **Tests that depend on order:** Ensure each test is independent. + * **Testing private methods extensively:** Focus on the public API; if a private method is complex enough to warrant its own detailed tests, it might be a candidate for its own public function or class. + * **Over-mocking:** Only mock what's necessary. Too much mocking can make tests brittle (sensitive to internal refactors) and lose their ability to catch real integration issues. + * **Ignoring test failures:** A failing test means a bug or an outdated test. Address it immediately. + * **Slow unit tests:** Unit tests should run quickly. If they are slow, it often indicates an issue with external dependencies that should be mocked. + +By diligently applying these practices, you'll build a Python library that is not only functional but also robust, maintainable, and a pleasure for others to use. \ No newline at end of file diff --git a/docs/wiki/plan/ISSUES.md b/docs/wiki/plan/ISSUES.md new file mode 100644 index 0000000..b83fdc7 --- /dev/null +++ b/docs/wiki/plan/ISSUES.md @@ -0,0 +1,17 @@ +# How to Contribute + +We're thrilled that you're interested in contributing to the Local Data Platform! Your help is essential for keeping it great. + +This section provides guidelines for contributing to the project. Please take a moment to review this document in order to make the contribution process easy and effective for everyone involved. +Following these guidelines helps to communicate that you respect the time of the developers managing and developing this open-source project. In return, they should reciprocate that respect in addressing your issue, assessing changes, and helping you finalize your pull requests. + +## 🏆 Open Issues to Contribute On + + +| Title | Theme | Status | Comments | Votes | +|-------|-------|--------|----------|-------| +| Example: Add BigQuery Ingestion | Ingestion | In Progress | 5 | 12 | +| Example: Improve Error Handling | Core | Under Review | 3 | 8 | +| Example: Add Parquet Export | Egression | Planned | 2 | 6 | + +--- \ No newline at end of file diff --git a/docs/wiki/usecases/PHOTO_MANAGEMENT.md b/docs/wiki/usecases/PHOTO_MANAGEMENT.md new file mode 100644 index 0000000..30b2f92 --- /dev/null +++ b/docs/wiki/usecases/PHOTO_MANAGEMENT.md @@ -0,0 +1,19 @@ +# Use Case: Local Photo Management & Easy Sharing + +## The Problem + +In the digital age, our photo collections grow exponentially, quickly consuming precious local storage. Managing thousands of photos, finding specific ones, and securely sharing select albums with friends and family without relying on privacy-invasive cloud services becomes a significant personal data challenge. + +* **Storage Bloat:** High-resolution photos take up immense disk space. +* **Organization Chaos:** Photos are scattered, un-tagged, and difficult to search. +* **Privacy Concerns:** Uploading personal photos to public or semi-public cloud albums often compromises privacy. +* **Sharing Friction:** Sending large batches of photos is cumbersome, often leading to using sub-optimal methods or public platforms. + +## How LDP Solves It + +The Local Data Platform (LDP) provides a robust, privacy-first, and highly customizable solution for managing your personal photo library. By leveraging LDP, you can: + +1. 
**Intelligent Local Compression:** Drastically reduce file sizes of your photos using Python's rich ecosystem of image processing libraries (e.g., Pillow, OpenCV, scikit-image) and various compression algorithms (e.g., JPEG optimization, WebP conversion) – all performed locally on your machine. +2. **Automated Organization & Tagging:** Process photo metadata (EXIF data like date, time, location) to automatically organize your collection. Integrate custom tagging systems to make photos easily searchable. +3. **Privacy-Preserving Sharing:** Generate temporary, secure, and shareable links that serve photos directly from your local machine, within your local network or via controlled internet access (e.g., through a temporary tunnel). This allows you to share with ease without ever permanently uploading your entire collection to a third-party service. + diff --git a/docs/wiki/workflow/PHOTO_MANAGEMENT.md b/docs/wiki/workflow/PHOTO_MANAGEMENT.md new file mode 100644 index 0000000..451e06a --- /dev/null +++ b/docs/wiki/workflow/PHOTO_MANAGEMENT.md @@ -0,0 +1,86 @@ +## Example + +Imagine a typical scenario where you have thousands of vacation photos. Here's how LDP helps: + +```python +import os +from datetime import datetime +from local_data_platform.storage_base import StorageBase +# Assuming you'd implement a concrete FileSystemStorage or similar +from local_data_platform.in_memory_storage import InMemoryStorage # Or a new FileSystemStorage +from local_data_platform.photo_processing import PhotoCompressor, PhotoOrganizer # Hypothetical modules/classes +from local_data_platform.local_server import LocalFileShareServer # Hypothetical module/class + +# 1. Define your local photo storage +# In a real scenario, this would likely be a FileSystemStorage +# For demonstration, let's use a dummy in-memory one or assume a setup +class FileSystemStorage(StorageBase): + def __init__(self, base_path: str): + self.base_path = base_path + os.makedirs(base_path, exist_ok=True) + self._data = {} # Simulating file paths/content for example + + def put(self, key: str, value: bytes): + # In real-world, save binary 'value' to 'os.path.join(self.base_path, key)' + self._data[key] = value + print(f"Stored: {key}") + + def get(self, key: str, default=None): + # In real-world, read binary from 'os.path.join(self.base_path, key)' + return self._data.get(key, default) + +# Setup a specific storage path for photos +photo_storage = FileSystemStorage(base_path="./my_photo_vault") + +# 2. Process your raw photos +photo_compressor = PhotoCompressor(quality=80, output_format="webp") +photo_organizer = PhotoOrganizer(storage_backend=photo_storage) + +raw_photo_paths = ["./vacation/img_001.jpg", "./vacation/img_002.png"] # Paths to your original photos + +# Simulate loading and processing photos +processed_photos_info = [] +for path in raw_photo_paths: + # In reality, read image data from 'path' + image_data_raw = b"..." 
# Dummy binary data + + # Apply compression + compressed_data = photo_compressor.compress(image_data_raw) + + # Generate a new key/path for storage (e.g., based on hash or metadata) + photo_id = f"compressed_vacation_{os.path.basename(path).split('.')[0]}.webp" + + # Store the compressed photo + photo_storage.put(photo_id, compressed_data) + + # Extract metadata and organize + metadata = {"date": datetime.now().isoformat(), "tags": ["vacation", "beach"]} + photo_organizer.organize(photo_id, metadata) + processed_photos_info.append({"id": photo_id, "path": os.path.join(photo_storage.base_path, photo_id)}) + +print("\nPhotos processed and stored locally.") + +# 3. Share a selection of photos easily and privately +photos_to_share_ids = [processed_photos_info[0]['id']] # Just sharing the first one for example + +# The LocalFileShareServer would temporarily serve these files +share_server = LocalFileShareServer( + storage_backend=photo_storage, + allowed_ids=photos_to_share_ids, + expiration_minutes=60 +) + +# This would start a simple web server in a background thread or process +# and provide a URL that others on the same network can access. +print("\nStarting local sharing server...") +share_url = share_server.start_sharing() +print(f"Share these photos via: {share_url}") +print("Server will automatically stop after 60 minutes or when you close the application.") + +# In a real application, you'd keep the script running for the server to serve, +# or integrate it into a long-running LDP daemon/UI. +# For this example, we'll just print the URL and simulate stopping. +share_server.stop_sharing() +print("Sharing server stopped.") + +``` \ No newline at end of file diff --git a/how_to_setup.md b/how_to_setup.md new file mode 100644 index 0000000..009a914 --- /dev/null +++ b/how_to_setup.md @@ -0,0 +1,70 @@ +# How to Set Up for Development + +This guide explains how to set up your local environment for contributing to the `local-data-platform` project. Following these steps will ensure you have a consistent development environment that matches our CI pipeline. + +## Prerequisites + +Before you begin, ensure you have the following installed on your system: + +- **Git**: For version control. +- **Python**: Version 3.12 or newer. You can check with `python --version`. +- **Poetry**: For dependency management. We recommend installing it with `pipx` to avoid dependency conflicts. + ```sh + # Install pipx if you don't have it + python -m pip install --user pipx + python -m pipx ensurepath + + # Install poetry using pipx + pipx install poetry + ``` + +## Step 1: Clone the Repository + +Clone the project from GitHub and navigate into the project directory: + +```sh +git clone https://github.com/tusharchou/local-data-platform.git +cd local-data-platform +``` + +## Step 2: Install Dependencies + +This project uses Poetry to manage dependencies. To install all packages required for development, testing, and building documentation, run: + +```sh +poetry install --with dev,docs +``` +This command will create a virtual environment within the project folder and install all the necessary libraries. You can activate it by running `poetry shell`. + +## Step 3: Verify Your Setup + +To ensure everything is configured correctly, run the linters, tests, and build the documentation. + +### Run Linters +We use `flake8` to enforce code style. Check for any linting issues with: +```sh +poetry run flake8 src/ tests/ +``` + +### Run Tests +Our tests are written with `pytest`. 
Execute the test suite by running: +```sh +poetry run pytest +``` + +### Build Documentation +The documentation is built with `MkDocs`. To preview the docs locally with live-reloading, run: +```sh +poetry run mkdocs serve +``` +You can then open `http://127.0.0.1:8000` in your web browser. To perform a strict build like our CI process, use `poetry run mkdocs build --strict`. + +You are now ready to contribute to `local-data-platform`! + +## Troubleshooting + +If you encounter issues, especially with dependencies, try a clean re-installation: + +```sh +make reinstall +``` \ No newline at end of file diff --git a/local-data-platform/local_data_platform/__init__.py b/local_data_platform/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/__init__.py rename to local_data_platform/__init__.py diff --git a/local-data-platform/local_data_platform/catalog/__init__.py b/local_data_platform/catalog/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/catalog/__init__.py rename to local_data_platform/catalog/__init__.py diff --git a/local-data-platform/local_data_platform/catalog/local/__init__.py b/local_data_platform/catalog/local/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/catalog/local/__init__.py rename to local_data_platform/catalog/local/__init__.py diff --git a/local-data-platform/local_data_platform/catalog/local/iceberg/__init__.py b/local_data_platform/catalog/local/iceberg/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/catalog/local/iceberg/__init__.py rename to local_data_platform/catalog/local/iceberg/__init__.py diff --git a/local-data-platform/local_data_platform/engine/__init__.py b/local_data_platform/engine/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/engine/__init__.py rename to local_data_platform/engine/__init__.py diff --git a/local-data-platform/local_data_platform/etl.py b/local_data_platform/etl.py similarity index 100% rename from local-data-platform/local_data_platform/etl.py rename to local_data_platform/etl.py diff --git a/local-data-platform/local_data_platform/exceptions.py b/local_data_platform/exceptions.py similarity index 100% rename from local-data-platform/local_data_platform/exceptions.py rename to local_data_platform/exceptions.py diff --git a/local-data-platform/local_data_platform/format/__init__.py b/local_data_platform/format/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/format/__init__.py rename to local_data_platform/format/__init__.py diff --git a/local-data-platform/local_data_platform/format/csv/__init__.py b/local_data_platform/format/csv/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/format/csv/__init__.py rename to local_data_platform/format/csv/__init__.py diff --git a/local-data-platform/local_data_platform/format/iceberg/__init__.py b/local_data_platform/format/iceberg/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/format/iceberg/__init__.py rename to local_data_platform/format/iceberg/__init__.py diff --git a/local-data-platform/local_data_platform/format/parquet/__init__.py b/local_data_platform/format/parquet/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/format/parquet/__init__.py rename to local_data_platform/format/parquet/__init__.py diff --git a/local-data-platform/local_data_platform/hello_world.py 
b/local_data_platform/hello_world.py similarity index 100% rename from local-data-platform/local_data_platform/hello_world.py rename to local_data_platform/hello_world.py diff --git a/local-data-platform/local_data_platform/issue/__init__.py b/local_data_platform/issue/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/issue/__init__.py rename to local_data_platform/issue/__init__.py diff --git a/local-data-platform/local_data_platform/logger.py b/local_data_platform/logger.py similarity index 100% rename from local-data-platform/local_data_platform/logger.py rename to local_data_platform/logger.py diff --git a/local-data-platform/local_data_platform/pipeline/__init__.py b/local_data_platform/pipeline/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/pipeline/__init__.py rename to local_data_platform/pipeline/__init__.py diff --git a/local-data-platform/local_data_platform/pipeline/egression/__init__.py b/local_data_platform/pipeline/egression/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/pipeline/egression/__init__.py rename to local_data_platform/pipeline/egression/__init__.py diff --git a/local-data-platform/local_data_platform/pipeline/egression/csv_to_iceberg/__init__.py b/local_data_platform/pipeline/egression/csv_to_iceberg/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/pipeline/egression/csv_to_iceberg/__init__.py rename to local_data_platform/pipeline/egression/csv_to_iceberg/__init__.py diff --git a/local-data-platform/local_data_platform/pipeline/egression/iceberg_to_csv/__init__.py b/local_data_platform/pipeline/egression/iceberg_to_csv/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/pipeline/egression/iceberg_to_csv/__init__.py rename to local_data_platform/pipeline/egression/iceberg_to_csv/__init__.py diff --git a/local-data-platform/local_data_platform/pipeline/ingestion/__init__.py b/local_data_platform/pipeline/ingestion/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/pipeline/ingestion/__init__.py rename to local_data_platform/pipeline/ingestion/__init__.py diff --git a/local-data-platform/local_data_platform/pipeline/ingestion/bigquery_to_csv/__init__.py b/local_data_platform/pipeline/ingestion/bigquery_to_csv/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/pipeline/ingestion/bigquery_to_csv/__init__.py rename to local_data_platform/pipeline/ingestion/bigquery_to_csv/__init__.py diff --git a/local-data-platform/local_data_platform/pipeline/ingestion/csv_to_iceberg/__init__.py b/local_data_platform/pipeline/ingestion/csv_to_iceberg/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/pipeline/ingestion/csv_to_iceberg/__init__.py rename to local_data_platform/pipeline/ingestion/csv_to_iceberg/__init__.py diff --git a/local-data-platform/local_data_platform/pipeline/ingestion/parquet_to_iceberg/__init__.py b/local_data_platform/pipeline/ingestion/parquet_to_iceberg/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/pipeline/ingestion/parquet_to_iceberg/__init__.py rename to local_data_platform/pipeline/ingestion/parquet_to_iceberg/__init__.py diff --git a/local-data-platform/local_data_platform/pipeline/ingestion/pyarrow/__init__.py b/local_data_platform/pipeline/ingestion/pyarrow/__init__.py similarity index 100% rename from 
local-data-platform/local_data_platform/pipeline/ingestion/pyarrow/__init__.py rename to local_data_platform/pipeline/ingestion/pyarrow/__init__.py diff --git a/local-data-platform/local_data_platform/store/__init__.py b/local_data_platform/store/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/store/__init__.py rename to local_data_platform/store/__init__.py diff --git a/local-data-platform/local_data_platform/store/source/__init__.py b/local_data_platform/store/source/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/store/source/__init__.py rename to local_data_platform/store/source/__init__.py diff --git a/local-data-platform/local_data_platform/store/source/gcp/__init__.py b/local_data_platform/store/source/gcp/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/store/source/gcp/__init__.py rename to local_data_platform/store/source/gcp/__init__.py diff --git a/local-data-platform/local_data_platform/store/source/gcp/bigquery/__init__.py b/local_data_platform/store/source/gcp/bigquery/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/store/source/gcp/bigquery/__init__.py rename to local_data_platform/store/source/gcp/bigquery/__init__.py diff --git a/local-data-platform/local_data_platform/store/source/json/__init__.py b/local_data_platform/store/source/json/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/store/source/json/__init__.py rename to local_data_platform/store/source/json/__init__.py diff --git a/local_data_platform/store/source/near/__init__.py b/local_data_platform/store/source/near/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/local-data-platform/local_data_platform/store/source/parquet/__init__.py b/local_data_platform/store/source/parquet/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/store/source/parquet/__init__.py rename to local_data_platform/store/source/parquet/__init__.py diff --git a/local-data-platform/local_data_platform/store/target/__init__.py b/local_data_platform/store/target/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/store/target/__init__.py rename to local_data_platform/store/target/__init__.py diff --git a/local-data-platform/local_data_platform/store/target/iceberg/__init__.py b/local_data_platform/store/target/iceberg/__init__.py similarity index 100% rename from local-data-platform/local_data_platform/store/target/iceberg/__init__.py rename to local_data_platform/store/target/iceberg/__init__.py diff --git a/local-data-platform/local_data_platform/tmp/warehouse/pyiceberg_catalog.db b/local_data_platform/tmp/warehouse/pyiceberg_catalog.db similarity index 100% rename from local-data-platform/local_data_platform/tmp/warehouse/pyiceberg_catalog.db rename to local_data_platform/tmp/warehouse/pyiceberg_catalog.db diff --git a/mkdocs.yml b/mkdocs.yml index ee7bf64..03326e5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -7,12 +7,30 @@ repo_name: local-data-platform nav: - Home: index.md - - User Guide: - - Recipes: recipes.md - - API Reference: api.md - - Contributing: - - User Issues: user_issues.md - - Feature Requests: developer_feature_requests.md + - "About LDP": wiki/VISION.md + - Business Solution: + - Business Problem: wiki/business/PROPOSAL.md + - Agentic Marketing Analyser: wiki/business/MARKETING_DATA_ANALYSIS.md + - "Photo Management": 
wiki/usecases/PHOTO_MANAGEMENT.md + - "Conceptual Workflow": + - "Photo Management": wiki/workflow/PHOTO_MANAGEMENT.md + - Features: + - Library: + - "Base": wiki/library/BASE.md + - "Exceptions": wiki/library/EXCEPTION.md + - "Test": wiki/library/TEST.md + - Development: + - "Github": wiki/GITHUB.md + + - Contribute Now: + - "How To Contribute": wiki/plan/ISSUES.md + - Project Wiki: + - Overview: wiki/PROJECT_OVERVIEW.md + - Problem Statement: wiki/PROBLEM_STATEMENT.md + - Technical Specs: wiki/TECHNICAL_SPECIFICATIONS.md + - Project Structure: wiki/PROJECT_STRUCTURE.md + - Development Process: wiki/DEVELOPMENT.md + theme: name: material @@ -56,6 +74,7 @@ markdown_extensions: plugins: - search - autorefs + - tags - mkdocstrings: handlers: python: diff --git a/poetry.lock b/poetry.lock index c2cb796..0abeb9d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,6 +25,28 @@ files = [ [package.extras] dev = ["backports.zoneinfo", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata"] +[[package]] +name = "beautifulsoup4" +version = "4.13.4" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b"}, + {file = "beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195"}, +] + +[package.dependencies] +soupsieve = ">1.2" +typing-extensions = ">=4.0.0" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "cachetools" version = "5.5.2" @@ -1301,6 +1323,17 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "soupsieve" +version = "2.7" +description = "A modern CSS selector implementation for Beautiful Soup." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4"}, + {file = "soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a"}, +] + [[package]] name = "strictyaml" version = "1.7.3" @@ -1417,4 +1450,4 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "2d19db6500226f2c392eccfde08c2876784901c005764bca5867d017ee0a6f43" +content-hash = "24a03a384403a85bc073b0f2bd05da36a6e64b7b08ab3daf26bd655cf01eb6f5" diff --git a/pyproject.toml b/pyproject.toml index d4a1af8..34f3301 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,3 @@ -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" - [tool.poetry] name = "local-data-platform" version = "0.1.1" @@ -14,16 +10,22 @@ packages = [{include = "local_data_platform", from = "src"}] [tool.poetry.dependencies] python = "^3.12" pyiceberg = ">=0.5.0" +requests = "*" +beautifulsoup4 = "*" + +[tool.poetry.group.dev.dependencies] +pytest = ">=7.0.0" +flake8 = ">=5.0.0" -[tool.poetry.group.docs] -optional = true [tool.poetry.group.docs.dependencies] mkdocs = "^1.6.0" mkdocs-material = "9.5.21" mkdocstrings = {extras = ["python"], version = "^0.25.0"} pymdown-extensions = "^10.8.1" -[tool.poetry.group.dev] -optional = true -[tool.poetry.group.dev.dependencies] -pytest = ">=7.0.0" -flake8 = ">=5.0.0" + +[tool.poetry.scripts] +fetch-rtd-urls = "scripts.fetch_rtd_urls:main" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/append_pr_history.py b/scripts/append_pr_history.py new file mode 100644 index 0000000..904f1cd --- /dev/null +++ b/scripts/append_pr_history.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Script to append PR history and directory changes to docs/PR_HISTORY.md after a merge. +Intended for use in CI/CD or as a manual post-merge tool. 
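+
+PR metadata is read from the environment (PR_NUMBER, PR_TITLE, PR_MERGER,
+PR_DESCRIPTION); any variable that is not set falls back to a placeholder value.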
+""" +import os +import subprocess +from datetime import datetime + +PR_HISTORY_PATH = os.path.join('docs', 'PR_HISTORY.md') + +def get_env_var(name, fallback=None): + return os.environ.get(name, fallback) + +def get_last_merge_commit(): + result = subprocess.run(['git', 'log', '--merges', '-1', '--pretty=%H'], capture_output=True, text=True, check=True) + return result.stdout.strip() + +def get_changed_dirs(base, head): + result = subprocess.run(['git', 'diff', '--dirstat=files,0', base, head], capture_output=True, text=True, check=True) + dirs = {f"/{line.split()[-1].rstrip('/')}/" for line in result.stdout.splitlines() if line.strip()} + return sorted(list(dirs)) + +def append_pr_history(pr_number, pr_title, merger, date_merged, description, changed_dirs): + with open(PR_HISTORY_PATH, 'a', encoding='utf-8') as f: + f.write(f"\n### PR #{pr_number}: {pr_title}\n\n") + f.write(f"**Merged By:** {merger} on {date_merged}\n\n") + f.write(f"**Description:**\n\n```\n{description}\n```\n\n") + f.write(f"**Directory Changes:**\n\n") + for d in changed_dirs: + f.write(f"- `{d}`\n") + f.write("\n---\n") + +if __name__ == '__main__': + merge_commit = get_last_merge_commit() + # The second parent of a merge commit is the head of the merged branch + parent_commit = f'{merge_commit}^2' + changed_dirs = get_changed_dirs(parent_commit, merge_commit) + + append_pr_history( + pr_number=get_env_var('PR_NUMBER', 'N/A'), + pr_title=get_env_var('PR_TITLE', 'No title provided'), + merger=get_env_var('PR_MERGER', 'N/A'), + date_merged=datetime.now().strftime('%Y-%m-%d'), + description=get_env_var('PR_DESCRIPTION', 'No description provided'), + changed_dirs=changed_dirs + ) + print(f"PR history successfully updated in {PR_HISTORY_PATH}") \ No newline at end of file diff --git a/scripts/fetch_closed_items.py b/scripts/fetch_closed_items.py new file mode 100644 index 0000000..00efaf2 --- /dev/null +++ b/scripts/fetch_closed_items.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +""" +Script to fetch and display closed Pull Requests and Issues from a GitHub repository +and write them to a Markdown file. +""" +import os +from typing import List +from local_data_platform.github import get_items, Item + +# --- Configuration --- +REPO_OWNER = "tusharchou" +REPO_NAME = "local-data-platform" +OUTPUT_PATH = os.path.join('docs', 'closed_items.md') +# --------------------- + +def format_items_as_markdown(items: List[Item], item_type: str) -> str: + """Formats a list of GitHub items into a Markdown list.""" + if not items: + return f"No closed {item_type} found or failed to fetch." 
+ + markdown_list = [] + for item in items: + author = item.author + if item.closed_at: + closed_date = item.closed_at.strftime('%Y-%m-%d') + date_info = f" on {closed_date} " + else: + date_info = "" + markdown_list.append(f"- **[{item_type} #{item.number}]({item.url})**: {item.title} (closed{date_info}by @{author})") + return "\n".join(markdown_list) + +def main(): + """Main function to fetch closed items and write them to a Markdown file.""" + print("Fetching all closed items...") + + all_closed_items = get_items(REPO_OWNER, REPO_NAME, state="closed") + closed_prs = [item for item in all_closed_items if item.is_pr] + closed_issues = [item for item in all_closed_items if not item.is_pr] + + content = "# All Closed Items\n\n" + content += "This page lists all closed Pull Requests and Issues, sorted by most recently updated.\n\n" + content += "## Closed Pull Requests\n\n" + content += format_items_as_markdown(closed_prs, "PR") + content += "\n\n## Closed Issues\n\n" + content += format_items_as_markdown(closed_issues, "Issue") + + os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True) + with open(OUTPUT_PATH, 'w', encoding='utf-8') as f: + f.write(content) + + print(f"Successfully generated closed items report at {OUTPUT_PATH}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/fetch_rtd_urls.py b/scripts/fetch_rtd_urls.py new file mode 100644 index 0000000..de575df --- /dev/null +++ b/scripts/fetch_rtd_urls.py @@ -0,0 +1,58 @@ +import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin, urlparse + + +def fetch_all_rtd_urls(base_url): + visited = set() + to_visit = [base_url] + result = [] + + while to_visit: + url = to_visit.pop() + if url in visited: + continue + visited.add(url) + try: + resp = requests.get(url) + resp.raise_for_status() + except Exception as e: + print(f"Failed to fetch {url}: {e}") + continue + + soup = BeautifulSoup(resp.text, "html.parser") + result.append(url) + for link in soup.find_all("a", href=True): + href = link["href"] + # Only follow internal links + if href.startswith("http"): + if not href.startswith(base_url): + continue + elif href.startswith("/"): + href = urljoin(base_url, href) + else: + href = urljoin(url, href) + # Only crawl pages within the docs site + if urlparse(href).netloc == urlparse(base_url).netloc and href not in visited: + to_visit.append(href) + + return sorted(result) + + +def write_urls_to_wiki(urls, output_path): + with open(output_path, "w") as f: + f.write("# Active Hosted Docs URLs\n\n") + for url in urls: + f.write(f"- [{url}]({url})\n") + + +def main(): + # Updated to your actual Read the Docs URL and branch + BASE_URL = "https://local-data-platform.readthedocs.io/en/docs-sidebar-recipes-from-fix-readthedocs/" + urls = fetch_all_rtd_urls(BASE_URL) + write_urls_to_wiki(urls, "docs/wiki/ACTIVE_DOCS_URLS.md") + print(f"Found {len(urls)} URLs. Output written to docs/wiki/ACTIVE_DOCS_URLS.md") + + +if __name__ == "__main__": + main() diff --git a/scripts/generate_issue_list.py b/scripts/generate_issue_list.py new file mode 100644 index 0000000..86f1a60 --- /dev/null +++ b/scripts/generate_issue_list.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +""" +Script to generate the 'user_issues.md' page with a filterable list of open GitHub issues. 
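+
+Open issues are fetched through local_data_platform.github.get_items (a GITHUB_TOKEN
+environment variable is used for authentication when it is set), status and theme are
+derived from 'status:' and 'theme:' labels, and the rendered page is written to
+docs/user_issues.md.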
+""" +import os +import re +from typing import List +from local_data_platform.github import get_items, Item + +REPO_OWNER = "tusharchou" +REPO_NAME = "local-data-platform" +OUTPUT_PATH = os.path.join('docs', 'user_issues.md') + +def parse_labels(labels: List[str]): + """Parses labels to find status and theme.""" + status = "Planned" # Default status + theme = "General" # Default theme + for label_name in labels: + label_name = label_name.lower() + if label_name.startswith('status:'): + status = label_name.replace('status:', '').replace('-', ' ').title() + elif label_name.startswith('theme:'): + theme = label_name.replace('theme:', '').replace('-', ' ').title() + return status, theme + +def generate_page_content(items: List[Item]): + """Generates the full Markdown content for the user_issues.md page.""" + header = """# Open Issues + +This page lists all open issues in the repository. Use the filters below to sort by status or theme. +""" + + issue_cards = [] + for item in items: + if item.is_pr: + continue # Skip pull requests + + status, theme = parse_labels(item.labels) + # Truncate body for preview + body_preview = (item.description or 'No description provided.') + body_preview = body_preview.replace('\n', ' ').replace('\r', ' ').strip() + body_preview = re.sub(r'\s+', ' ', body_preview) + body_preview = (body_preview[:150] + '...') if len(body_preview) > 150 else body_preview + + card = f""" +--- +tags: + - Status - {status} + - Theme - {theme} +--- +### #{item.number} - {item.title} +*Status: {status} | Theme: {theme}* + +> {body_preview} +""" + issue_cards.append(card) + + if not issue_cards: + return header + "\nNo open issues found." + + return header + "\n" + "\n---\n".join(issue_cards) + + +if __name__ == "__main__": + print("Generating 'Open Issues' page...") + open_items = get_items(REPO_OWNER, REPO_NAME, state="open") + page_content = generate_page_content(open_items) + os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True) + with open(OUTPUT_PATH, 'w', encoding='utf-8') as f: + f.write(page_content) + print(f"Successfully generated page at {OUTPUT_PATH}") \ No newline at end of file diff --git a/scripts/github_api.py b/scripts/github_api.py new file mode 100644 index 0000000..e69de29 diff --git a/src/local_data_platform/__init__.py b/src/local_data_platform/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/local_data_platform/github/__init__.py b/src/local_data_platform/github/__init__.py new file mode 100644 index 0000000..a372760 --- /dev/null +++ b/src/local_data_platform/github/__init__.py @@ -0,0 +1,101 @@ +import os +import requests +from dataclasses import dataclass +from datetime import datetime +from typing import Optional, List +from local_data_platform.exceptions import GitHubAPIError + +@dataclass +class Item: + """Represents a GitHub Issue or Pull Request.""" + number: int + title: str + author: str + description: Optional[str] + created_at: datetime + closed_at: Optional[datetime] + url: str + is_pr: bool + labels: List[str] + +def _fetch_paginated_data(api_url: str, params: dict, headers: dict) -> List[dict]: + """Handles pagination for GitHub API requests.""" + all_items = [] + page_num = 1 + while api_url: + print(f"Fetching page {page_num} from {api_url}...") + try: + # For subsequent pages, params are already in the URL, so we pass None + current_params = params if page_num == 1 else None + response = requests.get(api_url, headers=headers, params=current_params) + response.raise_for_status() + + fetched_items = response.json() + if 
not fetched_items: + break + + all_items.extend(fetched_items) + + if 'next' in response.links: + api_url = response.links['next']['url'] + page_num += 1 + else: + api_url = None + except requests.exceptions.RequestException as e: + raise GitHubAPIError(f"Error fetching data from GitHub: {e}") from e + return all_items + +def get_items(repo_owner: str, repo_name: str, state: str = "all") -> List[Item]: + """ + Fetches Issues and Pull Requests from a GitHub repository. + + Args: + repo_owner: The owner of the repository. + repo_name: The name of the repository. + state: The state of the items to fetch ('open', 'closed', 'all'). + + Returns: + A list of Item objects. + """ + token = os.environ.get("GITHUB_TOKEN") + headers = {"Accept": "application/vnd.github.v3+json"} + if token: + headers["Authorization"] = f"Bearer {token}" + else: + print(f"Warning: GITHUB_TOKEN not set. Making unauthenticated requests to fetch items in '{state}' state.") + + api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/issues" + params = {"state": state, "per_page": 100, "sort": "updated", "direction": "desc"} + + try: + raw_items = _fetch_paginated_data(api_url, params, headers) + except GitHubAPIError as e: + print(e) + return [] + + items = [] + for raw_item in raw_items: + # Safely parse datetime strings + created_at = datetime.fromisoformat( + raw_item['created_at'].replace('Z', '+00:00') + ) + closed_at = None + if raw_item.get('closed_at'): + closed_at = datetime.fromisoformat( + raw_item['closed_at'].replace('Z', '+00:00') + ) + + item = Item( + number=raw_item['number'], + title=raw_item['title'], + author=raw_item['user']['login'], + description=raw_item.get('body'), + created_at=created_at, + closed_at=closed_at, + url=raw_item['html_url'], + is_pr='pull_request' in raw_item, + labels=[label['name'] for label in raw_item.get('labels', [])] + ) + items.append(item) + + return items \ No newline at end of file diff --git a/src/tmp/warehouse/pyiceberg_catalog.db b/src/tmp/warehouse/pyiceberg_catalog.db new file mode 100644 index 0000000000000000000000000000000000000000..388fec89b40f136d4e0346716e24fc9b27e2fbda GIT binary patch literal 20480 zcmeI&&u^1p7zc3Zeso*5)r(n{c;K>RP5QP_paVT8Sd2uO`ikl(tuv8#JbV(7%fGwp!nPR;_QBbv-Zb9~?>hZ}#>I(#rBMhv^|dlIvI} ztaFQ~bLzL*Fiw$WdGd;{>d#+R>&McI+Of1fo@7B9CR32cFPdM;znfp&IH`$(*L2ur zbmsGcqD9+r&}&<>C@f{x-i2gW%x>iK;#GB`G`+^K82wfzmA}95_TatvQR#p-+H>y~xD~QBEn;VsW|U3ui7iN@4ql!|nZ=ER{O`)s zJ0uzvq~D(7->L5SURdL6W-SudulXAm2tWV=5P$##AOHafKmY;|xJ!ZLFteoVJIQ9U z+3F0&$%8SuW2f~sI}NMDd_TCbVtVOWq35v3^?dfQR4SI@(^lz%hU^0$ak5g{@kPfj zrJkcS7@(H;l!06(6{n%gMwt=j>ITtFw?a%^)re^@jqNmaW*cg8a7^)I6g2NX3x%kT zt~Okk5=YnVZWd)iDJ>IIp^9QqO_5z~GK-`la?`COn`G1OKdr9^*&WA6EdIlr7(aAU zP$8yjP}?>sQ4C!s8dDWwYt$v0!fe;&w<1$InfskxTqac33|nIg(aHvgD;t#1ib9EQ zs1;MOnW-s1m2ZVV*_m7-