Skip to content

Commit 6b46913

Browse files
Airflow ETL Pipeline
0 parents  commit 6b46913

27 files changed

+2326
-0
lines changed

.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
.backup_files
2+
.ipynb_checkpoints
3+
.venv
4+
logs

Extract Transform Load.ipynb

+158
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Importing Dependencies"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"from datetime import date, datetime\n",
17+
"import requests\n",
18+
"import pandas as pd\n",
19+
"import os\n",
20+
"import json\n",
21+
"import re\n",
22+
"import requests\n",
23+
"import sqlalchemy\n",
24+
"from sqlalchemy.orm import sessionmaker\n",
25+
"import sqlite3"
26+
]
27+
},
28+
{
29+
"cell_type": "markdown",
30+
"metadata": {},
31+
"source": [
32+
"# Extract"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": 2,
38+
"metadata": {},
39+
"outputs": [],
40+
"source": [
41+
"# Extracting JSON data from public API of New York City website\n",
42+
"def _extract():\n",
43+
" url = \"https://data.cityofnewyork.us/resource/rc75-m7u3.json\"\n",
44+
" result_load = requests.get(url)\n",
45+
" df = pd.DataFrame(json.loads(result_load.content))\n",
46+
" df.to_csv(\"covid_db_original_{}.csv\".format(date.today().strftime(\"%Y%m%d\")))\n",
47+
"_extract()"
48+
]
49+
},
50+
{
51+
"cell_type": "markdown",
52+
"metadata": {},
53+
"source": [
54+
"# Transform"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 3,
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"# df = pd.DataFrame(json.loads(result_load.content))\n",
64+
"def _transform():\n",
65+
" df1 = pd.read_csv(\"covid_db_original_{}.csv\".format(date.today().strftime(\"%Y%m%d\")))\n",
66+
" df1['date'] = df1['date_of_interest'].str.extract('(....-..-..)', expand=True)\n",
67+
" df1.drop(df1.columns.difference(['date','case_count','hospitalized_count','death_count']), axis=1, inplace=True)\n",
68+
" df1 = df1.set_index(\"date\")\n",
69+
" df1.to_csv(\"covid_db_transformed_{}.csv\".format(date.today().strftime(\"%Y%m%d\")))\n",
70+
"_transform()"
71+
]
72+
},
73+
{
74+
"cell_type": "markdown",
75+
"metadata": {},
76+
"source": [
77+
"# Load"
78+
]
79+
},
80+
{
81+
"cell_type": "code",
82+
"execution_count": 6,
83+
"metadata": {},
84+
"outputs": [
85+
{
86+
"name": "stdout",
87+
"output_type": "stream",
88+
"text": [
89+
"Opened database successfully\n",
90+
"Data already exists in the database\n",
91+
"Close database successfully\n"
92+
]
93+
}
94+
],
95+
"source": [
96+
"def _load():\n",
97+
" \n",
98+
" df2 = pd.read_csv(\"covid_db_transformed_{}.csv\".format(date.today().strftime(\"%Y%m%d\")))\n",
99+
" \n",
100+
" DATABASE_LOCATION = \"sqlite:///covid_db_cleaned.sqlite\"\n",
101+
"\n",
102+
" engine = sqlalchemy.create_engine(DATABASE_LOCATION)\n",
103+
" conn = sqlite3.connect('covid_db_cleaned.sqlite')\n",
104+
" cursor = conn.cursor()\n",
105+
"\n",
106+
" sql_query = \"\"\"\n",
107+
" CREATE TABLE IF NOT EXISTS covid_db_cleaned (\n",
108+
" date DATE,\n",
109+
" case_count INT,\n",
110+
" hospitalized_count INT,\n",
111+
" death_count INT,\n",
112+
" PRIMARY KEY (date)\n",
113+
" )\n",
114+
" \"\"\"\n",
115+
"\n",
116+
" cursor.execute(sql_query)\n",
117+
" print(\"Opened database successfully\")\n",
118+
"\n",
119+
" try:\n",
120+
" df2.to_sql(\"covid_db_cleaned\", engine, index=False, if_exists='append',con=conn)\n",
121+
" except:\n",
122+
" print(\"Data already exists in the database\")\n",
123+
"\n",
124+
" conn.close()\n",
125+
" print(\"Close database successfully\")\n",
126+
"_load()"
127+
]
128+
},
129+
{
130+
"cell_type": "code",
131+
"execution_count": null,
132+
"metadata": {},
133+
"outputs": [],
134+
"source": []
135+
}
136+
],
137+
"metadata": {
138+
"kernelspec": {
139+
"display_name": "Python 3",
140+
"language": "python",
141+
"name": "python3"
142+
},
143+
"language_info": {
144+
"codemirror_mode": {
145+
"name": "ipython",
146+
"version": 3
147+
},
148+
"file_extension": ".py",
149+
"mimetype": "text/x-python",
150+
"name": "python",
151+
"nbconvert_exporter": "python",
152+
"pygments_lexer": "ipython3",
153+
"version": "3.8.5"
154+
}
155+
},
156+
"nbformat": 4,
157+
"nbformat_minor": 4
158+
}

covid_db_cleaned.sqlite

52 KB
Binary file not shown.

dags/__init__.py

Whitespace-only changes.
295 Bytes
Binary file not shown.
289 Bytes
Binary file not shown.
530 Bytes
Binary file not shown.

dags/__pycache__/dag_1.cpython-37.pyc

538 Bytes
Binary file not shown.

dags/__pycache__/dag_2.cpython-37.pyc

1.22 KB
Binary file not shown.

dags/__pycache__/dag_3.cpython-37.pyc

1.29 KB
Binary file not shown.
730 Bytes
Binary file not shown.
3.55 KB
Binary file not shown.
1.55 KB
Binary file not shown.
287 Bytes
Binary file not shown.
325 Bytes
Binary file not shown.
1.35 KB
Binary file not shown.

dags/__pycache__/load.cpython-37.pyc

1.06 KB
Binary file not shown.
993 Bytes
Binary file not shown.
2.89 KB
Binary file not shown.
517 Bytes
Binary file not shown.
593 Bytes
Binary file not shown.

dags/dag_nyc_covid.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from airflow import DAG
# NOTE(review): airflow.operators.python_operator is the deprecated path; on
# Airflow 2.x this should become `from airflow.operators.python import
# PythonOperator` — confirm the installed Airflow version before switching.
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta

from etl_pipe import _extract, _transform, _load

default_args = {
    "owner": "airflow",
    # A *static* start_date: the original used `datetime.today() - timedelta(days=1)`,
    # which is re-evaluated on every scheduler parse and is a documented Airflow
    # anti-pattern (the scheduler never sees a stable schedule anchor).
    "start_date": datetime(2021, 1, 1),
}

# Daily NYC-COVID ETL: extract the raw API snapshot, reduce it to the columns
# of interest, then load it into the local SQLite database.
with DAG(
    "dag_nyc_covid",
    default_args=default_args,
    schedule_interval="0 1 * * *",  # every day at 01:00
    catchup=False,  # with a static start_date, don't backfill historical runs
) as dag:

    # Operators created inside the `with DAG(...)` block are attached to the
    # DAG automatically; the explicit dag=dag of the original was redundant.
    extract_data = PythonOperator(
        task_id="extract_data",
        python_callable=_extract,
    )
    transform_data = PythonOperator(
        task_id="transform_data",
        python_callable=_transform,
    )
    load_data = PythonOperator(
        task_id="load_data",
        python_callable=_load,
    )

    extract_data >> transform_data >> load_data

dags/etl_pipe.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from datetime import date, datetime
2+
import requests
3+
import pandas as pd
4+
import json
5+
import requests
6+
from sqlalchemy import create_engine
7+
8+
# Extract: Extracting JSON data from public API of New York City website and saving to a .csv file
9+
10+
def _extract():
    """Download the NYC COVID daily-counts dataset and snapshot it to CSV.

    Fetches the city's open-data JSON endpoint and writes
    data/covid_db_original_<YYYYMMDD>.csv (stamped with today's date).

    Raises:
        requests.HTTPError: if the API responds with a 4xx/5xx status,
            instead of silently saving an error page as the snapshot.
    """
    import os  # local import: os is not imported at this module's top level

    url = "https://data.cityofnewyork.us/resource/rc75-m7u3.json"
    response = requests.get(url)
    response.raise_for_status()  # fail loudly rather than persist garbage
    df = pd.DataFrame(response.json())

    os.makedirs("data", exist_ok=True)  # first writer: ensure data/ exists
    stamp = date.today().strftime("%Y%m%d")
    # index=False: the RangeIndex carries no information and the transform
    # step drops every column outside its keep-list anyway.
    df.to_csv("data/covid_db_original_{}.csv".format(stamp), index=False)


if __name__ == "__main__":
    # Guarded: the original called _extract() at module level, which re-ran
    # the full download every time the Airflow scheduler imported this file.
    _extract()
16+
17+
# Transform: Transforming the data using pandas
18+
19+
def _transform():
20+
df1 = pd.read_csv("data/covid_db_original_{}.csv".format(date.today().strftime("%Y%m%d")))
21+
df1['date'] = df1['date_of_interest'].str.extract('(....-..-..)', expand=True)
22+
df1.drop(df1.columns.difference(['date','case_count','hospitalized_count','death_count']), axis=1, inplace=True)
23+
df1 = df1.set_index("date")
24+
df1.to_csv("data/covid_db_transformed_{}.csv".format(date.today().strftime("%Y%m%d")))
25+
_transform()
26+
27+
# Load: Ingesting the transformed data into an SQLite Database
28+
29+
def _load():
30+
31+
df2 = pd.read_csv("data/covid_db_transformed_{}.csv".format(date.today().strftime("%Y%m%d")))
32+
DATABASE_LOCATION = "sqlite:///covid_db_cleaned.sqlite"
33+
engine = create_engine(DATABASE_LOCATION, echo=True)
34+
sqlite_connection = engine.connect()
35+
sqlite_table = "covid_data"
36+
df2.to_sql(sqlite_table, sqlite_connection, if_exists='append')
37+
sqlite_connection.close()
38+
39+
_load()

0 commit comments

Comments
 (0)