
Commit 84ddfcd

Added Employees Log and added pyspark solution to Final destination
1 parent d0948bb commit 84ddfcd

File tree

5 files changed: +323, -9 lines changed


Employees Log/READme.md

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
# Employees Log
![Star Badge](https://img.shields.io/static/v1?label=%F0%9F%8C%9F&message=If%20Useful&style=style=flat&color=BC4E99)
[![View Main Folder](https://img.shields.io/badge/View-Main_Folder-971901?)](https://github.com/thecoddiwompler/SQL-Practice-Questions/tree/main)
[![View Repositories](https://img.shields.io/badge/View-My_Repositories-blue?logo=GitHub)](https://github.com/thecoddiwompler?tab=repositories)
[![View My Profile](https://img.shields.io/badge/View-My_Profile-green?logo=GitHub)](https://github.com/thecoddiwompler)

---

## 🛠️ Problem Statement

<b>Table Name : employee_log</b>
<br>

| Column Name | Type    |
| ----------- | ------- |
| emp_id      | INT     |
| log_date    | DATE    |
| flag        | CHAR(1) |

The table contains emp_id, log_date, and a flag ('Y' or 'N') indicating whether the employee logged in on the given log_date.

Write a SQL query to find the emp_id, the number of consecutive days logged in, and the start_date and end_date of each streak, for each employee.
Report only streaks of at least two consecutive login days.
<br>
<br>
<b>The query result format is shown in the following example:</b>
<br>
<br>
<details>
<summary>
Input
</summary>
<br>

<b>Table Name : employee_log</b>

| emp_id | log_date   | flag |
|--------|------------|------|
| 101    | 2024-01-02 | N    |
| 101    | 2024-01-03 | Y    |
| 101    | 2024-01-04 | N    |
| 101    | 2024-01-07 | Y    |
| 102    | 2024-01-01 | N    |
| 102    | 2024-01-02 | Y    |
| 102    | 2024-01-03 | Y    |
| 102    | 2024-01-04 | N    |
| 102    | 2024-01-05 | Y    |
| 102    | 2024-01-06 | Y    |
| 102    | 2024-01-07 | Y    |
| 103    | 2024-01-01 | N    |
| 103    | 2024-01-04 | N    |
| 103    | 2024-01-05 | Y    |
| 103    | 2024-01-06 | Y    |
| 103    | 2024-01-07 | N    |

<br/>

</details>

<details>
<summary>
Output
</summary>
<br>

| emp_id | streak_start | streak_end | streak_length |
|--------|--------------|------------|---------------|
| 102    | 2024-01-05   | 2024-01-07 | 3             |
| 102    | 2024-01-02   | 2024-01-03 | 2             |
| 103    | 2024-01-05   | 2024-01-06 | 2             |

</details>

---
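
For context, the streak detection asked for here is commonly solved with the gaps-and-islands pattern. The sketch below is an illustrative alternative, not the solution.sql added in this commit, and it assumes PostgreSQL-style date arithmetic (DATE minus INTEGER yields a DATE).

-- Illustrative gaps-and-islands sketch (assumes PostgreSQL date arithmetic);
-- the commit's own solution is in Employees Log/solution.sql below.
WITH logged_in AS (
    SELECT
        emp_id,
        log_date,
        ROW_NUMBER() OVER (PARTITION BY emp_id ORDER BY log_date) AS rn
    FROM employee_log
    WHERE flag = 'Y'
)
SELECT
    emp_id,
    MIN(log_date) AS streak_start,
    MAX(log_date) AS streak_end,
    COUNT(*)      AS streak_length
FROM logged_in
GROUP BY emp_id, log_date - rn   -- constant within a run of consecutive dates
HAVING COUNT(*) >= 2
ORDER BY emp_id, streak_start;

On the sample data this yields the three streaks shown in the Output table above.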

Employees Log/schema.sql

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
-- Create Table
CREATE TABLE employee_log (
    emp_id INT,
    log_date DATE,
    flag CHAR(1)
);

-- Insert Data
INSERT INTO
    employee_log (emp_id, log_date, flag)
VALUES
    (101, '2024-01-02', 'N'),
    (101, '2024-01-03', 'Y'),
    (101, '2024-01-04', 'N'),
    (101, '2024-01-07', 'Y'),
    (102, '2024-01-01', 'N'),
    (102, '2024-01-02', 'Y'),
    (102, '2024-01-03', 'Y'),
    (102, '2024-01-04', 'N'),
    (102, '2024-01-05', 'Y'),
    (102, '2024-01-06', 'Y'),
    (102, '2024-01-07', 'Y'),
    (103, '2024-01-01', 'N'),
    (103, '2024-01-04', 'N'),
    (103, '2024-01-05', 'Y'),
    (103, '2024-01-06', 'Y'),
    (103, '2024-01-07', 'N');

COMMIT;
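
As a quick sanity check after loading (not part of the commit), the per-employee row counts can be compared against the Input table in the README:

-- Optional verification query (not in the commit)
SELECT emp_id, COUNT(*) AS rows_loaded
FROM employee_log
GROUP BY emp_id
ORDER BY emp_id;
-- Expected from the sample data: 101 -> 4, 102 -> 7, 103 -> 5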

Employees Log/solution.sql

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
-- init: for each login record, look ahead to the next day's flag so that the
-- last 'Y' of every streak (lead_flag = 'N' or NULL) can be identified.
WITH init AS (
    SELECT
        emp_id,
        log_date,
        flag,
        LEAD(flag) OVER (
            PARTITION BY emp_id
            ORDER BY
                log_date
        ) lead_flag
    FROM
        employee_log
),
-- consecutive_days_tracker: join every streak-ending 'Y' row (a) to all of that
-- employee's earlier-or-equal 'Y' rows (b); rnk counts the b rows backwards from a.
-- While the dates remain consecutive, date_diff + 1 equals rnk.
consecutive_days_tracker AS (
    SELECT
        a.emp_id,
        a.log_date,
        b.log_date prev_log_date,
        a.log_date - b.log_date date_diff,
        ROW_NUMBER() OVER (
            PARTITION BY a.emp_id,
            a.log_date
            ORDER BY
                b.log_date DESC
        ) rnk
    FROM
        init a
        INNER JOIN init b ON a.emp_id = b.emp_id
        AND a.log_date >= b.log_date
        AND a.flag = 'Y'
        AND b.flag = 'Y'
        AND (
            a.lead_flag = 'N'
            OR a.lead_flag IS NULL
        )
),
-- solution: the streak length is the largest rnk for which the dates are still
-- consecutive (date_diff + 1 = rnk). For emp_id 102 ending 2024-01-07, rnk 1..3
-- (2024-01-07, 2024-01-06, 2024-01-05) satisfy the condition but rnk 4
-- (2024-01-03) does not, so the streak length is 3.
solution AS (
    SELECT
        emp_id,
        log_date,
        MAX(rnk) consecutive_days
    FROM
        consecutive_days_tracker
    WHERE
        date_diff + 1 = rnk
    GROUP BY
        emp_id,
        log_date
)
-- Map each streak end back to its start date and keep streaks of 2+ days.
SELECT
    a.emp_id,
    b.prev_log_date streak_start,
    a.log_date streak_end,
    a.consecutive_days streak_length
FROM
    solution a
    INNER JOIN consecutive_days_tracker b ON a.emp_id = b.emp_id
    AND a.consecutive_days = b.rnk
    AND a.log_date = b.log_date
WHERE
    a.consecutive_days > 1
Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
      "24/01/15 17:24:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
      " \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---------+---------+-----------+\n",
      "|cust_id|flight_id| origin|destination|\n",
      "+-------+---------+---------+-----------+\n",
      "| 1| Flight1| Delhi| Hyderabad|\n",
      "| 1| Flight2|Hyderabad| Kochi|\n",
      "| 1| Flight3| Kochi| Mangalore|\n",
      "| 2| Flight1| Mumbai| Ayodhya|\n",
      "| 2| Flight2| Ayodhya| Gorakhpur|\n",
      "+-------+---------+---------+-----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder.getOrCreate()\n",
    "\n",
    "flights_data = [(1,'Flight1' , 'Delhi' , 'Hyderabad'),\n",
    " (1,'Flight2' , 'Hyderabad' , 'Kochi'),\n",
    " (1,'Flight3' , 'Kochi' , 'Mangalore'),\n",
    " (2,'Flight1' , 'Mumbai' , 'Ayodhya'),\n",
    " (2,'Flight2' , 'Ayodhya' , 'Gorakhpur')\n",
    " ]\n",
    "\n",
    "_schema = \"cust_id int, flight_id string , origin string , destination string\"\n",
    "\n",
    "df_flight = spark.createDataFrame(data = flights_data , schema= _schema)\n",
    "df_flight.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import col\n",
    "\n",
    "df_final_stop = df_flight.alias(\"original_df_flight\").join(df_flight.alias(\"new_df_flight\"), \n",
    " [col(\"original_df_flight.cust_id\") == col(\"new_df_flight.cust_id\"), \\\n",
    " col(\"original_df_flight.destination\") == col(\"new_df_flight.origin\")], \"left\") \\\n",
    " .select(col(\"original_df_flight.cust_id\"),col(\"original_df_flight.destination\"),\n",
    " col(\"new_df_flight.destination\").alias(\"next_stop\")) \\\n",
    " .where(\"next_stop is NULL\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_origin = df_flight.alias(\"original_df_flight\").join(df_flight.alias(\"new_df_flight\"), \n",
    " [col(\"original_df_flight.cust_id\") == col(\"new_df_flight.cust_id\"), \\\n",
    " col(\"original_df_flight.origin\") == col(\"new_df_flight.destination\")], \"left\") \\\n",
    " .select(col(\"original_df_flight.cust_id\"),col(\"original_df_flight.origin\"),\n",
    " col(\"new_df_flight.origin\").alias(\"previous_stop\")) \\\n",
    " .where(\"previous_stop is NULL\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+-----------+\n",
      "|cust_id|origin|destination|\n",
      "+-------+------+-----------+\n",
      "| 1| Delhi| Mangalore|\n",
      "| 2|Mumbai| Gorakhpur|\n",
      "+-------+------+-----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_final = df_final_stop.alias(\"destination\") \\\n",
    " .join(df_origin.alias(\"origin\"), df_final_stop.cust_id == df_origin.cust_id, \"inner\") \\\n",
    " .select(col(\"origin.cust_id\"),col(\"origin.origin\"),col(\"destination.destination\")).show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
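
For readers who prefer the SQL form used elsewhere in this repo, a minimal sketch of the notebook's final-destination logic follows. The table name flights and its columns are assumed from the notebook's DataFrame, since this commit adds no schema file for it: a customer's final destination is the leg whose destination never appears as an origin for that customer, and the starting point is the leg whose origin never appears as a destination.

-- Hedged SQL sketch of the notebook's self-join approach; "flights" is an assumed table name.
WITH final_stop AS (
    SELECT f.cust_id, f.destination
    FROM flights f
    LEFT JOIN flights n
        ON n.cust_id = f.cust_id
       AND n.origin = f.destination
    WHERE n.origin IS NULL          -- no onward leg: last stop of the trip
),
first_stop AS (
    SELECT f.cust_id, f.origin
    FROM flights f
    LEFT JOIN flights p
        ON p.cust_id = f.cust_id
       AND p.destination = f.origin
    WHERE p.destination IS NULL     -- no inbound leg: start of the trip
)
SELECT s.cust_id, s.origin, d.destination
FROM first_stop s
INNER JOIN final_stop d ON d.cust_id = s.cust_id;

On the notebook's sample data this returns (1, Delhi, Mangalore) and (2, Mumbai, Gorakhpur), matching the df_final output above.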

iqsql.md

Lines changed: 10 additions & 9 deletions
@@ -2,12 +2,13 @@
 2. Department Top 3 Salary
 3. Employees Check-in Details
 4. Employees Hiring [Difficult]
-5. Final Destination
-6. Highest-Grossing Items
-7. Increasing Sales Revenue
-8. Last Person to Fit in the Bus
-9. Manager with at least 5 direct reportees
-10. Mismatched IDs
-11. Odd and Even Measurements
-12. Onboarded Cities
-13. Qualifying Criteria
+5. Employees Log [Extra Difficult]
+6. Final Destination
+7. Highest-Grossing Items
+8. Increasing Sales Revenue
+9. Last Person to Fit in the Bus
+10. Manager with at least 5 direct reportees
+11. Mismatched IDs
+12. Odd and Even Measurements
+13. Onboarded Cities
+14. Qualifying Criteria
