Python-Data-Analysis/analysis.py at main · ericdurkee/Python-Data-Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
# Download from: https://www.kaggle.com/datasets/carrie1/ecommerce-data
# Or use: https://archive.ics.uci.edu/ml/datasets/Online+Retail
df = pd.read_csv('online_retail.csv', encoding='ISO-8859-1')

print()
print(f"Dataset loaded: {len(df)} rows")
print()

# Data Cleaning Start
print("Data Cleaning...")
print()

# Remove rows with missing CustomerID or Description
initial_rows = len(df)
df = df.dropna(subset=['CustomerID', 'Description'])
print(f"Removed {initial_rows - len(df)} rows with missing data")

# Remove cancelled orders - orders starting with C
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]
print(f"Remaining rows after removing cancellations: {len(df)}")

# Remove rows with negative or zero quantities
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
print(f"Remaining rows after removing invalid quantities/prices: {len(df)}")

# Calculate total revenue
df['TotalRevenue'] = df['Quantity'] * df['UnitPrice']

# Convert InvoiceDate to datetime for python understanding
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Extract month and year
df['Month'] = df['InvoiceDate'].dt.month
df['Year'] = df['InvoiceDate'].dt.year
df['YearMonth'] = df['InvoiceDate'].dt.to_period('M')

print("\nData cleaning complete...")
print(f"Final dataset: {len(df)} rows")

# QUESTION 1: What are the top 5 best-selling product categories by revenue?

print()
print("Question 1: Top 5 Best-Selling Products by Total Revenue")
print()

# Group by product description and calculate total revenue
product_revenue = df.groupby('Description').agg({
    'TotalRevenue': 'sum',
    'Quantity': 'sum',
    'InvoiceNo': 'count'
}).reset_index()

product_revenue.columns = ['Product', 'TotalRevenue', 'TotalQuantity', 'OrderCount']

# Sort by revenue and get top 5
top_5_products = product_revenue.sort_values('TotalRevenue', ascending=False).head(5)

print("\nTop 5 Products by Revenue:")
print(top_5_products.to_string(index=False))

print("\nAnalysis:")
print(f"1. Best-selling product: {top_5_products.iloc[0]['Product']}")
print(f"   - Revenue: £{top_5_products.iloc[0]['TotalRevenue']:,.2f}")
print(f"   - Quantity sold: {top_5_products.iloc[0]['TotalQuantity']:,.0f} units")
print(f"\n2. The top 5 products generated £{top_5_products['TotalRevenue'].sum():,.2f} in total revenue")

# QUESTION 2: What are the monthly sales trends?

print()
print("Question 2: Monthly Sales Trends")
print()

# Group by month and calculate total revenue
monthly_sales = df.groupby('YearMonth').agg({
    'TotalRevenue': 'sum',
    'InvoiceNo': 'nunique'
}).reset_index()

monthly_sales.columns = ['Month', 'TotalRevenue', 'OrderCount']

# Sort by revenue to find best/worst months
monthly_sales_sorted = monthly_sales.sort_values('TotalRevenue', ascending=False)

print("\nMonthly Sales Summary:")
print(monthly_sales.to_string(index=False))

print("\nAnalysis:")
print(f"1. Best month: {monthly_sales_sorted.iloc[0]['Month']}")
print(f"   - Revenue: £{monthly_sales_sorted.iloc[0]['TotalRevenue']:,.2f}")
print(f"   - Orders: {monthly_sales_sorted.iloc[0]['OrderCount']:,.0f}")

print(f"\n2. Worst month: {monthly_sales_sorted.iloc[-1]['Month']}")
print(f"   - Revenue: £{monthly_sales_sorted.iloc[-1]['TotalRevenue']:,.2f}")
print(f"   - Orders: {monthly_sales_sorted.iloc[-1]['OrderCount']:,.0f}")

avg_monthly_revenue = monthly_sales['TotalRevenue'].mean()
print(f"\n3. Average monthly revenue: £{avg_monthly_revenue:,.2f}")

# Visual (Stretch Challenge)

print()
print("Creating data visualization...")
print()

# Create a figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Chart 1: Top 5 Products by Revenue
ax1.barh(top_5_products['Product'], top_5_products['TotalRevenue'], color='steelblue')
ax1.set_xlabel('Total Revenue (£)', fontsize=12)
ax1.set_ylabel('Product', fontsize=12)
ax1.set_title('Top 5 Best-Selling Products by Revenue', fontsize=14, fontweight='bold')
ax1.invert_yaxis()

# Add revenue labels on bars
for i, v in enumerate(top_5_products['TotalRevenue']):
    ax1.text(v, i, f' £{v:,.0f}', va='center', fontsize=10)

# Chart 2: Monthly Sales Trend
monthly_sales['MonthStr'] = monthly_sales['Month'].astype(str)
ax2.plot(monthly_sales['MonthStr'], monthly_sales['TotalRevenue'],
         marker='o', linewidth=2, markersize=6, color='darkorange')
ax2.set_xlabel('Month', fontsize=12)
ax2.set_ylabel('Total Revenue (£)', fontsize=12)
ax2.set_title('Monthly Sales Trend', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.tick_params(axis='x', rotation=45)

# Format y-axis to show currency
ax2.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'£{x/1000:.0f}K'))

# Save figure
plt.tight_layout()
plt.savefig('sales_analysis_results.png', dpi=300, bbox_inches='tight')
print("Visualization saved as 'sales_analysis_results.png'")

plt.show()

print()
print("Visualization Completed")