-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
84 lines (63 loc) · 2.63 KB
/
data.py
File metadata and controls
84 lines (63 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pandas as pd
# Read the four competition CSVs from the datathon subfolder.
# The files are read in the listed order and bound, in that same order,
# to customers / train / test / sample_submission.
customers, train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ing-hubs-turkiye-datathon/customers.csv',
        'ing-hubs-turkiye-datathon/referance_data.csv',
        'ing-hubs-turkiye-datathon/referance_data_test.csv',
        'ing-hubs-turkiye-datathon/sample_submission.csv',
    )
)
# Preview the first rows of every loaded table (no cleaning happens here).
raw_previews = (
    ("Customers:", customers),
    ("\nTrain (referance_data.csv):", train),
    ("\nTest (referance_data_test.csv):", test),
    ("\nSample Submission:", sample_submission),
)
for preview_title, raw_frame in raw_previews:
    print(preview_title)
    print(raw_frame.head())
# 1. MERGE DATA - attach the customer columns to each train/test row.
# A left join on 'cust_id' keeps every train/test row even when no
# matching customer record exists (those customer columns become NaN).
print("\n=== MERGING DATA ===")
train_merged = train.merge(customers, on='cust_id', how='left')
test_merged = test.merge(customers, on='cust_id', how='left')

print("After merging:")
for shape_label, merged_frame in (
    ("Train merged shape:", train_merged),
    ("Test merged shape:", test_merged),
):
    print(shape_label, merged_frame.shape)
# 2. CHECK MISSING VALUES - report the per-column null count of each
# merged frame before any cleaning is applied.
print("\n=== CHECKING MISSING VALUES ===")
for missing_title, merged_frame in (
    ("Missing values in train_merged:", train_merged),
    ("\nMissing values in test_merged:", test_merged),
):
    print(missing_title)
    print(merged_frame.isnull().sum())
# 3. CLEAN MISSING VALUES
# Fills gaps in the merged train/test frames in place. Columns from the
# lists below that are absent from a frame are skipped for that frame
# instead of raising KeyError.
print("\n=== CLEANING MISSING VALUES ===")

# Categorical columns: missing entries become the literal 'Unknown'.
categorical_cols = ['gender', 'province', 'religion', 'work_type', 'work_sector']
for col in categorical_cols:
    for frame in (train_merged, test_merged):
        # Guard each frame separately: the frames may not share columns.
        if col in frame.columns:
            frame[col] = frame[col].fillna('Unknown')

# Numerical columns: missing entries become the median of the TRAIN column.
# The train median is reused for test so that no statistic is derived
# from the test set.
numerical_cols = ['age', 'tenure']
for col in numerical_cols:
    if col not in train_merged.columns:
        # Without the train column there is no median to fill with.
        continue
    median_value = train_merged[col].median()
    train_merged[col] = train_merged[col].fillna(median_value)
    if col in test_merged.columns:
        test_merged[col] = test_merged[col].fillna(median_value)
# 4. VERIFY CLEANING - print the per-column null counts again so they
# can be compared with the counts reported before cleaning.
print("\n=== AFTER CLEANING ===")
for remaining_title, cleaned_frame in (
    ("Missing values in train_merged:", train_merged),
    ("\nMissing values in test_merged:", test_merged),
):
    print(remaining_title)
    print(cleaned_frame.isnull().sum())
# 5. BASIC DATA EXPLORATION - summarize the 'churn' target column and
# show what the cleaned training frame looks like.
print("\n=== BASIC EXPLORATION ===")
churn_column = train_merged['churn']
print("Target distribution (churn):", churn_column.value_counts(), sep="\n")
print("\nChurn rate:", churn_column.mean())

# First rows and column dtypes of the cleaned training data.
print("\nSample of cleaned train data:", train_merged.head(), sep="\n")
print("\nData types:", train_merged.dtypes, sep="\n")
# Write both cleaned frames to CSV in the working directory, without the
# index column, so that later steps can load them directly.
for cleaned_frame, output_name in (
    (train_merged, 'train_cleaned.csv'),
    (test_merged, 'test_cleaned.csv'),
):
    cleaned_frame.to_csv(output_name, index=False)
print("\nCleaned data saved as 'train_cleaned.csv' and 'test_cleaned.csv'")