diff --git a/main.py b/main.py
new file mode 100644
index 0000000..1eb35f8
--- /dev/null
+++ b/main.py
@@ -0,0 +1,151 @@
+import streamlit as st
+import os
+import sys
+from pathlib import Path
+
+# Set page configuration first
+st.set_page_config(
+ page_title="Analytics Dashboard",
+ page_icon="📊",
+ layout="wide"
+)
+
+# Add src directory and subdirectories to sys.path
+project_root = Path(__file__).parent
+src_path = project_root / 'src'
+sys.path.insert(0, str(src_path))
+
+# Add each application's directory to sys.path
+app_dirs = ['DescriptiveAnalysis', 'PredictiveAnalytics1', 'preprocessing', 'PrescriptiveAnalysis1']
+for app_dir in app_dirs:
+ app_path = src_path / app_dir
+ if app_path.exists():
+ sys.path.insert(0, str(app_path))
+ else:
+ print(f"Directory {app_dir} not found in src/. Please check the directory structure.")
+
+# Add PredictiveAnalytics1/Frontend and PrescriptiveAnalysis1 subdirectories to sys.path
+predictive_frontend_path = src_path / 'PredictiveAnalytics1' / 'Frontend'
+if predictive_frontend_path.exists():
+ sys.path.insert(0, str(predictive_frontend_path))
+else:
+ print(f"Frontend directory not found at {predictive_frontend_path}.")
+
+prescriptive_frontend_path = src_path / 'PrescriptiveAnalysis1' / 'Frontend'
+if prescriptive_frontend_path.exists():
+ sys.path.insert(0, str(prescriptive_frontend_path))
+else:
+ print(f"Frontend directory not found at {prescriptive_frontend_path}.")
+
+prescriptive_backend_path = src_path / 'PrescriptiveAnalysis1' / 'Backend'
+if prescriptive_backend_path.exists():
+ sys.path.insert(0, str(prescriptive_backend_path))
+else:
+ print(f"Backend directory not found at {prescriptive_backend_path}.")
+
+# Import main functions from each application
+try:
+ from DescriptiveAnalysis.frontend import main as descriptive_main
+except ModuleNotFoundError as e:
+ print(f"Error importing DescriptiveAnalysis: {e}")
+ descriptive_main = None
+
+try:
+ from PredictiveAnalytics1.Frontend.app import main as predictive_main
+except ModuleNotFoundError as e:
+ print(f"Error importing PredictiveAnalytics1: {e}")
+ predictive_main = None
+
+try:
+ from preprocessing.app import main as preprocessing_main
+except ModuleNotFoundError as e:
+ print(f"Error importing preprocessing: {e}")
+ preprocessing_main = None
+
+try:
+ from PrescriptiveAnalysis1.Frontend.main import main as prescriptive_main
+except ModuleNotFoundError as e:
+ print(f"Error importing PrescriptiveAnalysis1: {e}")
+ prescriptive_main = None
+
+# Custom CSS for styling
+st.markdown("""
+
+""", unsafe_allow_html=True)
+
+def main():
+ """
+ Main function to run the Streamlit dashboard application.
+ Provides a sidebar to navigate between different analytics applications.
+ """
+ st.title("Analytics Dashboard")
+
+ # Display import errors, if any
+ if descriptive_main is None:
+ st.error("Cannot import DescriptiveAnalysis. Please check the directory structure and files.")
+ if predictive_main is None:
+ st.error("Cannot import PredictiveAnalytics1. Please check the directory structure and files.")
+ if preprocessing_main is None:
+ st.error("Cannot import preprocessing. Please check the directory structure and files.")
+ if prescriptive_main is None:
+ st.error("Cannot import PrescriptiveAnalysis1. Please check the directory structure and files.")
+
+ # Sidebar for application selection
+ st.sidebar.title("Navigation")
+ app_options = [
+ "Descriptive Analysis",
+ "Predictive Analytics",
+ "Preprocessing",
+ "Prescriptive Analysis"
+ ]
+ selected_app = st.sidebar.selectbox("Select Application", app_options)
+
+ # Map selected app to the corresponding main function
+ app_functions = {
+ "Descriptive Analysis": descriptive_main,
+ "Predictive Analytics": predictive_main,
+ "Preprocessing": preprocessing_main,
+ "Prescriptive Analysis": prescriptive_main
+ }
+
+ # Run the selected application's main function
+ with st.container():
+ st.markdown('<div class="app-container">', unsafe_allow_html=True)
+ selected_function = app_functions[selected_app]
+ if selected_function is None:
+ st.error(f"Cannot run {selected_app}. The module could not be imported. Please check the error messages above.")
+ else:
+ selected_function()
+ st.markdown('</div>', unsafe_allow_html=True)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/src/DescriptiveAnalysis/__init__.py b/src/DescriptiveAnalysis/__init__.py
new file mode 100644
index 0000000..8d1c8b6
--- /dev/null
+++ b/src/DescriptiveAnalysis/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/DescriptiveAnalysis/frontend.py b/src/DescriptiveAnalysis/frontend.py
index 4b9e281..7a0a88a 100644
--- a/src/DescriptiveAnalysis/frontend.py
+++ b/src/DescriptiveAnalysis/frontend.py
@@ -5,42 +5,7 @@
import streamlit.components.v1 as components
import pandas as pd
-from backend import generate_data_cubes, process_download
-
-# Set page configuration and custom CSS styling
-st.set_page_config(layout="wide")
-
-st.markdown("""
-
-""", unsafe_allow_html=True)
+from .backend import generate_data_cubes, process_download
def load_data():
"""
@@ -347,6 +312,39 @@ def main():
2. Drill-Down and Roll-Up
3. Batch Processing
"""
+ # Apply custom CSS styling
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
st.title("Data Warehouse & Cube Generator")
# Load data from CSV file
@@ -387,4 +385,4 @@ def main():
batch_processing()
if __name__ == "__main__":
- main()
+ main()
\ No newline at end of file
diff --git a/src/PredictiveAnalytics1/Backend/__init__.py b/src/PredictiveAnalytics1/Backend/__init__.py
new file mode 100644
index 0000000..8d1c8b6
--- /dev/null
+++ b/src/PredictiveAnalytics1/Backend/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/PredictiveAnalytics1/Frontend/__init__.py b/src/PredictiveAnalytics1/Frontend/__init__.py
new file mode 100644
index 0000000..8d1c8b6
--- /dev/null
+++ b/src/PredictiveAnalytics1/Frontend/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/PredictiveAnalytics1/Frontend/app.py b/src/PredictiveAnalytics1/Frontend/app.py
index 0c1dd48..01cd523 100644
--- a/src/PredictiveAnalytics1/Frontend/app.py
+++ b/src/PredictiveAnalytics1/Frontend/app.py
@@ -4,20 +4,13 @@
import pandas as pd
import numpy as np
import os
-import sys
from pathlib import Path
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from Backend.backend import (
- load_data, preprocess_data, apply_pca, determine_problem_type, train_model)
+from ..Backend.backend import load_data, preprocess_data, apply_pca, determine_problem_type, train_model
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import DBSCAN, SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
-
-st.set_page_config(page_title="Model Comparison Tool", layout="wide")
-
-
@st.cache_data
def cached_load_data(file_path):
return load_data(file_path)
@@ -127,7 +120,11 @@ def main():
else:
current_file = Path(__file__).resolve()
dataset_dir = current_file.parents[3] / 'Datasets' / 'predictive-analytics-1'
- dataset_files = [f for f in os.listdir(dataset_dir) if f.endswith(".csv")]
+ if not dataset_dir.exists():
+ st.warning("Dataset directory not found. Please upload a CSV file.")
+ dataset_files = []
+ else:
+ dataset_files = [f for f in os.listdir(dataset_dir) if f.endswith(".csv")]
if dataset_files:
selected_dataset = st.selectbox("Select a dataset", dataset_files)
@@ -267,13 +264,11 @@ def main():
st.dataframe(comparison_df.set_index('Model').T)
-
st.subheader("Performance Visualization")
chart_data = comparison_df.melt(id_vars=['Model'], var_name='Metric', value_name='Score')
chart_data['Score'] = pd.to_numeric(chart_data['Score'], errors='coerce')
chart_data = chart_data.dropna(subset=['Score'])
-
if not chart_data.empty:
st.bar_chart(chart_data, x='Metric', y='Score', color='Model', stack=False)
else:
diff --git a/src/PredictiveAnalytics1/__init__.py b/src/PredictiveAnalytics1/__init__.py
new file mode 100644
index 0000000..8d1c8b6
--- /dev/null
+++ b/src/PredictiveAnalytics1/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/PrescriptiveAnalysis1/Backend/__init__.py b/src/PrescriptiveAnalysis1/Backend/__init__.py
index e69de29..8d1c8b6 100644
--- a/src/PrescriptiveAnalysis1/Backend/__init__.py
+++ b/src/PrescriptiveAnalysis1/Backend/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/PrescriptiveAnalysis1/Frontend/__init__.py b/src/PrescriptiveAnalysis1/Frontend/__init__.py
index e69de29..8d1c8b6 100644
--- a/src/PrescriptiveAnalysis1/Frontend/__init__.py
+++ b/src/PrescriptiveAnalysis1/Frontend/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/PrescriptiveAnalysis1/Frontend/main.py b/src/PrescriptiveAnalysis1/Frontend/main.py
index 3e601d7..ef5a9e3 100644
--- a/src/PrescriptiveAnalysis1/Frontend/main.py
+++ b/src/PrescriptiveAnalysis1/Frontend/main.py
@@ -1,19 +1,19 @@
import streamlit as st
-import sys
-import os
import pandas as pd
+import os
import time
from collections import defaultdict
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
-from Backend.gspan import run_gspan_analysis, construct_dfs_code, load_graphs_from_json
-from Backend.apriori_graph import parse_graph_file, apriori_graph_mining
-from Backend.gsp import preprocess_sequences_ordered, gsp_algorithm
-from Backend.apriori import run_apriori_analysis
-from Backend.fp_growth import run_fp_growth_analysis
+
+# Relative imports for Backend modules
+from ..Backend.gspan import run_gspan_analysis, construct_dfs_code, load_graphs_from_json
+from ..Backend.apriori_graph import parse_graph_file, apriori_graph_mining
+from ..Backend.gsp import preprocess_sequences_ordered, gsp_algorithm
+from ..Backend.apriori import run_apriori_analysis
+from ..Backend.fp_growth import run_fp_growth_analysis
def apriori_graph_mining_app():
st.title("Apriori-Based Graph Mining")
- uploaded_file = st.file_uploader("Upload your graph dataset file ", type=['txt'], key="apriori_file")
+ uploaded_file = st.file_uploader("Upload your graph dataset file", type=['txt'], key="apriori_file")
if uploaded_file is not None:
graphs = parse_graph_file(uploaded_file)
st.write(f"Number of graphs loaded: {len(graphs)}")
diff --git a/src/PrescriptiveAnalysis1/__init__.py b/src/PrescriptiveAnalysis1/__init__.py
new file mode 100644
index 0000000..8d1c8b6
--- /dev/null
+++ b/src/PrescriptiveAnalysis1/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/preprocessing/__init__.py b/src/preprocessing/__init__.py
new file mode 100644
index 0000000..8d1c8b6
--- /dev/null
+++ b/src/preprocessing/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/preprocessing/app.py b/src/preprocessing/app.py
index 219a5ee..b4b4b5f 100644
--- a/src/preprocessing/app.py
+++ b/src/preprocessing/app.py
@@ -9,10 +9,6 @@
from scipy.ndimage import gaussian_filter1d
from statsmodels.nonparametric.smoothers_lowess import lowess
-
-# Page config and custom UI styling
-st.set_page_config(page_title="Data Preprocessor", page_icon="📊", layout="wide")
-
st.markdown("""
""", unsafe_allow_html=True)
-
def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
"""
Handles missing values using multiple imputation strategies.
@@ -54,7 +49,7 @@ def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
])
if imputation_method == "Drop Rows with Missing Values":
- data.dropna(subset=columns_with_na, inplace=True)
+ data = data.dropna(subset=columns_with_na)
st.write("Dropped rows with missing values.")
else:
if numeric_columns:
@@ -62,35 +57,28 @@ def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
strategy = st.selectbox("Select strategy", ["Mean", "Median", "Mode"])
if strategy == "Mode":
- # Determine threshold based on dataset size
total_rows = len(data)
if total_rows < 100:
- threshold = 0.05 # 5% for small datasets (< 100 rows)
+ threshold = 0.05
elif total_rows < 1000:
- threshold = 0.10 # 10% for medium datasets (100–999 rows)
+ threshold = 0.10
else:
threshold = 0.15
- # Check if data has few unique values or strong central tendency
for column in numeric_columns:
unique_values = data[column].nunique()
total_values = len(data[column].dropna())
unique_ratio = unique_values / total_values if total_values > 0 else 0
- # If less than 10% unique values, assume discrete with central tendency
if unique_ratio < threshold:
- # Can use mode imputation
- strategy = "Mode"
imputed_value = data[column].mode()[0]
data[column] = data[column].fillna(imputed_value)
st.write(f"Filled missing values in {column} with mode (discrete data): {imputed_value:.2f}")
-
else:
- # Offer mean or median for continuous data
strategy = st.radio(
- f"Select strategy for {column} (continuous data)",
- ["Median", "Mean"],
- horizontal=True
+ f"Select strategy for {column} (continuous data)",
+ ["Median", "Mean"],
+ horizontal=True
)
imputed_value = (
data[column].mean() if strategy == "Mean"
@@ -98,10 +86,8 @@ def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
)
data[column] = data[column].fillna(imputed_value)
st.write(f"Filled missing values in {column} with {strategy.lower()}: {imputed_value:.2f}")
-
else:
for column in numeric_columns:
- # Compute value based on strategy
imputed_value = (
data[column].mean() if strategy == "Mean"
else data[column].median()
@@ -109,18 +95,15 @@ def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
data[column] = data[column].fillna(imputed_value)
st.write(f"Filled missing values in {column} with {strategy.lower()}: {imputed_value:.2f}")
else:
- # Use regression or tree-based imputation
imputer = IterativeImputer(
estimator=DecisionTreeRegressor() if imputation_method == "Decision Tree Imputation" else None,
random_state=42
)
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])
st.write(f"{imputation_method} applied on numeric columns.")
-
else:
- st.warning(f"No missing value is found in numeric columns.")
+ st.warning("No missing values found in numeric columns.")
- # Fill categorical columns with mode
categorical_columns = [col for col in columns_with_na if col not in numeric_columns]
for column in categorical_columns:
mode_value = data[column].mode()[0] if not data[column].mode().empty else ""
@@ -131,7 +114,6 @@ def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
st.dataframe(data.head())
return data
-
def smooth_data(data: pd.DataFrame) -> pd.DataFrame:
"""
Applies smoothing to all numeric columns using selected technique.
@@ -148,9 +130,8 @@ def smooth_data(data: pd.DataFrame) -> pd.DataFrame:
smoothing_window = st.slider("Smoothing intensity", 3, 15, 5)
for column in numeric_columns:
- # Apply selected smoothing technique
if smoothing_method == "Moving Average":
- data[f"{column}_smoothed"] = data[column].rolling(window=smoothing_window).mean()
+ data[f"{column}_smoothed"] = data[column].rolling(window=smoothing_window, min_periods=1).mean()
elif smoothing_method == "Exponential":
data[f"{column}_smoothed"] = data[column].ewm(span=smoothing_window).mean()
elif smoothing_method == "Gaussian":
@@ -159,19 +140,15 @@ def smooth_data(data: pd.DataFrame) -> pd.DataFrame:
loess_result = lowess(data[column], np.arange(len(data)), frac=smoothing_window / len(data))
data[f"{column}_smoothed"] = loess_result[:, 1]
- # Store a backup and display results
st.session_state.smoothing_df = data.copy()
st.write("Preview after smoothing:")
st.dataframe(data.head())
- # Plot all smoothed columns
smoothed_cols = [f"{col}_smoothed" for col in numeric_columns if f"{col}_smoothed" in data.columns]
fig = px.line(data[smoothed_cols], title="Smoothed Data Visualization")
-
st.plotly_chart(fig, use_container_width=True)
return data
-
def handle_outliers(data: pd.DataFrame) -> pd.DataFrame:
"""
Detects and treats outliers in numeric columns using selected detection and treatment strategies.
@@ -190,12 +167,10 @@ def handle_outliers(data: pd.DataFrame) -> pd.DataFrame:
threshold, lower_percentile, upper_percentile = 3.0, 1.0, 99.0
iqr_multiplier = 1.5
- # Get user input based on method
if detection_method == "IQR":
iqr_multiplier = st.slider("IQR Multiplier", 1.0, 5.0, 1.5)
-
elif detection_method in ["Z-Score", "Modified Z-Score"]:
- threshold = st.slider("Threshold", 1.0, 5.0, 3.0)
+ threshold = st.slider("Threshold", 1.0, 5.0, 3.0)
else:
lower_percentile = st.slider("Lower percentile", 0.0, 10.0, 1.0)
upper_percentile = st.slider("Upper percentile", 90.0, 100.0, 99.0)
@@ -203,7 +178,6 @@ def handle_outliers(data: pd.DataFrame) -> pd.DataFrame:
total_outlier_count = 0
for column in numeric_columns:
- # Detect outliers based on selected method
if detection_method == "IQR":
q1 = data[column].quantile(0.25)
q3 = data[column].quantile(0.75)
@@ -211,17 +185,14 @@ def handle_outliers(data: pd.DataFrame) -> pd.DataFrame:
lower_bound = q1 - iqr_multiplier * iqr
upper_bound = q3 + iqr_multiplier * iqr
outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
-
elif detection_method == "Z-Score":
z_scores = (data[column] - data[column].mean()) / data[column].std()
outliers = data[np.abs(z_scores) > threshold]
-
elif detection_method == "Modified Z-Score":
median = data[column].median()
median_abs_dev = np.median(np.abs(data[column] - median))
modified_z_scores = 0.6745 * (data[column] - median) / median_abs_dev
outliers = data[np.abs(modified_z_scores) > threshold]
-
else:
lower_bound = data[column].quantile(lower_percentile / 100)
upper_bound = data[column].quantile(upper_percentile / 100)
@@ -230,14 +201,11 @@ def handle_outliers(data: pd.DataFrame) -> pd.DataFrame:
num_outliers = len(outliers)
total_outlier_count += num_outliers
- # Apply chosen treatment to outliers
if num_outliers > 0:
if treatment_strategy == "Remove":
data = data[~data.index.isin(outliers.index)]
-
elif treatment_strategy == "Cap":
data[column] = np.clip(data[column], lower_bound, upper_bound)
-
else:
median_value = data[column].median()
data.loc[outliers.index, column] = median_value
@@ -247,12 +215,10 @@ def handle_outliers(data: pd.DataFrame) -> pd.DataFrame:
st.write(f"Total outliers handled: {total_outlier_count}")
st.dataframe(data.head())
- # Show box plot for updated data
fig = px.box(data.select_dtypes(include=np.number), title="Box Plot After Outlier Treatment")
st.plotly_chart(fig, use_container_width=True)
return data
-
def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
"""
Performs variance analysis and feature selection on numeric columns.
@@ -268,7 +234,6 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
st.error("No numeric columns found for variance analysis!")
return data
- # Compute variance, standard deviation, and coefficient of variation
variance_summary = pd.DataFrame({
'Feature': numeric_columns,
'Variance': [data[column].var() for column in numeric_columns],
@@ -279,13 +244,10 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
]
}).sort_values('Variance', ascending=False).reset_index(drop=True)
- # Tabs for chart view and table view
tab_visuals, tab_table = st.tabs(["Visualization", "Data"])
with tab_visuals:
st.subheader("Feature Variance Distribution")
-
- # Variance bar chart
fig_variance = px.bar(
variance_summary,
x='Feature',
@@ -296,7 +258,6 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
)
st.plotly_chart(fig_variance, use_container_width=True)
- # Coefficient of Variation chart
cv_data = variance_summary.dropna(subset=['Coefficient of Variation'])
if not cv_data.empty:
fig_cv = px.bar(
@@ -313,8 +274,6 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
st.dataframe(variance_summary, use_container_width=True)
st.subheader("Feature Selection")
-
- # Choose selection method
selection_strategy = st.radio(
"Selection method:",
["Variance Threshold", "Top N Features", "Manual Selection"],
@@ -330,7 +289,6 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
step=0.05
)
selected_features = variance_summary[variance_summary['Variance'] >= min_variance]['Feature'].tolist()
-
elif selection_strategy == "Top N Features":
top_n = st.slider(
"Number of top features to keep:",
@@ -340,9 +298,7 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
step=1
)
selected_features = variance_summary.head(top_n)['Feature'].tolist()
-
else:
- # Manual feature picker
selected_features = st.multiselect(
"Select features to keep:",
options=numeric_columns,
@@ -354,18 +310,15 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
st.write("Selected features:")
st.write(", ".join(selected_features))
- # Optionally create new dataset with selected features
if st.checkbox("Create dataset with only selected features"):
include_categorical = st.checkbox("Include non-numeric columns", value=True)
non_numeric_columns = [col for col in data.columns if col not in numeric_columns]
-
final_columns = selected_features + non_numeric_columns if include_categorical else selected_features
selected_data = data[final_columns].copy()
st.write("Preview of dataset with selected features:")
st.dataframe(selected_data.head(), use_container_width=True)
- # Download option
csv_data = selected_data.to_csv(index=False).encode('utf-8')
st.download_button(
"Download selected features dataset",
@@ -374,18 +327,15 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
mime="text/csv"
)
- # Optionally update original dataset
if st.checkbox("Update main dataset to only include selected features"):
data = selected_data.copy()
st.info("Main dataset updated to include only selected features")
else:
st.warning("No features selected. Please adjust your selection criteria.")
- # Correlation analysis among selected features
if len(selected_features) > 1:
st.subheader("Correlation Analysis for Selected Features")
correlation_matrix = data[selected_features].corr()
-
fig_corr = px.imshow(
correlation_matrix,
title="Correlation Matrix for Selected Features",
@@ -396,12 +346,9 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
st.plotly_chart(fig_corr, use_container_width=True)
correlation_threshold = st.slider("Correlation threshold for highlighting:", 0.0, 1.0, 0.8, 0.05)
-
- # Find pairs with high correlation
high_correlation_pairs = correlation_matrix.where(
(np.abs(correlation_matrix) > correlation_threshold) & (np.abs(correlation_matrix) < 1.0)
).stack().reset_index()
-
high_correlation_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']
if not high_correlation_pairs.empty:
@@ -413,8 +360,7 @@ def analyse_variance(data: pd.DataFrame) -> pd.DataFrame:
else:
st.info(f"No feature pairs with correlation above {correlation_threshold} found.")
- return data
-
+ return data
def main():
"""
@@ -431,9 +377,8 @@ def main():
if uploaded_file:
try:
- # Read the uploaded file
raw_data = pd.read_csv(uploaded_file) if uploaded_file.name.endswith('.csv') else pd.read_excel(uploaded_file)
- st.session_state.df = raw_data
+ st.session_state.df = raw_data.copy()
st.success(f"✅ Loaded {len(raw_data)} rows × {len(raw_data.columns)} columns")
@@ -462,13 +407,11 @@ def main():
- **Duplicate Rows:** {raw_data.duplicated().sum()}
""")
- # Apply preprocessing steps sequentially
- processed_data = handle_missing_values(raw_data)
+ processed_data = handle_missing_values(raw_data.copy())
processed_data = smooth_data(processed_data)
processed_data = handle_outliers(processed_data)
processed_data = analyse_variance(processed_data)
- # Downloading feature : Preprocessed dataset
st.subheader("📥 Download Final Preprocessed Dataset")
csv_final = processed_data.to_csv(index=False).encode('utf-8')
st.download_button(
@@ -482,4 +425,4 @@ def main():
st.error(f"Error loading file: {str(error)}")
if __name__ == "__main__":
- main()
+ main()
\ No newline at end of file