Intro_to_machine_learning/run.py at master · will03216/Intro_to_machine_learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import argparse
import sys
from CW1.src.main import recreate_results, build_and_evaluate_decision_tree, cross_validate_decision_tree

def print_help() -> None:
    """Print the detailed usage text for the command line interface.

    The text is written to standard output with ``print``. It lists the
    available commands, the options each one accepts, and example
    invocations.

    The defaults shown here must mirror the ones registered on the argument
    parser in ``main``. The parser defines a single ``--data_path`` option
    shared by every command, so ``cross_validate`` has the same default
    dataset path as ``build_tree``.
    """
    help_text = """
DECISION TREE ANALYSIS - COMMAND LINE INTERFACE
===============================================

USAGE:
    python run.py [COMMAND] [OPTIONS]

COMMANDS:
    (no command)    Run complete analysis pipeline (recreate_results)
    build_tree      Build and evaluate a single decision tree
    cross_validate  Perform k-fold cross-validation
    --help, -h      Show this help message

OPTIONS FOR build_tree:
    --data_path PATH        Path to dataset (default: wifi_db/clean_dataset.txt)
    --train_split FLOAT     Training split ratio (default: 0.8)
    --random_seed INT       Random seed for reproducibility (default: -1)

OPTIONS FOR cross_validate:
    --k INT                Number of folds (default: 10)
    --data_path PATH       Path to dataset (default: wifi_db/clean_dataset.txt)
    --random_seed INT      Random seed for reproducibility (default: -1)

OPTIONS FOR recreate_results (default):
    --random_seed INT      Random seed for reproducibility (default: -1)

EXAMPLES:
    python3 run.py
    python3 run.py --random_seed 42

    python3 run.py build_tree --data_path wifi_db/noisy_dataset.txt
    python3 run.py build_tree --train_split 0.7 --random_seed 123

    python3 run.py cross_validate --k 5 --data_path wifi_db/noisy_dataset.txt

===============================================
"""
    print(help_text)

def _fail(message, suggestion=None):
    """Print an error line (and an optional suggestion) and exit with status 1."""
    print(f"❌ Error: {message}")
    if suggestion is not None:
        print(f"💡 Suggestion: {suggestion}")
    sys.exit(1)


def _require_txt_path(data_path):
    """Abort with an error unless *data_path* ends with ``.txt``."""
    if not data_path.endswith('.txt'):
        _fail("--data_path must point to a .txt file",
              "Use a valid dataset path like wifi_db/clean_dataset.txt")


def main(argv=None):
    """Parse the command line and dispatch to the selected analysis routine.

    With no positional command, ``recreate_results`` is run. ``build_tree``
    calls ``build_and_evaluate_decision_tree`` and ``cross_validate`` calls
    ``cross_validate_decision_tree``. ``--help``/``-h`` prints the custom
    help text via ``print_help`` instead of argparse's generated one.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what a
            plain ``main()`` call did before this parameter existed.

    Raises:
        SystemExit: With status 1 when an option fails validation, so that
            shells and scripts can detect the failure (the script's
            top-level handler uses the same status for other errors).
            argparse itself exits with its own status on unparsable input.
    """
    # add_help=False so that -h/--help can be routed to print_help() below.
    parser = argparse.ArgumentParser(description='Decision Tree Analysis Tool', add_help=False)

    # Add custom help flag
    parser.add_argument('--help', '-h', action='store_true', help='Show help message')

    # Command selection; omitting the command selects the default pipeline.
    parser.add_argument('command', nargs='?', choices=['build_tree', 'cross_validate'],
                        help='Command to execute')

    # Common arguments
    parser.add_argument('--random_seed', type=int, default=-1,
                        help='Random seed for reproducibility (default: -1)')

    # Dataset path, used by both build_tree and cross_validate
    parser.add_argument('--data_path', type=str, default='wifi_db/clean_dataset.txt',
                        help='Path to dataset (default: wifi_db/clean_dataset.txt)')

    # build_tree specific arguments
    parser.add_argument('--train_split', type=float, default=0.8,
                        help='Training split ratio (default: 0.8)')

    # cross_validate specific arguments
    parser.add_argument('--k', type=int, default=10,
                        help='Number of folds for cross-validation (default: 10)')

    args = parser.parse_args(argv)

    # Handle help
    if args.help:
        print_help()
        return

    # Handle different commands
    if args.command is None:
        # Default: recreate_results
        print("🔄 Running complete analysis pipeline...")
        print(f"⚙️  Random seed: {args.random_seed}")
        recreate_results(random_seed=args.random_seed)

    elif args.command == 'build_tree':
        # Validate everything before printing any progress output.
        if not (0.1 <= args.train_split <= 0.9):
            _fail("--train_split must be between 0.1 and 0.9",
                  "Use --train_split 0.8 for 80% training data")
        _require_txt_path(args.data_path)

        print("🌳 Building and evaluating decision tree...")
        print(f"📁 Data path: {args.data_path}")
        print(f"📊 Train split: {args.train_split}")
        print(f"⚙️  Random seed: {args.random_seed}")

        # The CLI option is --train_split; the callee's keyword is
        # train_test_split. Pruning is not exposed on the CLI, so it is
        # always disabled here.
        build_and_evaluate_decision_tree(
            data_path=args.data_path,
            train_test_split=args.train_split,
            prune=False,
            random_seed=args.random_seed
        )

    elif args.command == 'cross_validate':
        # Validate everything up front, before the interactive prompt and
        # before any progress output, matching the build_tree branch.
        if args.k < 2:
            _fail("--k must be at least 2 for cross-validation",
                  "Use --k 10 for 10-fold cross-validation")
        _require_txt_path(args.data_path)

        if args.k > 20:
            print("⚠️  Warning: Large k values (>20) may be computationally expensive")
            response = input("Continue? (y/n): ")
            if response.lower() != 'y':
                # The user declined; this is a normal exit, not an error.
                return

        print("📊 Performing k-fold cross-validation...")
        print(f"🔢 K-folds: {args.k}")
        print(f"📁 Data path: {args.data_path}")
        print(f"⚙️  Random seed: {args.random_seed}")

        cross_validate_decision_tree(
            k=args.k,
            data_path=args.data_path,
            random_seed=args.random_seed
        )

if __name__ == "__main__":
    # Script entry point: run the CLI and turn any failure into exit status 1.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C: report the cancellation rather than showing a traceback.
        print("\n⚠️  Operation cancelled by user")
        raise SystemExit(1)
    except Exception as exc:
        # Top-level boundary: show the error message and a usage hint.
        print(
            f"❌ Error: {exc}",
            "💡 Use 'python run.py --help' for usage information",
            sep="\n",
        )
        raise SystemExit(1)