# Tox2020Analysis/test_data_preparation.py (master branch, donniv86/Tox2020Analysis)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
"""
Test script for data preparation
"""

import sys
import os
sys.path.append('src')

from data_preparation import Tox21DataLoader

def test_data_preparation():
    """Run the Tox21 data preparation pipeline end to end as a smoke test.

    The steps run in order: build the loader, load descriptors, load
    targets, print target statistics, drop low-variance features, drop
    rows with missing values, print a data summary, prepare the
    train/validation/test split for the first target, and finally save
    the prepared data under ``results/``.

    Every step is wrapped in the same report-and-stop handling: if it
    raises, the error is printed with a ``✗`` prefix and the function
    returns immediately, because each later step depends on the state
    built by the earlier ones. A failure of the final save step is only
    reported; it does not retract the "ALL TESTS PASSED" banner, which is
    printed before saving.

    Returns:
        None. Progress and results are reported via ``print`` only.
    """

    print("=" * 60)
    print("TOX21 DATA PREPARATION TEST")
    print("=" * 60)

    # Initialize data loader (guarded like every other step so that a
    # constructor failure is reported instead of surfacing as a traceback)
    print("\n1. Initializing data loader...")
    try:
        loader = Tox21DataLoader()
    except Exception as e:
        print(f"✗ Error initializing data loader: {e}")
        return

    # Load descriptors
    print("\n2. Loading descriptors...")
    try:
        descriptors = loader.load_descriptors()
        print(f"✓ Successfully loaded descriptors: {descriptors.shape}")
    except Exception as e:
        print(f"✗ Error loading descriptors: {e}")
        return

    # Load targets
    print("\n3. Loading target labels...")
    try:
        targets = loader.load_targets_from_sdf()
        print(f"✓ Successfully loaded targets: {targets.shape}")
    except Exception as e:
        print(f"✗ Error loading targets: {e}")
        return

    # Get target statistics
    print("\n4. Analyzing target statistics...")
    try:
        target_stats = loader.get_target_statistics()
        print("\nTarget Statistics:")
        print(target_stats.to_string(index=False))
    except Exception as e:
        print(f"✗ Error getting target statistics: {e}")
        return

    # Remove low variance features
    print("\n5. Removing low variance features...")
    try:
        loader.remove_low_variance_features(threshold=0.01)
        print(f"✓ Features after variance thresholding: {loader.descriptors.shape[1]}")
    except Exception as e:
        print(f"✗ Error in variance thresholding: {e}")
        return

    # Handle missing values
    print("\n6. Handling missing values...")
    try:
        loader.handle_missing_values(strategy='drop')
        print(f"✓ Samples after handling missing values: {loader.descriptors.shape[0]}")
    except Exception as e:
        print(f"✗ Error handling missing values: {e}")
        return

    # Get data summary
    print("\n7. Data summary...")
    try:
        summary = loader.get_data_summary()
        print("\nData Summary:")
        for key, value in summary.items():
            # The per-target statistics were already printed in step 4.
            if key != 'target_statistics':
                print(f"  {key}: {value}")
    except Exception as e:
        print(f"✗ Error getting data summary: {e}")
        return

    # Test data preparation for first target
    print("\n8. Testing data preparation for first target...")
    try:
        data_dict = loader.prepare_data_for_target(
            target_idx=0,
            handle_imbalance=True,
            scale_features=True
        )

        print(f"\n✓ Successfully prepared data for {data_dict['target_name']}")
        print(f"  Training samples: {data_dict['X_train'].shape[0]}")
        print(f"  Validation samples: {data_dict['X_val'].shape[0]}")
        print(f"  Test samples: {data_dict['X_test'].shape[0]}")
        print(f"  Features: {data_dict['X_train'].shape[1]}")
        print(f"  Class weights: {data_dict['class_weights']}")

    except Exception as e:
        print(f"✗ Error preparing data for target: {e}")
        return

    print("\n" + "=" * 60)
    print("✓ ALL TESTS PASSED!")
    print("=" * 60)

    # Save prepared data for later use
    print("\n9. Saving prepared data...")
    output_path = 'results/prepared_data_target_0.npy'
    try:
        import numpy as np
        # np.save does not create parent directories, so make sure the
        # output directory exists before writing (fresh checkouts lack it).
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # NOTE: data_dict is a dict, so np.save stores it as a pickled
        # object array; reload it with
        # np.load(output_path, allow_pickle=True).item().
        np.save(output_path, data_dict)
        print(f"✓ Saved prepared data to {output_path}")
    except Exception as e:
        print(f"✗ Error saving data: {e}")

# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_data_preparation()