# Source: BDM_P2/src/DistributedML/ml_predict.py (master branch, anbipa/BDM_P2 on GitHub)
from hdfs import InsecureClient
from pyspark.ml.regression import RandomForestRegressionModel
from pyspark.sql import SparkSession
from DistributedML.ml_trainer import preprocess_data, load_data, evaluate_model


def choose_model(path):
    """Let the user pick one of the models stored under *path* and load it.

    The entries of the HDFS directory *path* are listed, printed as a
    numbered menu on stdout, and the user is prompted on stdin until a valid
    number is entered. The selected entry is then loaded as a
    ``RandomForestRegressionModel``.

    Args:
        path: HDFS directory containing the saved models. It is appended to
            ``hdfs://10.4.41.44:27000/user/bdm/`` when building the model
            URI, so it is expected to be relative to that directory.

    Returns:
        The loaded ``RandomForestRegressionModel``.

    Raises:
        FileNotFoundError: If the directory listing is empty, i.e. there is
            no model to choose from.

    NOTE(review): loading the model presumably requires an active Spark
    session; confirm callers create one before calling this function.
    """
    # Connect to the HDFS cluster
    client = InsecureClient(url='http://10.4.41.44:9870', user='bdm')

    # List the models
    model_files = client.list(path)

    # With no entries, no number can ever satisfy the range check below and
    # the prompt would repeat forever, so fail fast instead.
    if not model_files:
        raise FileNotFoundError(f"No models found in HDFS directory '{path}'.")

    # Extract the model names from the file paths (used for display only)
    model_names = [file_path.split('/')[-1] for file_path in model_files]

    # Display the model names and let the user choose
    print("Available models:")
    for number, model_name in enumerate(model_names, start=1):
        print(f"{number}. {model_name}")

    # Prompt until the user enters a number within the menu range
    while True:
        try:
            choice = int(input("Enter the number corresponding to the model you want to choose: "))
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue
        if 1 <= choice <= len(model_names):
            break
        print("Invalid choice. Please enter a valid number.")

    # Build the full HDFS URI of the chosen model (menu numbers are 1-based)
    model_file_path = f"hdfs://10.4.41.44:27000/user/bdm/{path}/{model_files[choice - 1]}"

    # Load and return the chosen model
    return RandomForestRegressionModel.load(model_file_path)


def deploy_and_predict():
    """Evaluate a user-selected stored model on the rent dataset.

    Steps, in order:
      1. Get or create a Spark session named ``"ModelPrediction"``.
      2. Ask the user to pick a model from the ``"models"`` directory via
         ``choose_model``.
      3. Load the rent dataset CSV from HDFS with ``load_data``.
      4. Run ``preprocess_data`` on it and keep the second element of the
         returned pair (the test split).
      5. Pass the model and the test split to ``evaluate_model``.

    Returns:
        None. Whatever ``evaluate_model`` returns is discarded.
    """
    # The session is created first, before the model and the data are loaded.
    session = SparkSession.builder.appName("ModelPrediction").getOrCreate()

    # Interactive selection among the models saved under "models"
    chosen_model = choose_model("models")

    # Read the dataset and split it; only the test part is used below
    rent_data = load_data("hdfs://10.4.41.44:27000/user/bdm/dataset/rentdataset.csv")
    _train_split, test_split = preprocess_data(rent_data)

    # Score the chosen model on the held-out split
    evaluate_model(chosen_model, test_split)