diff --git a/notebooks/XGBWW_Catalog_Random100_XGBoost_Accuracy.ipynb b/notebooks/XGBWW_Catalog_Random100_XGBoost_Accuracy.ipynb
index b79d451..f23a0e6 100644
--- a/notebooks/XGBWW_Catalog_Random100_XGBoost_Accuracy.ipynb
+++ b/notebooks/XGBWW_Catalog_Random100_XGBoost_Accuracy.ipynb
@@ -677,6 +677,14 @@
  "    max_dense_elements=int(2e8),\n",
  ")\n",
  "\n",
+ "OVERFIT_MODES = [\n",
+ "    \"deep_trees\",\n",
+ "    \"high_learning_rate\",\n",
+ "    \"no_regularization\",\n",
+ "    \"full_sampling\",\n",
+ "    \"long_training\",\n",
+ "]\n",
+ "\n",
  "\n",
  "class ModelTrainingFailed(RuntimeError):\n",
  "    \"\"\"Raised when XGBoost model fitting fails for a dataset.\"\"\"\n",
@@ -711,9 +719,34 @@
  "print(f\"XGBoost compute backend detected: {XGB_COMPUTE_BACKEND} | params={XGB_COMPUTE_PARAMS}\")\n",
  "\n",
  "\n",
- "def fit_and_score(row_data: dict):\n",
+ "def apply_overfit_mode(base_params: dict, overfit_mode: str) -> dict:\n",
+ "    params = dict(base_params)\n",
+ "\n",
+ "    if overfit_mode == \"deep_trees\":\n",
+ "        params.update({\"max_depth\": 14, \"min_child_weight\": 0.5})\n",
+ "    elif overfit_mode == \"high_learning_rate\":\n",
+ "        params.update({\"learning_rate\": 0.35})\n",
+ "    elif overfit_mode == \"no_regularization\":\n",
+ "        params.update({\"reg_lambda\": 0.0, \"reg_alpha\": 0.0, \"min_child_weight\": 0.0})\n",
+ "    elif overfit_mode == \"full_sampling\":\n",
+ "        params.update({\"subsample\": 1.0, \"colsample_bytree\": 1.0})\n",
+ "    elif overfit_mode == \"long_training\":\n",
+ "        params.update({\"learning_rate\": max(params.get(\"learning_rate\", 0.05), 0.08)})\n",
+ "    else:\n",
+ "        raise ValueError(f\"Unknown overfit_mode={overfit_mode!r}. Expected one of {OVERFIT_MODES}\")\n",
+ "\n",
+ "    return params\n",
+ "\n",
+ "\n",
+ "def fit_and_score(\n",
+ "    row_data: dict,\n",
+ "    case_type: str = \"good\",\n",
+ "    overfit_mode: str = \"none\",\n",
+ "    seed_offset: int = 0,\n",
+ "):\n",
  "    dataset_uid = row_data[\"dataset_uid\"]\n",
  "    source = row_data[\"source\"]\n",
+ "    run_seed = int(RANDOM_SEED + seed_offset)\n",
  "\n",
  "    X, y, meta = load_dataset(dataset_uid, filters=filters)\n",
  "\n",
@@ -725,7 +758,7 @@
  "\n",
  "    stratify = y_enc if n_classes > 1 else None\n",
  "    X_train, X_test, y_train, y_test = train_test_split(\n",
- "        X, y_enc, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=stratify\n",
+ "        X, y_enc, test_size=TEST_SIZE, random_state=run_seed, stratify=stratify\n",
  "    )\n",
  "\n",
  "    dtrain = xgb.DMatrix(X_train, label=y_train)\n",
@@ -743,21 +776,33 @@
  "                \"colsample_bytree\": 0.85,\n",
  "                \"min_child_weight\": 2.0,\n",
  "                \"reg_lambda\": 2.0,\n",
- "                \"seed\": RANDOM_SEED,\n",
+ "                \"seed\": run_seed,\n",
  "            }\n",
- "            cv = xgb.cv(\n",
- "                params=params,\n",
- "                dtrain=dtrain,\n",
- "                num_boost_round=1200,\n",
- "                nfold=5,\n",
- "                stratified=True,\n",
- "                early_stopping_rounds=50,\n",
- "                seed=RANDOM_SEED,\n",
- "                verbose_eval=False,\n",
- "            )\n",
- "            rounds = len(cv)\n",
- "            model = xgb.train(params=params, dtrain=dtrain, num_boost_round=rounds, verbose_eval=False)\n",
  "\n",
+ "            if case_type == \"overfit\":\n",
+ "                params = apply_overfit_mode(params, overfit_mode)\n",
+ "                round_map = {\n",
+ "                    \"deep_trees\": 1800,\n",
+ "                    \"high_learning_rate\": 900,\n",
+ "                    \"no_regularization\": 2200,\n",
+ "                    \"full_sampling\": 1800,\n",
+ "                    \"long_training\": 3200,\n",
+ "                }\n",
+ "                rounds = round_map[overfit_mode]\n",
+ "            else:\n",
+ "                cv = xgb.cv(\n",
+ "                    params=params,\n",
+ "                    dtrain=dtrain,\n",
+ "                    num_boost_round=1200,\n",
+ "                    nfold=5,\n",
+ "                    stratified=True,\n",
+ "                    early_stopping_rounds=50,\n",
+ "                    seed=run_seed,\n",
+ "                    verbose_eval=False,\n",
+ "                )\n",
+ "                rounds = len(cv)\n",
+ "\n",
+ "            model = xgb.train(params=params, dtrain=dtrain, num_boost_round=rounds, verbose_eval=False)\n",
  "            yhat_tr = (model.predict(dtrain) >= 0.5).astype(int)\n",
  "            yhat_te = (model.predict(dtest) >= 0.5).astype(int)\n",
  "        else:\n",
@@ -772,21 +817,33 @@
  "                \"colsample_bytree\": 0.9,\n",
  "                \"min_child_weight\": 1.0,\n",
  "                \"reg_lambda\": 1.0,\n",
- "                \"seed\": RANDOM_SEED,\n",
+ "                \"seed\": run_seed,\n",
  "            }\n",
- "            cv = xgb.cv(\n",
- "                params=params,\n",
- "                dtrain=dtrain,\n",
- "                num_boost_round=1200,\n",
- "                nfold=5,\n",
- "                stratified=True,\n",
- "                early_stopping_rounds=60,\n",
- "                seed=RANDOM_SEED,\n",
- "                verbose_eval=False,\n",
- "            )\n",
- "            rounds = len(cv)\n",
- "            model = xgb.train(params=params, dtrain=dtrain, num_boost_round=rounds, verbose_eval=False)\n",
  "\n",
+ "            if case_type == \"overfit\":\n",
+ "                params = apply_overfit_mode(params, overfit_mode)\n",
+ "                round_map = {\n",
+ "                    \"deep_trees\": 1900,\n",
+ "                    \"high_learning_rate\": 1000,\n",
+ "                    \"no_regularization\": 2400,\n",
+ "                    \"full_sampling\": 1900,\n",
+ "                    \"long_training\": 3400,\n",
+ "                }\n",
+ "                rounds = round_map[overfit_mode]\n",
+ "            else:\n",
+ "                cv = xgb.cv(\n",
+ "                    params=params,\n",
+ "                    dtrain=dtrain,\n",
+ "                    num_boost_round=1200,\n",
+ "                    nfold=5,\n",
+ "                    stratified=True,\n",
+ "                    early_stopping_rounds=60,\n",
+ "                    seed=run_seed,\n",
+ "                    verbose_eval=False,\n",
+ "                )\n",
+ "                rounds = len(cv)\n",
+ "\n",
+ "            model = xgb.train(params=params, dtrain=dtrain, num_boost_round=rounds, verbose_eval=False)\n",
  "            yhat_tr = np.argmax(model.predict(dtrain), axis=1)\n",
  "            yhat_te = np.argmax(model.predict(dtest), axis=1)\n",
  "    except Exception as e:\n",
@@ -799,7 +856,7 @@
  "        W=\"W7\",\n",
  "        nfolds=5,\n",
  "        t_points=160,\n",
- "        random_state=RANDOM_SEED,\n",
+ "        random_state=run_seed,\n",
  "        train_params=params,\n",
  "        num_boost_round=rounds,\n",
  "        multiclass=\"avg\" if n_classes > 2 else \"error\",\n",
@@ -827,9 +884,12 @@
  "        \"rounds\": int(rounds),\n",
  "        \"train_accuracy\": float(accuracy_score(y_train, yhat_tr)),\n",
  "        \"test_accuracy\": float(accuracy_score(y_test, yhat_te)),\n",
+ "        \"accuracy_gap\": float(accuracy_score(y_train, yhat_tr) - accuracy_score(y_test, yhat_te)),\n",
  "        \"alpha\": alpha,\n",
  "        \"ERG_gap\": erg_gap,\n",
  "        \"num_traps\": num_traps,\n",
+ "        \"case_type\": case_type,\n",
+ "        \"overfit_mode\": overfit_mode,\n",
  "        \"xgboost_params\": json.dumps(params, sort_keys=True),\n",
  "        \"status\": \"completed\",\n",
  "        \"error_message\": \"\",\n",
@@ -978,30 +1038,87 @@
  },
  "outputs": [],
  "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "OVERFIT_REPEATS_PER_MODEL = 5\n",
+ "PLOT_METRICS = [\"alpha\", \"ERG_gap\", \"num_traps\"]\n",
+ "\n",
  "if results_df.empty:\n",
- "    print(\"No successful trainings to plot.\")\n",
+ "    print(\"No completed models. Cannot build overfit histograms.\")\n",
  "else:\n",
- "    import matplotlib.pyplot as plt\n",
- "\n",
- "    plot_df = results_df.sort_values([\"source\", \"dataset_uid\"]).copy()\n",
- "\n",
- "    metrics = [\"alpha\", \"ERG_gap\", \"num_traps\"]\n",
- "    fig, axes = plt.subplots(1, len(metrics), figsize=(6 * len(metrics), 5), squeeze=False)\n",
- "\n",
- "    for ax, metric in zip(axes[0], metrics):\n",
- "        ax.scatter(plot_df[metric], plot_df[\"train_accuracy\"], label=\"Train accuracy\", alpha=0.8)\n",
- "        ax.scatter(plot_df[metric], plot_df[\"test_accuracy\"], label=\"Test accuracy\", alpha=0.8)\n",
- "        ax.set_xlabel(metric)\n",
- "        ax.set_ylabel(\"Accuracy\")\n",
- "        ax.set_ylim(0.4, 1.05)\n",
- "        if metric=='alpha':\n",
- "            ax.set_xlim(1.5,6)\n",
- "        ax.set_title(f\"Accuracy vs {metric}\")\n",
- "        ax.grid(alpha=0.2)\n",
- "\n",
- "    axes[0, 0].legend()\n",
- "    fig.tight_layout()\n",
- "    plt.show()\n"
+ "    good_df = results_df.copy()\n",
+ "    if \"case_type\" in good_df.columns:\n",
+ "        good_df = good_df[good_df[\"case_type\"].eq(\"good\")].copy()\n",
+ "\n",
+ "    if good_df.empty:\n",
+ "        print(\"No base models found for overfit generation.\")\n",
+ "    else:\n",
+ "        overfit_rows = []\n",
+ "        total_runs = len(good_df) * len(OVERFIT_MODES)\n",
+ "        run_idx = 0\n",
+ "        print(\n",
+ "            f\"Overfitting each base model {len(OVERFIT_MODES)} ways ({OVERFIT_MODES}), \"\n",
+ "            f\"i.e. {OVERFIT_REPEATS_PER_MODEL} overfit runs per model.\"\n",
+ "        )\n",
+ "\n",
+ "        for _, row in good_df.iterrows():\n",
+ "            row_data = row.to_dict()\n",
+ "            uid = row_data[\"dataset_uid\"]\n",
+ "            for mode_idx, mode in enumerate(OVERFIT_MODES):\n",
+ "                run_idx += 1\n",
+ "                print(f\"[overfit {run_idx}/{total_runs}] dataset={uid} mode={mode}\")\n",
+ "                try:\n",
+ "                    overfit_rows.append(\n",
+ "                        fit_and_score(\n",
+ "                            row_data,\n",
+ "                            case_type=\"overfit\",\n",
+ "                            overfit_mode=mode,\n",
+ "                            seed_offset=10000 + mode_idx,\n",
+ "                        )\n",
+ "                    )\n",
+ "                except Exception as e:\n",
+ "                    print(f\"    SKIP dataset={uid} mode={mode}: {e}\")\n",
+ "\n",
+ "        overfit_df = pd.DataFrame(overfit_rows)\n",
+ "\n",
+ "        if overfit_df.empty:\n",
+ "            print(\"Overfit generation produced no rows.\")\n",
+ "        else:\n",
+ "            mode_order = OVERFIT_MODES[:]\n",
+ "            fig, axes = plt.subplots(\n",
+ "                len(PLOT_METRICS),\n",
+ "                len(mode_order),\n",
+ "                figsize=(4.5 * len(mode_order), 3.8 * len(PLOT_METRICS)),\n",
+ "                squeeze=False,\n",
+ "            )\n",
+ "\n",
+ "            print(\n",
+ "                f\"Generated {len(overfit_df)} overfit runs from {len(good_df)} models. \"\n",
+ "                f\"Plotting {len(PLOT_METRICS)}x{len(mode_order)}={len(PLOT_METRICS) * len(mode_order)} histograms.\"\n",
+ "            )\n",
+ "\n",
+ "            for col_idx, mode in enumerate(mode_order):\n",
+ "                mode_df = overfit_df[overfit_df[\"overfit_mode\"] == mode]\n",
+ "                for row_idx, metric in enumerate(PLOT_METRICS):\n",
+ "                    ax = axes[row_idx, col_idx]\n",
+ "                    values = mode_df[metric].dropna()\n",
+ "\n",
+ "                    if values.empty:\n",
+ "                        ax.text(0.5, 0.5, \"No data\", ha=\"center\", va=\"center\")\n",
+ "                    else:\n",
+ "                        bins = min(20, max(5, int(np.sqrt(len(values)))))\n",
+ "                        ax.hist(values, bins=bins, alpha=0.85, edgecolor=\"black\")\n",
+ "\n",
+ "                    if row_idx == 0:\n",
+ "                        ax.set_title(f\"Mode: {mode}\")\n",
+ "                    if col_idx == 0:\n",
+ "                        ax.set_ylabel(\"Count\")\n",
+ "                    ax.set_xlabel(metric)\n",
+ "                    ax.grid(alpha=0.2)\n",
+ "\n",
+ "            plt.tight_layout()\n",
+ "            plt.show()\n",
+ "\n"
  ],
  "id": "wVlB0ke9Rp1N"
  },