Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 168 additions & 51 deletions notebooks/XGBWW_Catalog_Random100_XGBoost_Accuracy.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,14 @@
" max_dense_elements=int(2e8),\n",
")\n",
"\n",
# Names of the deliberate-overfitting configurations applied to each trained
# model; consumed by apply_overfit_mode() and used to drive the overfit runs.
OVERFIT_MODES = [
    "deep_trees",
    "high_learning_rate",
    "no_regularization",
    "full_sampling",
    "long_training",
]
"\n",
"\n",
"class ModelTrainingFailed(RuntimeError):\n",
" \"\"\"Raised when XGBoost model fitting fails for a dataset.\"\"\"\n",
Expand Down Expand Up @@ -711,9 +719,34 @@
"print(f\"XGBoost compute backend detected: {XGB_COMPUTE_BACKEND} | params={XGB_COMPUTE_PARAMS}\")\n",
"\n",
"\n",
"def fit_and_score(row_data: dict):\n",
def apply_overfit_mode(base_params: dict, overfit_mode: str) -> dict:
    """Return a copy of ``base_params`` adjusted to deliberately overfit.

    Parameters
    ----------
    base_params : dict
        Baseline XGBoost training parameters. Never mutated; a copy is
        returned.
    overfit_mode : str
        One of ``"deep_trees"``, ``"high_learning_rate"``,
        ``"no_regularization"``, ``"full_sampling"``, or
        ``"long_training"``.

    Returns
    -------
    dict
        A new parameter dict with the mode-specific overrides applied.

    Raises
    ------
    ValueError
        If ``overfit_mode`` is not one of the recognized modes.
    """
    # Copy so the caller's baseline parameter dict is left untouched.
    params = dict(base_params)

    # Static per-mode overrides. "long_training" is handled separately below
    # because its override depends on the incoming learning rate.
    static_overrides = {
        "deep_trees": {"max_depth": 14, "min_child_weight": 0.5},
        "high_learning_rate": {"learning_rate": 0.35},
        "no_regularization": {"reg_lambda": 0.0, "reg_alpha": 0.0, "min_child_weight": 0.0},
        "full_sampling": {"subsample": 1.0, "colsample_bytree": 1.0},
    }

    if overfit_mode == "long_training":
        # Floor the learning rate at 0.08 (default 0.05 when absent) so the
        # extended boosting schedule still pushes the model into overfitting.
        params["learning_rate"] = max(params.get("learning_rate", 0.05), 0.08)
    elif overfit_mode in static_overrides:
        params.update(static_overrides[overfit_mode])
    else:
        raise ValueError(f"Unknown overfit_mode={overfit_mode!r}. Expected one of {OVERFIT_MODES}")

    return params
"\n",
"\n",
"def fit_and_score(\n",
" row_data: dict,\n",
" case_type: str = \"good\",\n",
" overfit_mode: str = \"none\",\n",
" seed_offset: int = 0,\n",
"):\n",
" dataset_uid = row_data[\"dataset_uid\"]\n",
" source = row_data[\"source\"]\n",
" run_seed = int(RANDOM_SEED + seed_offset)\n",
"\n",
" X, y, meta = load_dataset(dataset_uid, filters=filters)\n",
"\n",
Expand All @@ -725,7 +758,7 @@
"\n",
" stratify = y_enc if n_classes > 1 else None\n",
" X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y_enc, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=stratify\n",
" X, y_enc, test_size=TEST_SIZE, random_state=run_seed, stratify=stratify\n",
" )\n",
"\n",
" dtrain = xgb.DMatrix(X_train, label=y_train)\n",
Expand All @@ -743,21 +776,33 @@
" \"colsample_bytree\": 0.85,\n",
" \"min_child_weight\": 2.0,\n",
" \"reg_lambda\": 2.0,\n",
" \"seed\": RANDOM_SEED,\n",
" \"seed\": run_seed,\n",
" }\n",
" cv = xgb.cv(\n",
" params=params,\n",
" dtrain=dtrain,\n",
" num_boost_round=1200,\n",
" nfold=5,\n",
" stratified=True,\n",
" early_stopping_rounds=50,\n",
" seed=RANDOM_SEED,\n",
" verbose_eval=False,\n",
" )\n",
" rounds = len(cv)\n",
" model = xgb.train(params=params, dtrain=dtrain, num_boost_round=rounds, verbose_eval=False)\n",
"\n",
" if case_type == \"overfit\":\n",
" params = apply_overfit_mode(params, overfit_mode)\n",
" round_map = {\n",
" \"deep_trees\": 1800,\n",
" \"high_learning_rate\": 900,\n",
" \"no_regularization\": 2200,\n",
" \"full_sampling\": 1800,\n",
" \"long_training\": 3200,\n",
" }\n",
" rounds = round_map[overfit_mode]\n",
" else:\n",
" cv = xgb.cv(\n",
" params=params,\n",
" dtrain=dtrain,\n",
" num_boost_round=1200,\n",
" nfold=5,\n",
" stratified=True,\n",
" early_stopping_rounds=50,\n",
" seed=run_seed,\n",
" verbose_eval=False,\n",
" )\n",
" rounds = len(cv)\n",
"\n",
" model = xgb.train(params=params, dtrain=dtrain, num_boost_round=rounds, verbose_eval=False)\n",
" yhat_tr = (model.predict(dtrain) >= 0.5).astype(int)\n",
" yhat_te = (model.predict(dtest) >= 0.5).astype(int)\n",
" else:\n",
Expand All @@ -772,21 +817,33 @@
" \"colsample_bytree\": 0.9,\n",
" \"min_child_weight\": 1.0,\n",
" \"reg_lambda\": 1.0,\n",
" \"seed\": RANDOM_SEED,\n",
" \"seed\": run_seed,\n",
" }\n",
" cv = xgb.cv(\n",
" params=params,\n",
" dtrain=dtrain,\n",
" num_boost_round=1200,\n",
" nfold=5,\n",
" stratified=True,\n",
" early_stopping_rounds=60,\n",
" seed=RANDOM_SEED,\n",
" verbose_eval=False,\n",
" )\n",
" rounds = len(cv)\n",
" model = xgb.train(params=params, dtrain=dtrain, num_boost_round=rounds, verbose_eval=False)\n",
"\n",
" if case_type == \"overfit\":\n",
" params = apply_overfit_mode(params, overfit_mode)\n",
" round_map = {\n",
" \"deep_trees\": 1900,\n",
" \"high_learning_rate\": 1000,\n",
" \"no_regularization\": 2400,\n",
" \"full_sampling\": 1900,\n",
" \"long_training\": 3400,\n",
" }\n",
" rounds = round_map[overfit_mode]\n",
" else:\n",
" cv = xgb.cv(\n",
" params=params,\n",
" dtrain=dtrain,\n",
" num_boost_round=1200,\n",
" nfold=5,\n",
" stratified=True,\n",
" early_stopping_rounds=60,\n",
" seed=run_seed,\n",
" verbose_eval=False,\n",
" )\n",
" rounds = len(cv)\n",
"\n",
" model = xgb.train(params=params, dtrain=dtrain, num_boost_round=rounds, verbose_eval=False)\n",
" yhat_tr = np.argmax(model.predict(dtrain), axis=1)\n",
" yhat_te = np.argmax(model.predict(dtest), axis=1)\n",
" except Exception as e:\n",
Expand All @@ -799,7 +856,7 @@
" W=\"W7\",\n",
" nfolds=5,\n",
" t_points=160,\n",
" random_state=RANDOM_SEED,\n",
" random_state=run_seed,\n",
" train_params=params,\n",
" num_boost_round=rounds,\n",
" multiclass=\"avg\" if n_classes > 2 else \"error\",\n",
Expand Down Expand Up @@ -827,9 +884,12 @@
" \"rounds\": int(rounds),\n",
" \"train_accuracy\": float(accuracy_score(y_train, yhat_tr)),\n",
" \"test_accuracy\": float(accuracy_score(y_test, yhat_te)),\n",
" \"accuracy_gap\": float(accuracy_score(y_train, yhat_tr) - accuracy_score(y_test, yhat_te)),\n",
" \"alpha\": alpha,\n",
" \"ERG_gap\": erg_gap,\n",
" \"num_traps\": num_traps,\n",
" \"case_type\": case_type,\n",
" \"overfit_mode\": overfit_mode,\n",
" \"xgboost_params\": json.dumps(params, sort_keys=True),\n",
" \"status\": \"completed\",\n",
" \"error_message\": \"\",\n",
Expand Down Expand Up @@ -978,30 +1038,87 @@
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"OVERFIT_REPEATS_PER_MODEL = 5\n",
"PLOT_METRICS = [\"alpha\", \"ERG_gap\", \"num_traps\"]\n",
"\n",
"if results_df.empty:\n",
" print(\"No successful trainings to plot.\")\n",
" print(\"No completed models. Cannot build overfit histograms.\")\n",
"else:\n",
" import matplotlib.pyplot as plt\n",
"\n",
" plot_df = results_df.sort_values([\"source\", \"dataset_uid\"]).copy()\n",
"\n",
" metrics = [\"alpha\", \"ERG_gap\", \"num_traps\"]\n",
" fig, axes = plt.subplots(1, len(metrics), figsize=(6 * len(metrics), 5), squeeze=False)\n",
"\n",
" for ax, metric in zip(axes[0], metrics):\n",
" ax.scatter(plot_df[metric], plot_df[\"train_accuracy\"], label=\"Train accuracy\", alpha=0.8)\n",
" ax.scatter(plot_df[metric], plot_df[\"test_accuracy\"], label=\"Test accuracy\", alpha=0.8)\n",
" ax.set_xlabel(metric)\n",
" ax.set_ylabel(\"Accuracy\")\n",
" ax.set_ylim(0.4, 1.05)\n",
" if metric=='alpha':\n",
" ax.set_xlim(1.5,6)\n",
" ax.set_title(f\"Accuracy vs {metric}\")\n",
" ax.grid(alpha=0.2)\n",
"\n",
" axes[0, 0].legend()\n",
" fig.tight_layout()\n",
" plt.show()\n"
" good_df = results_df.copy()\n",
" if \"case_type\" in good_df.columns:\n",
" good_df = good_df[good_df[\"case_type\"].eq(\"good\")].copy()\n",
"\n",
" if good_df.empty:\n",
" print(\"No base models found for overfit generation.\")\n",
" else:\n",
" overfit_rows = []\n",
" total_runs = len(good_df) * len(OVERFIT_MODES)\n",
" run_idx = 0\n",
" print(\n",
" f\"Overfitting each model {len(OVERFIT_MODES)} ways ({OVERFIT_MODES}) \"\n",
" f\"with {OVERFIT_REPEATS_PER_MODEL} total modes per model.\"\n",
" )\n",
"\n",
" for _, row in good_df.iterrows():\n",
" row_data = row.to_dict()\n",
" uid = row_data[\"dataset_uid\"]\n",
" for mode_idx, mode in enumerate(OVERFIT_MODES):\n",
" run_idx += 1\n",
" print(f\"[overfit {run_idx}/{total_runs}] dataset={uid} mode={mode}\")\n",
" try:\n",
" overfit_rows.append(\n",
" fit_and_score(\n",
" row_data,\n",
" case_type=\"overfit\",\n",
" overfit_mode=mode,\n",
" seed_offset=10000 + mode_idx,\n",
" )\n",
" )\n",
" except Exception as e:\n",
" print(f\" SKIP dataset={uid} mode={mode}: {e}\")\n",
"\n",
" overfit_df = pd.DataFrame(overfit_rows)\n",
"\n",
" if overfit_df.empty:\n",
" print(\"Overfit generation produced no rows.\")\n",
" else:\n",
" mode_order = OVERFIT_MODES[:]\n",
" fig, axes = plt.subplots(\n",
" len(PLOT_METRICS),\n",
" len(mode_order),\n",
" figsize=(4.5 * len(mode_order), 3.8 * len(PLOT_METRICS)),\n",
" squeeze=False,\n",
" )\n",
"\n",
" print(\n",
" f\"Generated {len(overfit_df)} overfit runs from {len(good_df)} models. \"\n",
" f\"Plotting {len(PLOT_METRICS)}x{len(mode_order)}={len(PLOT_METRICS) * len(mode_order)} histograms.\"\n",
" )\n",
"\n",
" for col_idx, mode in enumerate(mode_order):\n",
" mode_df = overfit_df[overfit_df[\"overfit_mode\"] == mode]\n",
" for row_idx, metric in enumerate(PLOT_METRICS):\n",
" ax = axes[row_idx, col_idx]\n",
" values = mode_df[metric].dropna()\n",
"\n",
" if values.empty:\n",
" ax.text(0.5, 0.5, \"No data\", ha=\"center\", va=\"center\")\n",
" else:\n",
" bins = min(20, max(5, int(np.sqrt(len(values)))))\n",
" ax.hist(values, bins=bins, alpha=0.85, edgecolor=\"black\")\n",
"\n",
" if row_idx == 0:\n",
" ax.set_title(f\"Mode: {mode}\")\n",
" if col_idx == 0:\n",
" ax.set_ylabel(metric)\n",
" ax.set_xlabel(metric)\n",
" ax.grid(alpha=0.2)\n",
"\n",
" plt.tight_layout()\n",
" plt.show()\n",
"\n"
],
"id": "wVlB0ke9Rp1N"
},
Expand Down