|
9 | 9 | },
|
10 | 10 | {
|
11 | 11 | "cell_type": "code",
|
12 |
| - "execution_count": 15, |
| 12 | + "execution_count": null, |
13 | 13 | "metadata": {},
|
14 | 14 | "outputs": [],
|
15 | 15 | "source": [
|
|
61 | 61 | "metadata": {},
|
62 | 62 | "outputs": [],
|
63 | 63 | "source": [
|
64 |
| - "sz = \"350M\"\n", |
| 64 | + "import numpy as np\n", |
| 65 | + "\n", |
| 66 | + "sz = \"124M\"\n", |
65 | 67 | "loss_baseline = {\n",
|
66 | 68 | " \"124M\": 3.424958,\n",
|
67 | 69 | " \"350M\": 3.083089,\n",
|
68 | 70 | " \"774M\": 3.000580,\n",
|
69 | 71 | " \"1558M\": 2.831273,\n",
|
70 | 72 | "}[sz]\n",
|
71 |
| - "hella_baseline = {\n", |
| 73 | + "hella2_baseline = { # for GPT-2\n", |
72 | 74 | " \"124M\": 0.294463,\n",
|
73 | 75 | " \"350M\": 0.375224,\n",
|
74 | 76 | " \"774M\": 0.431986,\n",
|
75 | 77 | " \"1558M\": 0.488946,\n",
|
76 | 78 | "}[sz]\n",
|
77 |
| - "\n", |
| 79 | + "hella3_baseline = { # for GPT-3\n", |
| 80 | + " \"124M\": 0.337,\n", |
| 81 | + " \"350M\": 0.436,\n", |
| 82 | + " \"774M\": 0.510,\n", |
| 83 | + " \"1558M\": 0.547,\n", |
| 84 | + "}[sz]\n", |
78 | 85 | "# assumes each model run is stored in this way\n",
|
79 |
| - "logfile = f\"../log{sz}/main.log\"\n", |
| 86 | + "logfile = f\"../log_gpt2_{sz}/main.log\"\n", |
80 | 87 | "streams = parse_logfile(logfile)\n",
|
81 | 88 | "\n",
|
| 89 | + "# optional function that smooths out the loss some\n", |
| 90 | + "def smooth_moving_average(signal, window_size):\n", |
| 91 | + " if signal.ndim != 1:\n", |
| 92 | + " raise ValueError(\"smooth_moving_average only accepts 1D arrays.\")\n", |
| 93 | + " if signal.size < window_size:\n", |
| 94 | + " raise ValueError(\"Input vector needs to be bigger than window size.\")\n", |
| 95 | + " if window_size < 3:\n", |
| 96 | + " return signal\n", |
| 97 | + "\n", |
| 98 | + " s = np.pad(signal, (window_size//2, window_size-1-window_size//2), mode='edge')\n", |
| 99 | + " w = np.ones(window_size) / window_size\n", |
| 100 | + " smoothed_signal = np.convolve(s, w, mode='valid')\n", |
| 101 | + " return smoothed_signal\n", |
| 102 | + "\n", |
82 | 103 | "plt.figure(figsize=(16, 6))\n",
|
83 | 104 | "\n",
|
84 | 105 | "# Panel 1: losses: both train and val\n",
|
85 | 106 | "plt.subplot(121)\n",
|
86 | 107 | "xs, ys = streams[\"trl\"] # training loss\n",
|
| 108 | + "ys = np.array(ys)\n", |
| 109 | + "# smooth out ys using a rolling window\n", |
| 110 | + "# ys = smooth_moving_average(ys, 21) # optional\n", |
87 | 111 | "plt.plot(xs, ys, label=f'llm.c ({sz}) train loss')\n",
|
88 | 112 | "print(\"Min Train Loss:\", min(ys))\n",
|
89 | 113 | "xs, ys = streams[\"tel\"] # validation loss\n",
|
90 | 114 | "plt.plot(xs, ys, label=f'llm.c ({sz}) val loss')\n",
|
91 | 115 | "# horizontal line at GPT-2 baseline\n",
|
| 116 | + "# we don't have GPT-3 loss on this dataset because the weights were never released\n", |
92 | 117 | "if loss_baseline is not None:\n",
|
93 | 118 | " plt.axhline(y=loss_baseline, color='r', linestyle='--', label=f\"OpenAI GPT-2 ({sz}) checkpoint val loss\")\n",
|
94 | 119 | "plt.xlabel(\"steps\")\n",
|
95 | 120 | "plt.ylabel(\"loss\")\n",
|
96 | 121 | "plt.yscale('log')\n",
|
| 122 | + "plt.ylim(top=4.0)\n", |
97 | 123 | "plt.legend()\n",
|
98 | 124 | "plt.title(\"Loss\")\n",
|
99 | 125 | "print(\"Min Validation Loss:\", min(ys))\n",
|
100 | 126 | "\n",
|
101 | 127 | "# Panel 2: HellaSwag eval\n",
|
102 | 128 | "plt.subplot(122)\n",
|
103 |
| - "xs, ys = streams[\"eval\"] # HellaSwag eval\n", |
104 |
| - "plt.plot(xs, ys, label=f\"llm.c ({sz})\")\n", |
105 |
| - "# horizontal line at GPT-2 baseline\n", |
106 |
| - "if hella_baseline:\n", |
107 |
| - " plt.axhline(y=hella_baseline, color='r', linestyle='--', label=f\"OpenAI GPT-2 ({sz}) checkpoint\")\n", |
108 |
| - "plt.xlabel(\"steps\")\n", |
109 |
| - "plt.ylabel(\"accuracy\")\n", |
110 |
| - "plt.legend()\n", |
111 |
| - "plt.title(\"HellaSwag eval\")\n", |
112 |
| - "print(\"Max Hellaswag eval:\", max(ys))" |
| 129 | + "if \"eval\" in streams:\n", |
| 130 | + " xs, ys = streams[\"eval\"] # HellaSwag eval\n", |
| 131 | + " ys = np.array(ys)\n", |
| 132 | + " plt.plot(xs, ys, label=f\"llm.c ({sz})\")\n", |
| 133 | + " # horizontal line at GPT-2/3 baselines\n", |
| 134 | + " if hella2_baseline:\n", |
| 135 | + " plt.axhline(y=hella2_baseline, color='r', linestyle='--', label=f\"OpenAI GPT-2 ({sz}) checkpoint\")\n", |
| 136 | + " if hella3_baseline:\n", |
| 137 | + " plt.axhline(y=hella3_baseline, color='g', linestyle='--', label=f\"OpenAI GPT-3 ({sz}) checkpoint\")\n", |
| 138 | + " plt.xlabel(\"steps\")\n", |
| 139 | + " plt.ylabel(\"accuracy\")\n", |
| 140 | + " plt.legend()\n", |
| 141 | + " plt.title(\"HellaSwag eval\")\n", |
| 142 | + " print(\"Max Hellaswag eval:\", max(ys))\n" |
113 | 143 | ]
|
114 |
| - }, |
115 |
| - { |
116 |
| - "cell_type": "code", |
117 |
| - "execution_count": null, |
118 |
| - "metadata": {}, |
119 |
| - "outputs": [], |
120 |
| - "source": [] |
121 | 144 | }
|
122 | 145 | ],
|
123 | 146 | "metadata": {
|
|
0 commit comments