diff --git a/notebooks/34_analytical_approx.ipynb b/notebooks/34_analytical_approx.ipynb index a114a888..a305c4eb 100644 --- a/notebooks/34_analytical_approx.ipynb +++ b/notebooks/34_analytical_approx.ipynb @@ -2049,223 +2049,10 @@ "### Bootstrap estimate of the standard error\n", "\n", "Another way to obtain the standard error of the mean\n", - "(the standard deviation sampling distribution of the mean)\n", - "is to use the bootstrap approach.\n", + "(the standard deviation of the sampling distribution of the mean)\n", + "is to use the bootstrap estimate.\n", "\n", - "\n", - "TODO: import formulas" - ] - }, - { - "cell_type": "markdown", - "id": "b85af6aa-078a-4a02-9f0c-a71871d0985a", - "metadata": {}, - "source": [ - "#### Example 1BT: test for the mean of Batch 04" - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "id": "65005726-e0a2-421c-aada-27842968d6eb", - "metadata": {}, - "outputs": [], - "source": [ - "kombucha = pd.read_csv(\"../datasets/kombucha.csv\")\n", - "ksample04 = kombucha[kombucha[\"batch\"]==4][\"volume\"]\n", - "n04 = len(ksample04)" - ] - }, - { - "cell_type": "code", - "execution_count": 133, - "id": "ed62b815-5faf-458e-9849-85cb09a76892", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.2252842390160474" - ] - }, - "execution_count": 133, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# bootstrap estimate for standard error of the mean\n", - "from stats_helpers import gen_boot_dist\n", - "kbars_boot04 = gen_boot_dist(ksample04, estfunc=mean)\n", - "sehat_boot04 = std(kbars_boot04)\n", - "sehat_boot04" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "id": "3769ca83-069b-407f-8712-4f3b44fc8fbe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3.128661805915673" - ] - }, - "execution_count": 134, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# compute the t statistic using bootstrap se\n", - "obst04bt = 
(obsmean04 - muK0) / sehat_boot04\n", - "obst04bt" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "id": "5316bae0-aae8-4b97-8602-68fe70e7a248", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.012147863549560177" - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from scipy.stats import t as tdist\n", - "rvT04 = tdist(n04 - 1)\n", - "pvalue04bt = tailprobs(rvT, obst04bt, alt=\"two-sided\")\n", - "pvalue04bt" - ] - }, - { - "cell_type": "markdown", - "id": "78df6c04-514c-4515-9ef8-5700de0861e9", - "metadata": {}, - "source": [ - "The $p$-value is very small,\n", - "so our decision is to reject $H_0$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "275cff66-0f71-46ed-ac3f-4448936397f0", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "660fa68f-0140-4d7d-91f9-47466f4c881c", - "metadata": {}, - "source": [ - "#### Example 2BT: test for the mean of Batch 01" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "id": "90aa8990-0fa7-4e80-9206-d62c34a3990e", - "metadata": {}, - "outputs": [], - "source": [ - "kombucha = pd.read_csv(\"../datasets/kombucha.csv\")\n", - "ksample01 = kombucha[kombucha[\"batch\"]==1][\"volume\"]\n", - "n01 = len(ksample01)" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "id": "178af06b-310a-43f1-94c1-daa0b9c5ed1f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.5501517384893604" - ] - }, - "execution_count": 137, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# bootstrap estimate for standard error of the mean\n", - "from stats_helpers import gen_boot_dist\n", - "kbars_boot01 = gen_boot_dist(ksample01, estfunc=mean)\n", - "sehat_boot01 = std(kbars_boot01)\n", - "sehat_boot01" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "id": "c02d1766-5a31-4963-9603-62d545742e1e", - 
"metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "-0.5781692061148894" - ] - }, - "execution_count": 138, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# compute the t statistic using bootstrap se\n", - "obst01bt = (obsmean01 - muK0) / sehat_boot01\n", - "obst01bt" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "id": "0a858812-cd15-4b32-adfb-a3d7f4f1ed6e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.5664736858023267" - ] - }, - "execution_count": 139, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from scipy.stats import t as tdist\n", - "\n", - "rvT01 = tdist(n01-1)\n", - "#######################################################\n", - "pvalue01bt = tailprobs(rvT01, obst01bt, alt=\"two-sided\")\n", - "pvalue01bt" - ] - }, - { - "cell_type": "markdown", - "id": "50db8103-3aee-430e-bf2a-c1ca6ef1262d", - "metadata": {}, - "source": [ - "The $p$-value is very large,\n", - "so we have no reason to reject $H_0$." 
+ "See problems...\n" ] }, { diff --git a/notebooks/35_two_sample_tests.ipynb b/notebooks/35_two_sample_tests.ipynb index 914a3e25..81e87cde 100644 --- a/notebooks/35_two_sample_tests.ipynb +++ b/notebooks/35_two_sample_tests.ipynb @@ -2414,27 +2414,15 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 55, "id": "1e0fc5ae-ae1a-4fdc-98af-5f25d8563e88", "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "ttest_dmeans() got an unexpected keyword argument 'equal_var'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[54], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m scoresD \u001b[38;5;241m=\u001b[39m students[students[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcurriculum\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdebate\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscore\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 3\u001b[0m scoresL \u001b[38;5;241m=\u001b[39m students[students[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcurriculum\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlecture\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscore\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m----> 4\u001b[0m \u001b[43mttest_dmeans\u001b[49m\u001b[43m(\u001b[49m\u001b[43mscoresD\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscoresL\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mequal_var\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mTypeError\u001b[0m: ttest_dmeans() got an unexpected keyword argument 'equal_var'" - ] - } - ], + 
"outputs": [], "source": [ "students = pd.read_csv(\"../datasets/students.csv\")\n", "scoresD = students[students[\"curriculum\"]==\"debate\"][\"score\"]\n", "scoresL = students[students[\"curriculum\"]==\"lecture\"][\"score\"]\n", - "ttest_dmeans(scoresD, scoresL, equal_var=True)" + "# ttest_dmeans(scoresD, scoresL, equal_var=True)" ] }, { diff --git a/problems/chapter3_problems.ipynb b/problems/chapter3_problems.ipynb new file mode 100644 index 00000000..79fa69e1 --- /dev/null +++ b/problems/chapter3_problems.ipynb @@ -0,0 +1,386 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e0b97d86-6fc4-4476-b96d-99d6724e56ee", + "metadata": {}, + "source": [ + "# Chapter 3 Problems" + ] + }, + { + "cell_type": "markdown", + "id": "22116b0f-4475-4c69-a8fd-b51cee926deb", + "metadata": { + "tags": [] + }, + "source": [ + "#### Notebook setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "354bc124-8e55-4ff7-9827-bd7012810e18", + "metadata": {}, + "outputs": [], + "source": [ + "# load Python modules\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1d924fc8-4dfe-428f-b6aa-9148741c185a", + "metadata": {}, + "outputs": [], + "source": [ + "# Useful colors\n", + "snspal = sns.color_palette()\n", + "blue, orange, purple = snspal[0], snspal[1], snspal[4]\n", + "# red = sns.color_palette(\"tab10\")[3]\n", + "\n", + "# High-resolution please\n", + "%config InlineBackend.figure_format = 'retina'\n", + "\n", + "# Where to store figures\n", + "DESTDIR = \"figures/stats/intro_to_NHST\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8dfa6a70-c4a6-4cfe-9f2d-87b4d9d3def6", + "metadata": {}, + "outputs": [], + "source": [ + "# set random seed for repeatability\n", + "np.random.seed(42)" + ] + }, + { + "cell_type": "markdown", + "id": "13ee88bb-f7e6-47f0-967c-65316539c3a9", + 
"metadata": {}, + "source": [ + "$\\def\\stderr#1{\\mathbf{se}_{#1}}$\n", + "$\\def\\stderrhat#1{\\hat{\\mathbf{se}}_{#1}}$\n", + "$\\newcommand{\\Mean}{\\textbf{Mean}}$\n", + "$\\newcommand{\\Var}{\\textbf{Var}}$\n", + "$\\newcommand{\\Std}{\\textbf{Std}}$\n", + "$\\newcommand{\\Freq}{\\textbf{Freq}}$\n", + "$\\newcommand{\\RelFreq}{\\textbf{RelFreq}}$\n", + "$\\newcommand{\\DMeans}{\\textbf{DMeans}}$\n", + "$\\newcommand{\\Prop}{\\textbf{Prop}}$\n", + "$\\newcommand{\\DProps}{\\textbf{DProps}}$\n", + "\n", + "$$\n", + "\\newcommand{\\CI}[1]{\\textbf{CI}_{#1}}\n", + "\\newcommand{\\CIL}[1]{\\textbf{L}_{#1}}\n", + "\\newcommand{\\CIU}[1]{\\textbf{U}_{#1}}\n", + "\\newcommand{\\ci}[1]{\\textbf{ci}_{#1}}\n", + "\\newcommand{\\cil}[1]{\\textbf{l}_{#1}}\n", + "\\newcommand{\\ciu}[1]{\\textbf{u}_{#1}}\n", + "$$\n", + "\n", + "\n", + "(this cell contains the macro definitions like $\\stderr{\\overline{\\mathbf{x}}}$, $\\stderrhat{}$, $\\Mean$, ...)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8e72c5a-819e-47ed-a01a-c507eccd6eec", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81c8b0a0-7c1b-46f0-8867-42511be5609b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0a7f8ab-d03a-426d-a860-6bcda2b41258", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "086d8674-20c9-4263-9ea9-5f246012044f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "ce12fb85-d626-4eb5-8388-0e0654d44ed0", + "metadata": {}, + "source": [ + "#### Problem NN: alt t-test for the mean of Batch 04 (Example 1BT)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "974407f2-0133-4082-9a04-89f71569d267", + "metadata": {}, + "outputs": [], + "source": [ + "muK0 = 1000 # population mean (expected kombucha volume)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 5, + "id": "74b4f817-c6a5-4287-9c06-81185c599a06", + "metadata": {}, + "outputs": [], + "source": [ + "kombucha = pd.read_csv(\"../datasets/kombucha.csv\")\n", + "ksample04 = kombucha[kombucha[\"batch\"]==4][\"volume\"]\n", + "n04 = len(ksample04)\n", + "obsmean04 = np.mean(ksample04)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "da0edb55-02b2-401b-b82c-717d2d9ea40d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.225161704465105" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# bootstrap estimate for standard error of the mean\n", + "from stats_helpers import gen_boot_dist\n", + "\n", + "np.random.seed(42)\n", + "kbars_boot04 = gen_boot_dist(ksample04, estfunc=np.mean)\n", + "sehat_boot04 = np.std(kbars_boot04)\n", + "sehat_boot04" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e1c9bcc1-d4d3-42ac-a65f-d9acbeff900d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.1289747190340322" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# compute the t statistic using bootstrap se\n", + "obst04bt = (obsmean04 - muK0) / sehat_boot04\n", + "obst04bt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "44527229-74f4-4473-a83d-18aca57366be", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.003314349648233716" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy.stats import t as tdist\n", + "from stats_helpers import tailprobs\n", + "rvT04 = tdist(n04 - 1)\n", + "pvalue04bt = tailprobs(rvT04, obst04bt, alt=\"two-sided\")\n", + "pvalue04bt" + ] + }, + { + "cell_type": "markdown", + "id": "b237fe8d-4ffd-40d0-a348-2b8cfcd146db", + "metadata": {}, + "source": [ + "The $p$-value is very small,\n", + "so 
our decision is to reject $H_0$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "821916cf-16dc-481c-be96-f35031bb1561", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "3c38c16b-9adc-4038-9256-aa7a79cf411a", + "metadata": {}, + "source": [ + "#### Problem NN: alt t-test for the mean of Batch 01 (Example 2BT)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "02d06cdb-777c-4cfd-8cda-f8ad61133805", + "metadata": {}, + "outputs": [], + "source": [ + "muK0 = 1000 # population mean (expected kombucha volume)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "48395f89-ed9a-4ca1-9fd9-ad96ce5e113b", + "metadata": {}, + "outputs": [], + "source": [ + "kombucha = pd.read_csv(\"../datasets/kombucha.csv\")\n", + "ksample01 = kombucha[kombucha[\"batch\"]==1][\"volume\"]\n", + "n01 = len(ksample01)\n", + "obsmean01 = np.mean(ksample01)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d884bf11-1d94-4b6f-bc0c-38423b290239", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.530831183342292" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# bootstrap estimate for standard error of the mean\n", + "from stats_helpers import gen_boot_dist\n", + "np.random.seed(42)\n", + "kbars_boot01 = gen_boot_dist(ksample01, estfunc=np.mean)\n", + "sehat_boot01 = np.std(kbars_boot01)\n", + "sehat_boot01" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9b753c28-dc9c-42c5-81ff-04c873f4198f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-0.5854662550335628" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# compute the t statistic using bootstrap se\n", + "obst01bt = (obsmean01 - muK0) / sehat_boot01\n", + "obst01bt" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + 
"id": "fd48f6e0-a6a1-4ef1-919b-3a6e50027935", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5616069624592427" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy.stats import t as tdist\n", + "from stats_helpers import tailprobs\n", + "rvT01 = tdist(n01-1)\n", + "pvalue01bt = tailprobs(rvT01, obst01bt, alt=\"two-sided\")\n", + "pvalue01bt" + ] + }, + { + "cell_type": "markdown", + "id": "fb10344f-e830-4b05-912c-9130b7fd3f35", + "metadata": {}, + "source": [ + "The $p$-value is very large,\n", + "so we have no reason to reject $H_0$." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/problems/stats_helpers.py b/problems/stats_helpers.py new file mode 120000 index 00000000..1315c255 --- /dev/null +++ b/problems/stats_helpers.py @@ -0,0 +1 @@ +../notebooks/stats_helpers.py \ No newline at end of file