<!-- how_big_sample.html -->

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>

<meta charset="utf-8">
<meta name="generator" content="quarto-1.6.1">

<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">


<title>30&nbsp; How Large a Sample? – Resampling statistics</title>
<style>
/* Inline base styles for this page: generic text/layout helpers,
   code-block layout and line numbering, and citation (CSL) formatting. */

/* Inline code wraps at the container edge while keeping its whitespace. */
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
/* Multi-column layout: .columns is the flex row, each .column scrolls
   horizontally on its own when its content is too wide. */
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
/* Hanging indent: the negative text-indent pulls the first line back by the
   same amount the block is indented, so only continuation lines are indented. */
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
/* Task lists: drop the bullet and position the checkbox in its place. */
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ 
  vertical-align: middle;
}
/* CSS for syntax highlighting */
/* Highlighted code keeps its line breaks (no wrapping); each direct child
   span of the code element is styled as one line of source. */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { line-height: 1.25; }
/* Give empty line spans an explicit height. */
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
/* On screen, an over-wide code block scrolls inside its wrapper. */
@media screen {
div.sourceCode { overflow: auto; }
}
/* In print, long lines wrap instead; the matching negative text-indent and
   padding-left give wrapped continuation lines a 5em hanging indent. */
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
}
/* Line numbering for pre.numberSource: the code element resets the
   "source-line" counter and every line span increments it. */
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
/* The number itself is generated content on the first anchor of each line;
   the user-select rules make it non-selectable. */
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
  }
pre.numberSource { margin-left: 3em;  padding-left: 4px; }
/* Empty rule: declares nothing. */
div.sourceCode
  {   }
/* On screen, underline the generated ::before content of each line's first anchor. */
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
/* CSS for citations */
/* Empty rule: declares nothing. */
div.csl-bib-body { }
/* Each bibliography entry starts below any preceding floats. */
div.csl-entry {
  clear: both;
  margin-bottom: 0em;
}
/* Hanging-indent bibliography style: first line flush left, rest indented 2em. */
.hanging-indent div.csl-entry {
  margin-left:2em;
  text-indent:-2em;
}
/* Two-part entries: the left-margin part floats left, and the right-inline
   part is offset to its right. */
div.csl-left-margin {
  min-width:2em;
  float:left;
}
div.csl-right-inline {
  margin-left:2em;
  padding-left:1em;
}
div.csl-indent {
  margin-left: 2em;
}</style>


<script src="site_libs/quarto-nav/quarto-nav.js"></script>
<script src="site_libs/quarto-nav/headroom.min.js"></script>
<script src="site_libs/clipboard/clipboard.min.js"></script>
<script src="site_libs/quarto-search/autocomplete.umd.js"></script>
<script src="site_libs/quarto-search/fuse.min.js"></script>
<script src="site_libs/quarto-search/quarto-search.js"></script>
<meta name="quarto:offset" content="./">
<link href="./bayes_simulation.html" rel="next">
<link href="./correlation_causation.html" rel="prev">
<script src="site_libs/quarto-html/quarto.js"></script>
<script src="site_libs/quarto-html/popper.min.js"></script>
<script src="site_libs/quarto-html/tippy.umd.min.js"></script>
<script src="site_libs/quarto-html/anchor.min.js"></script>
<link href="site_libs/quarto-html/tippy.css" rel="stylesheet">
<link href="site_libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="site_libs/bootstrap/bootstrap.min.js"></script>
<link href="site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="site_libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
<script id="quarto-search-options" type="application/json">{
  "location": "sidebar",
  "copy-button": false,
  "collapse-after": 3,
  "panel-placement": "start",
  "type": "textbox",
  "limit": 50,
  "keyboard-shortcut": [
    "f",
    "/",
    "s"
  ],
  "show-item-context": false,
  "language": {
    "search-no-results-text": "No results",
    "search-matching-documents-text": "matching documents",
    "search-copy-link-title": "Copy link to search",
    "search-hide-matches-text": "Hide additional matches",
    "search-more-match-text": "more match in this document",
    "search-more-matches-text": "more matches in this document",
    "search-clear-button-title": "Clear",
    "search-text-placeholder": "",
    "search-detached-cancel-button-title": "Cancel",
    "search-submit-button-title": "Submit",
    "search-label": "Search"
  }
}</script>
<script type="text/javascript">
  // Add the "lightable" styling classes to every <table> once the DOM is parsed.
  //
  // Uses the plain DOM API rather than jQuery: no jQuery <script> is loaded
  // ahead of this snippet in the <head>, so a `$(document).ready(...)` call
  // here throws a ReferenceError and the classes are never applied.
  (function () {
    function addLightableClasses() {
      document.querySelectorAll('table').forEach(function (table) {
        table.classList.add('lightable-paper', 'lightable-striped', 'lightable-hover');
      });
    }

    // Mirror jQuery's ready(): wait for parsing to finish, or run immediately
    // if the document has already been parsed by the time this executes.
    if (document.readyState === 'loading') {
      document.addEventListener('DOMContentLoaded', addLightableClasses);
    } else {
      addLightableClasses();
    }
  })();
</script>


<link rel="stylesheet" href="style.css">
<link rel="stylesheet" href="font-awesome.min.css">
</head>

<body class="nav-sidebar floating">

<div id="quarto-search-results"></div>
  <header id="quarto-header" class="headroom fixed-top">
  <nav class="quarto-secondary-nav">
    <div class="container-fluid d-flex">
      <button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" role="button" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
        <i class="bi bi-layout-text-sidebar-reverse"></i>
      </button>
        <nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="./how_big_sample.html"><span class="chapter-number">30</span>&nbsp; <span class="chapter-title">How Large a Sample?</span></a></li></ol></nav>
        <a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">      
        </a>
      <button type="button" class="btn quarto-search-button" aria-label="Search" onclick="window.quartoOpenSearch();">
        <i class="bi bi-search"></i>
      </button>
    </div>
  </nav>
</header>
<!-- content -->
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article">
<!-- sidebar -->
  <nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation floating overflow-auto">
    <div class="pt-lg-2 mt-2 text-left sidebar-header">
    <div class="sidebar-title mb-0 py-0">
      <a href="./">Resampling statistics</a> 
    </div>
      </div>
        <div class="mt-2 flex-shrink-0 align-items-center">
        <div class="sidebar-search">
        <div id="quarto-search" class="" title="Search"></div>
        </div>
        </div>
    <div class="sidebar-menu-container"> 
    <ul class="list-unstyled mt-1">
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./index.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text">Python version</span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./preface_third.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text">Preface to the third edition</span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./preface_second.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text">Preface to the second edition</span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./intro.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">1</span>&nbsp; <span class="chapter-title">Introduction</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./resampling_method.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">2</span>&nbsp; <span class="chapter-title">The resampling method</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./what_is_probability.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">3</span>&nbsp; <span class="chapter-title">What is probability?</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./about_technology.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">4</span>&nbsp; <span class="chapter-title">Introducing Python and the Jupyter notebook</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./resampling_with_code.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">5</span>&nbsp; <span class="chapter-title">Resampling with code</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./resampling_with_code2.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">6</span>&nbsp; <span class="chapter-title">More resampling with code</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./sampling_tools.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">7</span>&nbsp; <span class="chapter-title">Tools for samples and sampling</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./probability_theory_1a.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">8</span>&nbsp; <span class="chapter-title">Probability Theory, Part 1</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./probability_theory_1b.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">9</span>&nbsp; <span class="chapter-title">Probability Theory Part 1 (continued)</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./more_sampling_tools.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">10</span>&nbsp; <span class="chapter-title">Two puzzles and more tools</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./probability_theory_2_compound.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">11</span>&nbsp; <span class="chapter-title">Probability Theory, Part 2: Compound Probability</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./probability_theory_3.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">12</span>&nbsp; <span class="chapter-title">Probability Theory, Part 3</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./probability_theory_4_finite.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">13</span>&nbsp; <span class="chapter-title">Probability Theory, Part 4: Estimating Probabilities from Finite Universes</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./sampling_variability.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">14</span>&nbsp; <span class="chapter-title">On Variability in Sampling</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./monte_carlo.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">15</span>&nbsp; <span class="chapter-title">The Procedures of Monte Carlo Simulation (and Resampling)</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./standard_scores.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">16</span>&nbsp; <span class="chapter-title">Ranks, Quantiles and Standard Scores</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./inference_ideas.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">17</span>&nbsp; <span class="chapter-title">The Basic Ideas in Statistical Inference</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./inference_intro.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">18</span>&nbsp; <span class="chapter-title">Introduction to Statistical Inference</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./point_estimation.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">19</span>&nbsp; <span class="chapter-title">Point Estimation</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./framing_questions.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">20</span>&nbsp; <span class="chapter-title">Framing Statistical Questions</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./testing_counts_1.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">21</span>&nbsp; <span class="chapter-title">Hypothesis-Testing with Counted Data, Part 1</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./significance.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">22</span>&nbsp; <span class="chapter-title">The Concept of Statistical Significance in Testing Hypotheses</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./testing_counts_2.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">23</span>&nbsp; <span class="chapter-title">The Statistics of Hypothesis-Testing with Counted Data, Part 2</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./testing_measured.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">24</span>&nbsp; <span class="chapter-title">The Statistics of Hypothesis-Testing With Measured Data</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./testing_procedures.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">25</span>&nbsp; <span class="chapter-title">General Procedures for Testing Hypotheses</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./confidence_1.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">26</span>&nbsp; <span class="chapter-title">Confidence Intervals, Part 1: Assessing the Accuracy of Samples</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./confidence_2.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">27</span>&nbsp; <span class="chapter-title">Confidence Intervals, Part 2: The Two Approaches to Estimating Confidence Intervals</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./reliability_average.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">28</span>&nbsp; <span class="chapter-title">Some Last Words About the Reliability of Sample Averages</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./correlation_causation.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">29</span>&nbsp; <span class="chapter-title">Correlation and Causation</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./how_big_sample.html" class="sidebar-item-text sidebar-link active">
 <span class="menu-text"><span class="chapter-number">30</span>&nbsp; <span class="chapter-title">How Large a Sample?</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./bayes_simulation.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">31</span>&nbsp; <span class="chapter-title">Bayesian Analysis by Simulation</span></span></a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./references.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text">References</span></a>
  </div>
</li>
        <li class="sidebar-item sidebar-item-section">
      <div class="sidebar-item-container"> 
            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
 <span class="menu-text">Appendices</span></a>
          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
            <i class="bi bi-chevron-right ms-2"></i>
          </a> 
      </div>
      <ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">  
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./exercise_solutions.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">A</span>&nbsp; <span class="chapter-title">Exercise Solutions</span></span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./technical_note.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">B</span>&nbsp; <span class="chapter-title">Technical Note to the Professional Reader</span></span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./acknowlegements.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">C</span>&nbsp; <span class="chapter-title">Acknowledgements</span></span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./code_topics.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">D</span>&nbsp; <span class="chapter-title">Code topics</span></span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./errors_suggestions.html" class="sidebar-item-text sidebar-link">
 <span class="menu-text"><span class="chapter-number">E</span>&nbsp; <span class="chapter-title">Errors and suggestions</span></span></a>
  </div>
</li>
      </ul>
  </li>
    </ul>
    </div>
</nav>
<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
<!-- margin-sidebar -->
    <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
        <nav id="TOC" role="doc-toc" class="toc-active">
    <h2 id="toc-title">Table of contents</h2>
   
  <ul>
  <li><a href="#issues-in-determining-sample-size" id="toc-issues-in-determining-sample-size" class="nav-link active" data-scroll-target="#issues-in-determining-sample-size"><span class="header-section-number">30.1</span> Issues in determining sample size</a></li>
  <li><a href="#some-practical-examples" id="toc-some-practical-examples" class="nav-link" data-scroll-target="#some-practical-examples"><span class="header-section-number">30.2</span> Some practical examples</a>
  <ul class="collapse">
  <li><a href="#sec-proportion-radio" id="toc-sec-proportion-radio" class="nav-link" data-scroll-target="#sec-proportion-radio"><span class="header-section-number">30.2.1</span> Example: what proportion of homes are listening to a radio station?</a></li>
  <li><a href="#example-average-weight-gain-for-pig-rations" id="toc-example-average-weight-gain-for-pig-rations" class="nav-link" data-scroll-target="#example-average-weight-gain-for-pig-rations"><span class="header-section-number">30.2.2</span> Example: average weight gain for pig rations</a></li>
  </ul></li>
  <li><a href="#example-sample-size-for-inference-on-fruit-fly-sex-difference" id="toc-example-sample-size-for-inference-on-fruit-fly-sex-difference" class="nav-link" data-scroll-target="#example-sample-size-for-inference-on-fruit-fly-sex-difference"><span class="header-section-number">30.3</span> Example: sample size for inference on fruit fly sex difference</a>
  <ul class="collapse">
  <li><a href="#example-sample-size-for-an-internet-provider-poll" id="toc-example-sample-size-for-an-internet-provider-poll" class="nav-link" data-scroll-target="#example-sample-size-for-an-internet-provider-poll"><span class="header-section-number">30.3.1</span> Example: sample size for an internet-provider poll</a></li>
  <li><a href="#example-how-large-a-sample-for-pig-rations" id="toc-example-how-large-a-sample-for-pig-rations" class="nav-link" data-scroll-target="#example-how-large-a-sample-for-pig-rations"><span class="header-section-number">30.3.2</span> Example: how large a sample for pig rations?</a></li>
  </ul></li>
  <li><a href="#step-wise-sample-size-determination" id="toc-step-wise-sample-size-determination" class="nav-link" data-scroll-target="#step-wise-sample-size-determination"><span class="header-section-number">30.4</span> Step-wise sample-size determination</a></li>
  <li><a href="#summary" id="toc-summary" class="nav-link" data-scroll-target="#summary"><span class="header-section-number">30.5</span> Summary</a></li>
  </ul>
</nav>
    </div>
<!-- main -->
<main class="content" id="quarto-document-content">

<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title"><span class="chapter-number">30</span>&nbsp; <span class="chapter-title">How Large a Sample?</span></h1>
</div>


<div class="quarto-title-meta">

    
  </div>
  

</header>


<section id="issues-in-determining-sample-size" class="level2" data-number="30.1">
<h2 data-number="30.1" class="anchored" data-anchor-id="issues-in-determining-sample-size"><span class="header-section-number">30.1</span> Issues in determining sample size</h2>
<p>Sometime in the course of almost every study — preferably early in the planning stage — the researcher must decide how large a sample to take. Deciding the size of sample to take is likely to puzzle and distress you at the beginning of your research career. You have to decide somehow, but there are no simple, obvious guides for the decision.</p>
<p>For example, one of the first studies I worked on was a study of library economics <span class="citation" data-cites="fussler1961patterns">(<a href="references.html#ref-fussler1961patterns" role="doc-biblioref">Fussler and Simon 1961</a>)</span>, which required taking a sample of the books from the library’s collections. Sampling was expensive, and we wanted to take a correctly sized sample. But how large should the sample be? The longer we searched the literature, and the more people we asked, the more frustrated we got because there just did not seem to be a clear-cut answer. Eventually we found out that, even though there are some fairly rational ways of fixing the sample size, most sample sizes in most studies are fixed simply (and irrationally) by the amount of money that is available or by the sample size that similar research has used in the past.</p>
<p>The rational way to choose a sample size is by weighing the benefits you can expect in information against the cost of increasing the sample size. In principle you should continue to increase the sample size until the benefit and cost of an additional sampled unit are equal.<a href="#fn1" class="footnote-ref" id="fnref1" role="doc-noteref"><sup>1</sup></a></p>
<p>The benefit of additional information is not easy to estimate even in applied research, and it is extraordinarily difficult to estimate in basic research. Therefore, it has been the practice of researchers to set up target goals of the <em>degree of accuracy</em> they wish to achieve, or to consider various degrees of accuracy that might be achieved with various sample sizes, and then to balance the degree of accuracy with the cost of achieving that accuracy. The bulk of this chapter is devoted to learning how the sample size is related to accuracy in simple situations.</p>
<p>In complex situations, however, and even in simple situations for beginners, you are likely to feel frustrated by the difficulties of relating accuracy to sample size, in which case you cry out to a supervisor, “Don’t give me complicated methods, just give me a rough number based on your greatest experience.” My inclination is to reply to you, “Sometimes life is hard and there is no shortcut.” On the other hand, perhaps you can get more information than misinformation out of knowing sample sizes that have been used in other studies. <a href="#tbl-sample-sizes-opinion" class="quarto-xref">Table&nbsp;<span>30.1</span></a> shows the most common (modal), 25th percentile, and 75th percentile sample sizes for — please keep this in mind — <em>National Opinion Surveys</em>. <a href="#tbl-sample-sizes-subgroup" class="quarto-xref">Table&nbsp;<span>30.2</span></a> shows how subgroup analyses affect sample size. The source for both tables is <em>Applied Sampling</em>, by Seymour Sudman <span class="citation" data-cites="sudman1976applied">(<a href="references.html#ref-sudman1976applied" role="doc-biblioref">1976, 86–87</a>)</span>, copyright Academic Press, reprinted by permission.</p>
<p>Pretest sample sizes are smaller, of course, perhaps 25-100 observations. Samples in research for Master’s and Ph.D.&nbsp;theses are likely to be closer to a pretest than to national samples.</p>
<div id="tbl-sample-sizes-opinion" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-sample-sizes-opinion-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table&nbsp;30.1: Common sample sizes for national and regional studies by subject
</figcaption>
<div aria-describedby="tbl-sample-sizes-opinion-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<colgroup>
<col style="width: 30%">
<col style="width: 10%">
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
<col style="width: 10%">
<col style="width: 7%">
</colgroup>
<thead>
<tr class="header">
<th rowspan="2">Subject Matter</th>
<th colspan="3">National</th>
<th colspan="3">Regional</th>
</tr>
<tr class="odd">
<th>Mode</th>
<th>Q3</th>
<th>Q1</th>
<th>Mode</th>
<th>Q3</th>
<th>Q1</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Financial</td>
<td>1000+</td>
<td>—</td>
<td>—</td>
<td>100</td>
<td>400</td>
<td>50</td>
</tr>
<tr class="even">
<td>Medical</td>
<td>1000+</td>
<td>1000+</td>
<td>500</td>
<td>1000+</td>
<td>1000+</td>
<td>250</td>
</tr>
<tr class="odd">
<td>Other Behavior</td>
<td>1000+</td>
<td>—</td>
<td>—</td>
<td>700</td>
<td>1000</td>
<td>300</td>
</tr>
<tr class="even">
<td>Attitudes</td>
<td>1000+</td>
<td>1000+</td>
<td>500</td>
<td>700</td>
<td>1000</td>
<td>400</td>
</tr>
<tr class="odd">
<td>Laboratory Experiments</td>
<td>—</td>
<td>—</td>
<td>—</td>
<td>100</td>
<td>200</td>
<td>50</td>
</tr>
</tbody>
</table>
</div>
</figure>
</div>
<div id="tbl-sample-sizes-subgroup" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-sample-sizes-subgroup-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table&nbsp;30.2: Typical sample sizes for studies of human and institutional populations
</figcaption>
<div aria-describedby="tbl-sample-sizes-subgroup-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<colgroup>
<col style="width: 28%">
<col style="width: 17%">
<col style="width: 16%">
<col style="width: 17%">
<col style="width: 16%">
</colgroup>
<thead>
<tr class="header">
<th rowspan="2">Subgroup analysis</th>
<th colspan="2">People or households</th>
<th colspan="2">Institutions</th>
</tr>
<tr class="odd">
<th>National</th>
<th>Special</th>
<th>National</th>
<th>Special</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>None or few</td>
<td>1000-1500</td>
<td>200-500</td>
<td>200-500</td>
<td>50-200</td>
</tr>
<tr class="even">
<td>Average</td>
<td>1500-2500</td>
<td>500-1000</td>
<td>500-1000</td>
<td>200-500</td>
</tr>
<tr class="odd">
<td>Many</td>
<td>2500+</td>
<td>1000+</td>
<td>1000+</td>
<td>500+</td>
</tr>
</tbody>
</table>
</div>
</figure>
</div>
<p>Once again, the sample size ought to depend on the proportions of the sample that have the characteristics you are interested in, the extent to which you want to learn about subgroups as well as the universe as a whole, and of course the purpose of your study, the value of the information, and the cost. Also, keep in mind that the <em>added</em> information that you obtain from an additional sample observation tends to be smaller as the sample size gets larger. You must quadruple the sample to halve the error.</p>
<p>Now let us consider some specific cases. The first examples taken up here are from the descriptive type of study, and the later ones deal with sample sizes in relationship research.</p>
</section>
<section id="some-practical-examples" class="level2" data-number="30.2">
<h2 data-number="30.2" class="anchored" data-anchor-id="some-practical-examples"><span class="header-section-number">30.2</span> Some practical examples</h2>
<section id="sec-proportion-radio" class="level3" data-number="30.2.1">
<h3 data-number="30.2.1" class="anchored" data-anchor-id="sec-proportion-radio"><span class="header-section-number">30.2.1</span> Example: what proportion of homes are listening to a radio station?</h3>
<p>What proportion of the homes in Countryville watch television station WCNT’s ten o’clock news program? That is the question your phone survey aims to answer, and you want to know how many randomly selected homes you must phone to obtain a sufficiently large sample.</p>
<p>Begin by guessing the likeliest answer, say 30 percent in this case. Do not worry if you are off by 5 percent or even 10 percent; you will probably not be further off than that. Select a first-approximation sample size of perhaps 400; this number is selected from my general experience, but it is just a starting point. Then proceed through the first 400 numbers in a random-number table, marking down a <em>yes</em> for numbers 1-3 and <em>no</em> for numbers 4-10 (because 3/10 was your estimate of the proportion watching). Then count the number of <em>yes</em> and <em>no</em> answers. Carry out perhaps ten sets of such trials, the results of which are in <a href="#tbl-phone-trials" class="quarto-xref">Table&nbsp;<span>30.3</span></a>.</p>
<div id="tbl-phone-trials" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-phone-trials-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table&nbsp;30.3: Ten example trials from phone survey simulation
</figcaption>
<div aria-describedby="tbl-phone-trials-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<colgroup>
<col style="width: 20%">
<col style="width: 20%">
<col style="width: 19%">
<col style="width: 27%">
</colgroup>
<thead>
<tr class="header">
<th>Trial number</th>
<th>Number “yes”</th>
<th>Number “no”</th>
<th>% difference from expected mean of 30% (120 “yes”)</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>1</td>
<td>115</td>
<td>285</td>
<td>1.25</td>
</tr>
<tr class="even">
<td>2</td>
<td>119</td>
<td>281</td>
<td>0.25</td>
</tr>
<tr class="odd">
<td>3</td>
<td>116</td>
<td>284</td>
<td>1.00</td>
</tr>
<tr class="even">
<td>4</td>
<td>114</td>
<td>286</td>
<td>1.50</td>
</tr>
<tr class="odd">
<td>5</td>
<td>107</td>
<td>293</td>
<td>3.25</td>
</tr>
<tr class="even">
<td>6</td>
<td>116</td>
<td>284</td>
<td>1.00</td>
</tr>
<tr class="odd">
<td>7</td>
<td>132</td>
<td>268</td>
<td>3.00</td>
</tr>
<tr class="even">
<td>8</td>
<td>123</td>
<td>277</td>
<td>0.75</td>
</tr>
<tr class="odd">
<td>9</td>
<td>121</td>
<td>279</td>
<td>0.25</td>
</tr>
<tr class="even">
<td>10</td>
<td>114</td>
<td>286</td>
<td>1.50</td>
</tr>
</tbody><tfoot>
<tr class="odd">
<td colspan="3">Mean</td>
<td>1.375</td>
</tr>
</tfoot>

</table>
</div>
</figure>
</div>
<p>Based on these ten trials, you can estimate that if you take a sample of 400 and if the “real” viewing level is 30 percent, your average percentage error will be 1.375 percent on either side of 30 percent. That is, with a sample of 400, roughly half the time your error will be greater than 1.375 percent if 3/10 of the universe is listening.</p>
<p>Now you must decide whether the estimated error is small enough for your needs. If you want greater accuracy than a sample of 400 will give you, increase the sample size, using this important rule of thumb: To cut the error in half, you must <em>quadruple</em> the sample size. In other words, if you want a sample that will give you an error of only 0.6875 percent on the average, you must increase the sample size to 1,600 interviews. Similarly, if you cut the sample size to 100, the average error will be only 2.75 percent (double 1.375 percent) on either side of 30 percent. If you distrust this rule of thumb, run ten or so trials on sample sizes of 100 or 1,600, and see what error you can expect to obtain on the average.</p>
<p>If the “real” viewership is 20 percent or 40 percent, instead of 30 percent, the accuracy you will obtain from a sample size of 400 will not be very different from an “actual” viewership of 30 percent, so do not worry about that too much, as long as you are in the right general vicinity.</p>
<p>Accuracy is <em>slightly</em> greater in smaller universes but <em>only</em> slightly. For example, a sample of 400 would give <em>perfect</em> accuracy if Countryville had only 400 residents. And a sample of 400 will give <em>slightly</em> greater accuracy for a town of 800 residents than for a city of 80,000 residents. But, beyond the point at which the sample is a <em>large fraction</em> of the total universe, there is no difference in accuracy with increases in the size of universe. This point is very important. For any given level of accuracy, <em>identical</em> sample sizes give the same level of accuracy for Podunk (population 8,000) or New York City (population 8 million). The <em>ratio</em> of the sample size to the population of Podunk or New York City means nothing at all, even though it intuitively seems to be important.</p>
<p>The size of the sample must depend upon which population or sub-populations you wish to describe. For example, Alfred Kinsey’s sample size for the classic “Sexual Behavior in the Human Male” <span class="citation" data-cites="kinsey1948sexual">(<a href="references.html#ref-kinsey1948sexual" role="doc-biblioref">1948</a>)</span> would have seemed large, by customary practice, for generalizations about the United States population as a whole. But, as Kinsey explains: “… the chief concern of the present study is an understanding of the sexual behavior of <em>each segment of the population</em>, and that it is only secondarily concerned with generalization for the population as a whole.” <span class="citation" data-cites="kinsey1948sexual">(<a href="references.html#ref-kinsey1948sexual" role="doc-biblioref">1948, 82</a>, italics added)</span>. Therefore Kinsey’s sample had to include sub-samples large enough to obtain the desired accuracy in <em>each</em> of these sub-universes. The U.S. Census offers a similar illustration. When the U.S. Bureau of the Census aims to estimate only a total or an average for the United States as a whole — as, for example, in the Current Population Survey estimate of unemployment — a sample of perhaps 50,000 is big enough. But the decennial census aims to make estimates for all the various communities in the country, estimates that require adequate sub-samples in each of these sub-universes; such is the justification for the decennial census’ sample size of so many millions. Television ratings illustrate both types of purpose. Nielsen ratings, for example, are sold primarily to national network advertisers. These advertisers on national television networks usually sell their goods all across the country and are therefore interested primarily in the total United States viewership for a program, rather than in the viewership in various demographic subgroups. 
The appropriate calculations for Nielsen sample size will therefore refer to the total United States sample. But other organizations sell rating services to <em>local</em> television and radio stations for use in soliciting advertising over the local stations rather than over the network as a whole. Each local sample must then be large enough to provide reasonable accuracy, and, considered as a whole, the samples for the local stations therefore add up to a much larger sample than the Nielsen and other nationwide samples.</p>
<p>The problem may be handled with the following Python program. This program represents viewers with the string <code>'viewer'</code> and non-viewers with the string <code>'not viewer'</code>. It then asks <span class="python"><code>rnd.choice</code></span> to choose randomly between <code>'viewer'</code> and <code>'not viewer'</code> with a 30% (p=0.3) chance of getting a <code>'viewer'</code> and a 70% chance of getting a <code>'not viewer'</code>. It gets a sample of 400 such strings, counts (with <span class="python"><code>np.sum</code></span>) the “viewers”, then finds how much this sample diverges from the expected number of viewers (30% of 400 = 120), expressed as a proportion of the sample size. It repeats this procedure 10000 times, and then calculates the average divergence.</p>
<div id="nte-viewer_numbers" class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note&nbsp;30.1: Notebook: Number of viewers
</div>
</div>
<div class="callout-body-container callout-body">
<div class="nb-links">
<p><a class="notebook-link" href="notebooks/viewer_numbers.ipynb">Download notebook</a> <a class="interact-button" href="./interact/lab/index.html?path=viewer_numbers.ipynb">Interact</a></p>
</div>
</div>
</div>
<div class="nb-start" name="viewer_numbers" title="Number of viewers">

</div>
<div class="cell" data-layout-align="center">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="co"># set up the random number generator</span></span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a>rnd <span class="op">=</span> np.random.default_rng()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<div class="cell" data-layout-align="center">
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># set the number of trials</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a>n_trials <span class="op">=</span> <span class="dv">10000</span></span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="co"># an empty array to store the results</span></span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a>results <span class="op">=</span> np.zeros(n_trials)</span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="co"># What are the options to choose from?</span></span>
<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a>options <span class="op">=</span> [<span class="st">'viewer'</span>, <span class="st">'not viewer'</span>]</span>
<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-10"><a href="#cb2-10" aria-hidden="true" tabindex="-1"></a><span class="co"># do n_trials trials</span></span>
<span id="cb2-11"><a href="#cb2-11" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(n_trials):</span>
<span id="cb2-12"><a href="#cb2-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-13"><a href="#cb2-13" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Choose 'viewer' 30% of the time.</span></span>
<span id="cb2-14"><a href="#cb2-14" aria-hidden="true" tabindex="-1"></a>    a <span class="op">=</span> rnd.choice(options, size<span class="op">=</span><span class="dv">400</span>, p<span class="op">=</span>[<span class="fl">0.3</span>, <span class="fl">0.7</span>])</span>
<span id="cb2-15"><a href="#cb2-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-16"><a href="#cb2-16" aria-hidden="true" tabindex="-1"></a>    <span class="co"># count the viewers</span></span>
<span id="cb2-17"><a href="#cb2-17" aria-hidden="true" tabindex="-1"></a>    b <span class="op">=</span> np.<span class="bu">sum</span>(a <span class="op">==</span> <span class="st">'viewer'</span>)</span>
<span id="cb2-18"><a href="#cb2-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-19"><a href="#cb2-19" aria-hidden="true" tabindex="-1"></a>    <span class="co"># how different from expected?</span></span>
<span id="cb2-20"><a href="#cb2-20" aria-hidden="true" tabindex="-1"></a>    c <span class="op">=</span> <span class="dv">120</span> <span class="op">-</span> b</span>
<span id="cb2-21"><a href="#cb2-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-22"><a href="#cb2-22" aria-hidden="true" tabindex="-1"></a>    <span class="co"># absolute value of the difference</span></span>
<span id="cb2-23"><a href="#cb2-23" aria-hidden="true" tabindex="-1"></a>    d <span class="op">=</span> np.<span class="bu">abs</span>(c)</span>
<span id="cb2-24"><a href="#cb2-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-25"><a href="#cb2-25" aria-hidden="true" tabindex="-1"></a>    <span class="co"># express as a proportion of sample</span></span>
<span id="cb2-26"><a href="#cb2-26" aria-hidden="true" tabindex="-1"></a>    e <span class="op">=</span> d <span class="op">/</span> <span class="dv">400</span></span>
<span id="cb2-27"><a href="#cb2-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-28"><a href="#cb2-28" aria-hidden="true" tabindex="-1"></a>    <span class="co"># keep score of the result</span></span>
<span id="cb2-29"><a href="#cb2-29" aria-hidden="true" tabindex="-1"></a>    results[i] <span class="op">=</span> e</span>
<span id="cb2-30"><a href="#cb2-30" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-31"><a href="#cb2-31" aria-hidden="true" tabindex="-1"></a><span class="co"># find the mean divergence</span></span>
<span id="cb2-32"><a href="#cb2-32" aria-hidden="true" tabindex="-1"></a>k <span class="op">=</span> np.mean(results)</span>
<span id="cb2-33"><a href="#cb2-33" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-34"><a href="#cb2-34" aria-hidden="true" tabindex="-1"></a><span class="co"># Show the result</span></span>
<span id="cb2-35"><a href="#cb2-35" aria-hidden="true" tabindex="-1"></a>k</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>np.float64(0.018184000000000002)</code></pre>
</div>
</div>
<!---
End of notebook.
-->
<div class="nb-end">

</div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
End of notebook: Number of viewers
</div>
</div>
<div class="callout-body-container callout-body">
<p><code>viewer_numbers</code> starts at <a href="#nte-viewer_numbers" class="quarto-xref">Note&nbsp;<span>30.1</span></a>.</p>
</div>
</div>
<p>It is a simple matter to go back and try a sample size of (say) 1600 rather than 400, and examine the effect on the mean difference.</p>
</section>
<section id="example-average-weight-gain-for-pig-rations" class="level3" data-number="30.2.2">
<h3 data-number="30.2.2" class="anchored" data-anchor-id="example-average-weight-gain-for-pig-rations"><span class="header-section-number">30.2.2</span> Example: average weight gain for pig rations</h3>
<p>This example, like <a href="#sec-proportion-radio" class="quarto-xref"><span>Section 30.2.1</span></a>, illustrates the choice of sample size for estimating a summarization statistic. Later examples deal with sample sizes for probability statistics.</p>
<p>Hark back to the pig-ration problems presented earlier (e.g. <a href="testing_measured.html#sec-pig-rations-measured" class="quarto-xref"><span>Section 24.0.1</span></a>), and consider the following set of pig weight-gains recorded for ration A: 31, 34, 29, 26, 32, 35, 38, 34, 31, 29, 32, 30. Assume that our purpose now is to estimate the average weight gain for ration A, so that the feed company can advertise to farmers how much weight gain to expect from ration A. If the universe is made up of pig weight-gains like those we observed, we can simulate the universe with, say, 1 million weight gains of thirty-one pounds, 1 million of thirty-four pounds, and so on for the twelve observed weight gains. Or, more conveniently, as accuracy will not be affected much, we can make up a universe of, say, thirty cards for each thirty-one-pound gain, thirty cards for each thirty-four-pound gain, and so forth, yielding a deck of 30 x 12 = 360 cards. Then shuffle, and, just for a starting point, try sample sizes of twelve pigs. The means of the samples for twenty such trials are as in <a href="#tbl-weight-bootstrap" class="quarto-xref">Table&nbsp;<span>30.4</span></a>.</p>
<div id="tbl-weight-bootstrap" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-weight-bootstrap-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table&nbsp;30.4: Simulated average weight gains from pig ration A
</figcaption>
<div aria-describedby="tbl-weight-bootstrap-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<colgroup>
<col style="width: 15%">
<col style="width: 15%">
<col style="width: 15%">
<col style="width: 15%">
<col style="width: 15%">
<col style="width: 15%">
</colgroup>
<thead>
<tr class="header">
<th>Trial</th>
<th>Mean</th>
<th>Absolute deviation of trial mean from actual mean</th>
<th>Trial</th>
<th>Mean</th>
<th>Absolute deviation of trial mean from actual mean</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>1</td>
<td>31.77</td>
<td>.02</td>
<td>11</td>
<td>32.10</td>
<td>.35</td>
</tr>
<tr class="even">
<td>2</td>
<td>32.27</td>
<td>1.52</td>
<td>12</td>
<td>30.67</td>
<td>1.08</td>
</tr>
<tr class="odd">
<td>3</td>
<td>31.75</td>
<td>.00</td>
<td>13</td>
<td>32.42</td>
<td>.67</td>
</tr>
<tr class="even">
<td>4</td>
<td>30.83</td>
<td>.92</td>
<td>14</td>
<td>30.67</td>
<td>1.08</td>
</tr>
<tr class="odd">
<td>5</td>
<td>30.52</td>
<td>1.23</td>
<td>15</td>
<td>32.25</td>
<td>.50</td>
</tr>
<tr class="even">
<td>6</td>
<td>31.60</td>
<td>.15</td>
<td>16</td>
<td>31.60</td>
<td>.15</td>
</tr>
<tr class="odd">
<td>7</td>
<td>32.46</td>
<td>.71</td>
<td>17</td>
<td>32.33</td>
<td>.58</td>
</tr>
<tr class="even">
<td>8</td>
<td>31.10</td>
<td>.65</td>
<td>18</td>
<td>33.08</td>
<td>1.33</td>
</tr>
<tr class="odd">
<td>9</td>
<td>32.42</td>
<td>.35</td>
<td>19</td>
<td>33.01</td>
<td>1.26</td>
</tr>
<tr class="even">
<td>10</td>
<td>30.60</td>
<td>1.15</td>
<td>20</td>
<td>30.60</td>
<td>1.15</td>
</tr>
</tbody><tfoot>
<tr class="odd">
<td colspan="4">Mean</td>
<td colspan="2">31.75</td>
</tr>
</tfoot>

</table>
</div>
</figure>
</div>
<p>Now ask yourself whether a sample size of twelve pigs gives you enough accuracy. If we sort the absolute deviations, we find the middle two values (the 10th and 11th values) of 20 are 0.67 and 0.71, so the median is the average of these values: 0.69. There is a .5 chance that the mean for any given sample will be more than 0.69 points from the mean of the universe that generates such samples, which in this situation is 31.75 pounds. Is this close enough? That is up to you to decide in light of the purposes for which you are running the experiment. (The logic of the inference you make here is inevitably murky, and use of the term “real mean” can make it even murkier, as is seen in the discussion in <a href="confidence_1.html" class="quarto-xref"><span>Chapter 26</span></a> — <a href="reliability_average.html" class="quarto-xref"><span>Chapter 28</span></a> on confidence intervals.)</p>
<p>To see how accuracy is affected by larger samples, try a sample size of forty-eight “pigs” dealt from the same deck. (But, if the sample size were to be much larger than forty-eight, you might need a “universe” greater than 360 cards.) The results of twenty trials are in <a href="#tbl-weight-forty-eight" class="quarto-xref">Table&nbsp;<span>30.5</span></a>.</p>
<div id="tbl-weight-forty-eight" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-weight-forty-eight-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table&nbsp;30.5: Simulated average weight gain from 48 pigs
</figcaption>
<div aria-describedby="tbl-weight-forty-eight-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<colgroup>
<col style="width: 15%">
<col style="width: 15%">
<col style="width: 15%">
<col style="width: 15%">
<col style="width: 15%">
<col style="width: 15%">
</colgroup>
<thead>
<tr class="header">
<th>Trial</th>
<th>Mean</th>
<th>Absolute deviation of trial mean from actual mean</th>
<th>Trial</th>
<th>Mean</th>
<th>Absolute deviation of trial mean from actual mean</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>1</td>
<td>31.80</td>
<td>.05</td>
<td>11</td>
<td>31.93</td>
<td>.18</td>
</tr>
<tr class="even">
<td>2</td>
<td>32.27</td>
<td>.52</td>
<td>12</td>
<td>32.40</td>
<td>.65</td>
</tr>
<tr class="odd">
<td>3</td>
<td>31.82</td>
<td>.07</td>
<td>13</td>
<td>31.32</td>
<td>.43</td>
</tr>
<tr class="even">
<td>4</td>
<td>31.39</td>
<td>.36</td>
<td>14</td>
<td>32.07</td>
<td>.68</td>
</tr>
<tr class="odd">
<td>5</td>
<td>31.22</td>
<td>.53</td>
<td>15</td>
<td>32.03</td>
<td>.28</td>
</tr>
<tr class="even">
<td>6</td>
<td>31.88</td>
<td>.13</td>
<td>16</td>
<td>31.95</td>
<td>.20</td>
</tr>
<tr class="odd">
<td>7</td>
<td>31.37</td>
<td>.38</td>
<td>17</td>
<td>31.75</td>
<td>.00</td>
</tr>
<tr class="even">
<td>8</td>
<td>31.48</td>
<td>.27</td>
<td>18</td>
<td>31.11</td>
<td>.64</td>
</tr>
<tr class="odd">
<td>9</td>
<td>31.20</td>
<td>.55</td>
<td>19</td>
<td>31.96</td>
<td>.21</td>
</tr>
<tr class="even">
<td>10</td>
<td>32.01</td>
<td>.26</td>
<td>20</td>
<td>31.32</td>
<td>.43</td>
</tr>
</tbody><tfoot>
<tr class="odd">
<td colspan="4">Mean</td>
<td colspan="2">31.75</td>
</tr>
</tfoot>

</table>
</div>
</figure>
</div>
<p>The median of the absolute deviations in <a href="#tbl-weight-forty-eight" class="quarto-xref">Table&nbsp;<span>30.5</span></a> is 0.32. In half the trials with a sample size of forty-eight, the difference between the sample mean and the “real” mean of 31.75 will be .32 pound or less, smaller than the 0.69 pound for samples of 12 pigs. Again, is this too little accuracy for you? If so, increase the sample size further.</p>
<p>The attentive reader of this example may have been troubled by this question: How do you know what kind of a distribution of values is contained in the universe before the sample is taken? The answer is that you guess, just as in <a href="#sec-proportion-radio" class="quarto-xref"><span>Section 30.2.1</span></a> you guessed at the mean of the universe. If you guess wrong, you will get either more accuracy or less accuracy than you expected from a given sample size, but the results will not be fatal; if you obtain more accuracy than you wanted, you have wasted some money, and, if you obtain less accuracy, your sample dispersion will tell you so, and you can then augment the sample to boost the accuracy. But an error in guessing will not introduce error into your final results.</p>
<p>The guess should be based on something, however. One source for guessing is your general knowledge of the likely dispersion; for example, if you were estimating male heights in Rhode Island, you would be able to guess what proportion of observations would fall within 2 inches, 4 inches, 6 inches, and 8 inches, perhaps, of the real value. Or, much better yet, a very small pretest will yield quite satisfactory estimates of the dispersion.</p>
<p>Here is a Python program that will let you try different sample sizes, and then take bootstrap samples to determine the range of sampling error. You set the sample size by setting the <code>sampsize</code> variable. Above I noted that we could sample without replacement from a “deck” of thirty “31”’s, thirty “34”’s, etc, as a substitute for creating a universe of a million “31”’s, a million “34”’s, etc. We can achieve the same effect if we replace each card after we sample it; this is equivalent to creating a “deck” of an infinite number of “31”’s, “34”’s, etc. That is what the <code>rnd.choice</code> command does, below. Note that the sample size is determined by the value of the <code>sampsize</code> variable, which you set at the beginning. From here on the program takes the <code>mean</code> of each sample, keeps score of that result in the <code>results</code> array and produces a histogram. The <code>quantile</code> function will also tell you what values enclose 90% of all sample results, excluding those below the 5th percentile and above the 95th percentile.</p>
<p>Here is a notebook for a sample size of 12.</p>
<div id="nte-sampling_error_bootstrap" class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note&nbsp;30.2: Notebook: Sampling error for pig ration weight gain via bootstrap
</div>
</div>
<div class="callout-body-container callout-body">
<div class="nb-links">
<p><a class="notebook-link" href="notebooks/sampling_error_bootstrap.ipynb">Download notebook</a> <a class="interact-button" href="./interact/lab/index.html?path=sampling_error_bootstrap.ipynb">Interact</a></p>
</div>
</div>
</div>
<div class="nb-start" name="sampling_error_bootstrap" title="Sampling error for pig ration weight gain via bootstrap">

</div>
<div class="cell" data-layout-align="center">
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a>rnd <span class="op">=</span> np.random.default_rng()</span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a>sampsize <span class="op">=</span> <span class="dv">12</span></span>
<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a>gains <span class="op">=</span> np.array([<span class="dv">31</span>, <span class="dv">34</span>, <span class="dv">29</span>, <span class="dv">26</span>, <span class="dv">32</span>, <span class="dv">35</span>, <span class="dv">38</span>, <span class="dv">34</span>, <span class="dv">32</span>, <span class="dv">31</span>, <span class="dv">30</span>, <span class="dv">29</span>])</span>
<span id="cb4-9"><a href="#cb4-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-10"><a href="#cb4-10" aria-hidden="true" tabindex="-1"></a>n_trials <span class="op">=</span> <span class="dv">10_000</span></span>
<span id="cb4-11"><a href="#cb4-11" aria-hidden="true" tabindex="-1"></a>results <span class="op">=</span> np.zeros(n_trials)</span>
<span id="cb4-12"><a href="#cb4-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-13"><a href="#cb4-13" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(n_trials):</span>
<span id="cb4-14"><a href="#cb4-14" aria-hidden="true" tabindex="-1"></a>    fake_gains <span class="op">=</span> rnd.choice(gains, size<span class="op">=</span>sampsize)</span>
<span id="cb4-15"><a href="#cb4-15" aria-hidden="true" tabindex="-1"></a>    results[i] <span class="op">=</span> np.mean(fake_gains)</span>
<span id="cb4-16"><a href="#cb4-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-17"><a href="#cb4-17" aria-hidden="true" tabindex="-1"></a>plt.hist(results, bins<span class="op">=</span><span class="dv">25</span>)</span>
<span id="cb4-18"><a href="#cb4-18" aria-hidden="true" tabindex="-1"></a>plt.title(<span class="st">'Distribution of mean of '</span> <span class="op">+</span> <span class="bu">str</span>(sampsize) <span class="op">+</span></span>
<span id="cb4-19"><a href="#cb4-19" aria-hidden="true" tabindex="-1"></a>          <span class="st">' weights from '</span> <span class="op">+</span> <span class="bu">str</span>(n_trials) <span class="op">+</span> <span class="st">' bootstrap samples'</span>)</span>
<span id="cb4-20"><a href="#cb4-20" aria-hidden="true" tabindex="-1"></a>plt.xlabel(<span class="st">'Mean weight'</span>)</span>
<span id="cb4-21"><a href="#cb4-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-22"><a href="#cb4-22" aria-hidden="true" tabindex="-1"></a><span class="co"># Find values such that to 5%, 95% of values are below given value.</span></span>
<span id="cb4-23"><a href="#cb4-23" aria-hidden="true" tabindex="-1"></a><span class="co"># (These are the 5% and 95% percentile values).</span></span>
<span id="cb4-24"><a href="#cb4-24" aria-hidden="true" tabindex="-1"></a>hi_lo_range <span class="op">=</span> np.quantile(results, [<span class="fl">0.05</span>, <span class="fl">0.95</span>])</span>
<span id="cb4-25"><a href="#cb4-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-26"><a href="#cb4-26" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">'5% and 95% percentiles of bootstrap means:'</span>, np.<span class="bu">round</span>(hi_lo_range, <span class="dv">2</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>5% and 95% percentiles of bootstrap means: [30.25 33.25]</code></pre>
</div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="how_big_sample_files/figure-html/unnamed-chunk-4-1.png" class="img-fluid quarto-figure quarto-figure-center figure-img" style="width:70.0%"></p>
</figure>
</div>
</div>
</div>
<div class="nb-end">

</div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
End of notebook: Sampling error for pig ration weight gain via bootstrap
</div>
</div>
<div class="callout-body-container callout-body">
<p><code>sampling_error_bootstrap</code> starts at <a href="#nte-sampling_error_bootstrap" class="quarto-xref">Note&nbsp;<span>30.2</span></a>.</p>
</div>
</div>
<!---
End of notebook.
-->
</section>
</section>
<section id="example-sample-size-for-inference-on-fruit-fly-sex-difference" class="level2" data-number="30.3">
<h2 data-number="30.3" class="anchored" data-anchor-id="example-sample-size-for-inference-on-fruit-fly-sex-difference"><span class="header-section-number">30.3</span> Example: sample size for inference on fruit fly sex difference</h2>
<p>This is the first example of sample-size estimation for <em>probability</em> (testing) statistics, rather than the summarization statistics dealt with above.</p>
<p>Recall the problem of the sex of fruit-fly offspring discussed in <a href="testing_counts_1.html#sec-fruitfly" class="quarto-xref"><span>Section 21.2.1</span></a>. The question now is, how large a sample is needed to determine whether the radiation treatment results in a sex ratio other than a 50-50 male-female split?</p>
<p>The first step is, as usual, difficult but necessary. As the researcher, you must <em>guess</em> what the sex ratio will be if the treatment <em>does</em> have an effect. Let’s say that you use all your general knowledge of genetics and of this treatment and that you guess the sex ratio will be 75 percent males and 25 percent females <em>if</em> the treatment alters the ratio from 50-50.</p>
<p>In a table of random numbers from 00-99, let “01-25” stand for females and “26-00” for males. Take twenty successive pairs of numbers for each trial, and run perhaps fifty trials, as in <a href="#tbl-flies-more-males" class="quarto-xref">Table&nbsp;<span>30.6</span></a>.</p>
<div id="tbl-flies-more-males" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-flies-more-males-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table&nbsp;30.6: Simulated numbers of males and females for 75/25% universe
</figcaption>
<div aria-describedby="tbl-flies-more-males-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<colgroup>
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
</colgroup>
<thead>
<tr class="header">
<th>Trial</th>
<th>Females</th>
<th>Males</th>
<th>Trial</th>
<th>Females</th>
<th>Males</th>
<th>Trial</th>
<th>Females</th>
<th>Males</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>1</td>
<td>4</td>
<td>16</td>
<td>18</td>
<td>7</td>
<td>13</td>
<td>34</td>
<td>4</td>
<td>16</td>
</tr>
<tr class="even">
<td>2</td>
<td>6</td>
<td>14</td>
<td>19</td>
<td>3</td>
<td>17</td>
<td>35</td>
<td>6</td>
<td>14</td>
</tr>
<tr class="odd">
<td>3</td>
<td>6</td>
<td>14</td>
<td>20</td>
<td>7</td>
<td>13</td>
<td>36</td>
<td>3</td>
<td>17</td>
</tr>
<tr class="even">
<td>4</td>
<td>5</td>
<td>15</td>
<td>21</td>
<td>4</td>
<td>16</td>
<td>37</td>
<td>8</td>
<td>12</td>
</tr>
<tr class="odd">
<td>5</td>
<td>5</td>
<td>15</td>
<td>22</td>
<td>4</td>
<td>16</td>
<td>38</td>
<td>4</td>
<td>16</td>
</tr>
<tr class="even">
<td>6</td>
<td>3</td>
<td>17</td>
<td>23</td>
<td>5</td>
<td>15</td>
<td>39</td>
<td>3</td>
<td>17</td>
</tr>
<tr class="odd">
<td>7</td>
<td>7</td>
<td>13</td>
<td>24</td>
<td>8</td>
<td>12</td>
<td>40</td>
<td>6</td>
<td>14</td>
</tr>
<tr class="even">
<td>8</td>
<td>6</td>
<td>14</td>
<td>25</td>
<td>4</td>
<td>16</td>
<td>41</td>
<td>5</td>
<td>15</td>
</tr>
<tr class="odd">
<td>9</td>
<td>3</td>
<td>17</td>
<td>26</td>
<td>1</td>
<td>19</td>
<td>42</td>
<td>2</td>
<td>18</td>
</tr>
<tr class="even">
<td>10</td>
<td>2</td>
<td>18</td>
<td>27</td>
<td>5</td>
<td>15</td>
<td>43</td>
<td>8</td>
<td>12</td>
</tr>
<tr class="odd">
<td>11</td>
<td>6</td>
<td>14</td>
<td>28</td>
<td>3</td>
<td>17</td>
<td>44</td>
<td>4</td>
<td>16</td>
</tr>
<tr class="even">
<td>12</td>
<td>1</td>
<td>19</td>
<td>29</td>
<td>8</td>
<td>12</td>
<td>45</td>
<td>6</td>
<td>14</td>
</tr>
<tr class="odd">
<td>13</td>
<td>6</td>
<td>14</td>
<td>30</td>
<td>8</td>
<td>12</td>
<td>46</td>
<td>5</td>
<td>15</td>
</tr>
<tr class="even">
<td>14</td>
<td>3</td>
<td>17</td>
<td>31</td>
<td>5</td>
<td>15</td>
<td>47</td>
<td>3</td>
<td>17</td>
</tr>
<tr class="odd">
<td>15</td>
<td>1</td>
<td>19</td>
<td>32</td>
<td>3</td>
<td>17</td>
<td>48</td>
<td>5</td>
<td>15</td>
</tr>
<tr class="even">
<td>16</td>
<td>5</td>
<td>15</td>
<td>33</td>
<td>4</td>
<td>16</td>
<td>49</td>
<td>3</td>
<td>17</td>
</tr>
<tr class="odd">
<td>17</td>
<td>5</td>
<td>15</td>
<td></td>
<td></td>
<td></td>
<td>50</td>
<td>5</td>
<td>15</td>
</tr>
</tbody>
</table>
</div>
</figure>
</div>
<p>In <a href="testing_counts_1.html#sec-fruitfly" class="quarto-xref"><span>Section 21.2.1</span></a> with a sample of twenty flies that contained fourteen or more males, we found only an 8% probability that such an extreme sample would result from a 50-50 universe. Therefore, if we observe such an extreme sample, we rule out a 50-50 universe.</p>
<p>Now <a href="#tbl-flies-more-males" class="quarto-xref">Table&nbsp;<span>30.6</span></a> tells us that, if the ratio is <em>really</em> 75 to 25, then a sample of twenty will show fourteen or more males forty-two of fifty times (84 percent of the time). If we take a sample of twenty flies and if the ratio is really 75-25, we will make the correct decision by deciding that the split is not 50-50, 84 percent of the time.</p>
<p>Perhaps you are not satisfied with reaching the right conclusion only 84 percent of the time. In that case, still assuming that the ratio will really be 75-25 if it is not 50-50, you need to take a sample larger than twenty flies. How much larger? That depends on how much surer you want to be. Follow the same procedure for a sample size of perhaps eighty flies. First work out for a sample of eighty, as was done in <a href="testing_counts_1.html#sec-fruitfly" class="quarto-xref"><span>Section 21.2.1</span></a> for a sample of twenty, the number of males out of eighty that you would need to find for the odds to be, say, 9 to 1 that the universe is not 50-50; your estimate turns out to be forty-eight males. Then run fifty trials of eighty flies each on the basis of 75-25 probability, and see how often you would not get as many as forty-eight males in the sample. <a href="#tbl-flies-morer-males" class="quarto-xref">Table&nbsp;<span>30.7</span></a> shows the results we got. No trial was anywhere near as low as forty-eight, which suggests that a sample of eighty is larger than necessary if the split is really 75-25.</p>
<div id="tbl-flies-morer-males" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-flies-morer-males-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table&nbsp;30.7: Simulated males / females from 80 flies for 75/25% universe
</figcaption>
<div aria-describedby="tbl-flies-morer-males-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<colgroup>
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
</colgroup>
<thead>
<tr class="header">
<th>Trial</th>
<th>Females</th>
<th>Males</th>
<th>Trial</th>
<th>Females</th>
<th>Males</th>
<th>Trial</th>
<th>Females</th>
<th>Males</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>1</td>
<td>21</td>
<td>59</td>
<td>18</td>
<td>13</td>
<td>67</td>
<td>34</td>
<td>21</td>
<td>59</td>
</tr>
<tr class="even">
<td>2</td>
<td>22</td>
<td>58</td>
<td>19</td>
<td>19</td>
<td>61</td>
<td>35</td>
<td>17</td>
<td>63</td>
</tr>
<tr class="odd">
<td>3</td>
<td>13</td>
<td>67</td>
<td>20</td>
<td>17</td>
<td>63</td>
<td>36</td>
<td>22</td>
<td>58</td>
</tr>
<tr class="even">
<td>4</td>
<td>15</td>
<td>65</td>
<td>21</td>
<td>17</td>
<td>63</td>
<td>37</td>
<td>19</td>
<td>61</td>
</tr>
<tr class="odd">
<td>5</td>
<td>22</td>
<td>58</td>
<td>22</td>
<td>18</td>
<td>62</td>
<td>38</td>
<td>21</td>
<td>59</td>
</tr>
<tr class="even">
<td>6</td>
<td>21</td>
<td>59</td>
<td>23</td>
<td>26</td>
<td>54</td>
<td>39</td>
<td>21</td>
<td>59</td>
</tr>
<tr class="odd">
<td>7</td>
<td>13</td>
<td>67</td>
<td>24</td>
<td>20</td>
<td>60</td>
<td>40</td>
<td>21</td>
<td>59</td>
</tr>
<tr class="even">
<td>8</td>
<td>24</td>
<td>56</td>
<td>25</td>
<td>16</td>
<td>64</td>
<td>41</td>
<td>21</td>
<td>59</td>
</tr>
<tr class="odd">
<td>9</td>
<td>16</td>
<td>64</td>
<td>26</td>
<td>22</td>
<td>58</td>
<td>42</td>
<td>18</td>
<td>62</td>
</tr>
<tr class="even">
<td>10</td>
<td>21</td>
<td>59</td>
<td>27</td>
<td>16</td>
<td>64</td>
<td>43</td>
<td>19</td>
<td>61</td>
</tr>
<tr class="odd">
<td>11</td>
<td>20</td>
<td>60</td>
<td>28</td>
<td>21</td>
<td>59</td>
<td>44</td>
<td>17</td>
<td>63</td>
</tr>
<tr class="even">
<td>12</td>
<td>19</td>
<td>61</td>
<td>29</td>
<td>22</td>
<td>58</td>
<td>45</td>
<td>13</td>
<td>67</td>
</tr>
<tr class="odd">
<td>13</td>
<td>21</td>
<td>59</td>
<td>30</td>
<td>21</td>
<td>59</td>
<td>46</td>
<td>16</td>
<td>64</td>
</tr>
<tr class="even">
<td>14</td>
<td>17</td>
<td>63</td>
<td>31</td>
<td>22</td>
<td>58</td>
<td>47</td>
<td>21</td>
<td>59</td>
</tr>
<tr class="odd">
<td>15</td>
<td>22</td>
<td>58</td>
<td>32</td>
<td>19</td>
<td>61</td>
<td>48</td>
<td>16</td>
<td>64</td>
</tr>
<tr class="even">
<td>16</td>
<td>22</td>
<td>58</td>
<td>33</td>
<td>10</td>
<td>70</td>
<td>49</td>
<td>17</td>
<td>63</td>
</tr>
<tr class="odd">
<td>17</td>
<td>17</td>
<td>63</td>
<td></td>
<td></td>
<td></td>
<td>50</td>
<td>21</td>
<td>59</td>
</tr>
</tbody>
</table>
</div>
</figure>
</div>
<p>It is obvious that, if the split you guess at is 60 to 40 rather than 75 to 25, you will need a bigger sample to obtain the “correct” result with the same probability. For example, run some eighty-fly random-number trials with 1-60 representing males and 61-100 representing females. <a href="#tbl-flies-moreish-males" class="quarto-xref">Table&nbsp;<span>30.8</span></a> shows that only twenty-four of fifty (48 percent) of the trials reach the necessary cut-off at which one would judge that a sample of eighty really does not come from a universe that is split 50-50; therefore, a sample of eighty is not big enough if the split is 60-40.</p>
<div id="tbl-flies-moreish-males" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-flies-moreish-males-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table&nbsp;30.8: Simulated males / females from 80 flies for 60/40% universe
</figcaption>
<div aria-describedby="tbl-flies-moreish-males-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<colgroup>
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
<col style="width: 10%">
<col style="width: 13%">
<col style="width: 10%">
</colgroup>
<thead>
<tr class="header">
<th>Trial</th>
<th>Females</th>
<th>Males</th>
<th>Trial</th>
<th>Females</th>
<th>Males</th>
<th>Trial</th>
<th>Females</th>
<th>Males</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>1</td>
<td>35</td>
<td>45</td>
<td>18</td>
<td>32</td>
<td>48</td>
<td>34</td>
<td>35</td>
<td>45</td>
</tr>
<tr class="even">
<td>2</td>
<td>36</td>
<td>44</td>
<td>19</td>
<td>28</td>
<td>52</td>
<td>35</td>
<td>36</td>
<td>44</td>
</tr>
<tr class="odd">
<td>3</td>
<td>35</td>
<td>45</td>
<td>20</td>
<td>32</td>
<td>48</td>
<td>36</td>
<td>29</td>
<td>51</td>
</tr>
<tr class="even">
<td>4</td>
<td>35</td>
<td>45</td>
<td>21</td>
<td>33</td>
<td>47</td>
<td>37</td>
<td>36</td>
<td>44</td>
</tr>
<tr class="odd">
<td>5</td>
<td>36</td>
<td>44</td>
<td>22</td>
<td>37</td>
<td>43</td>
<td>38</td>
<td>36</td>
<td>44</td>
</tr>
<tr class="even">
<td>6</td>
<td>36</td>
<td>44</td>
<td>23</td>
<td>36</td>
<td>44</td>
<td>39</td>
<td>31</td>
<td>49</td>
</tr>
<tr class="odd">
<td>7</td>
<td>36</td>
<td>44</td>
<td>24</td>
<td>31</td>
<td>49</td>
<td>40</td>
<td>29</td>
<td>51</td>
</tr>
<tr class="even">
<td>8</td>
<td>34</td>
<td>46</td>
<td>25</td>
<td>27</td>
<td>53</td>
<td>41</td>
<td>30</td>
<td>50</td>
</tr>
<tr class="odd">
<td>9</td>
<td>34</td>
<td>46</td>
<td>26</td>
<td>30</td>
<td>50</td>
<td>42</td>
<td>35</td>
<td>45</td>
</tr>
<tr class="even">
<td>10</td>
<td>29</td>
<td>51</td>
<td>27</td>
<td>31</td>
<td>49</td>
<td>43</td>
<td>32</td>
<td>48</td>
</tr>
<tr class="odd">
<td>11</td>
<td>29</td>
<td>51</td>
<td>28</td>
<td>33</td>
<td>47</td>
<td>44</td>
<td>30</td>
<td>50</td>
</tr>
<tr class="even">
<td>12</td>
<td>32</td>
<td>48</td>
<td>29</td>
<td>37</td>
<td>43</td>
<td>45</td>
<td>37</td>
<td>43</td>
</tr>
<tr class="odd">
<td>13</td>
<td>29</td>
<td>51</td>
<td>30</td>
<td>30</td>
<td>50</td>
<td>46</td>
<td>31</td>
<td>49</td>
</tr>
<tr class="even">
<td>14</td>
<td>31</td>
<td>49</td>
<td>31</td>
<td>31</td>
<td>49</td>
<td>47</td>
<td>36</td>
<td>44</td>
</tr>
<tr class="odd">
<td>15</td>
<td>28</td>
<td>52</td>
<td>32</td>
<td>32</td>
<td>48</td>
<td>48</td>
<td>34</td>
<td>46</td>
</tr>
<tr class="even">
<td>16</td>
<td>33</td>
<td>47</td>
<td>33</td>
<td>34</td>
<td>46</td>
<td>49</td>
<td>29</td>
<td>51</td>
</tr>
<tr class="odd">
<td>17</td>
<td>36</td>
<td>44</td>
<td></td>
<td></td>
<td></td>
<td>50</td>
<td>37</td>
<td>43</td>
</tr>
</tbody>
</table>
</div>
</figure>
</div>
<p>To review the main principles of this example: First, the closer together the two possible universes from which you think the sample might have come (50-50 and 60-40 are closer together than are 50-50 and 75-25), the larger the sample needed to distinguish between them. Second, the surer you want to be that you reach the right decision based upon the sample evidence, the larger the sample you need.</p>
<p>The problem may be handled with the following Python notebook. We construct a benchmark universe that is 60-40 male-female, and take samples of size 80, observing whether the numbers of males and females differ enough in these resamples to rule out a 50-50 universe. Recall that we need at least 48 of 80 males to say that the proportion of males is <em>not</em> 50%.</p>
<div id="nte-flies_sample_size" class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note&nbsp;30.3: Notebook: Sample size for detecting fruitfly sex difference
</div>
</div>
<div class="callout-body-container callout-body">
<div class="nb-links">
<p><a class="notebook-link" href="notebooks/flies_sample_size.ipynb">Download notebook</a> <a class="interact-button" href="./interact/lab/index.html?path=flies_sample_size.ipynb">Interact</a></p>
</div>
</div>
</div>
<div class="nb-start" name="flies_sample_size" title="Sample size for detecting fruitfly sex difference">

</div>
<div class="cell" data-layout-align="center">
<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a>rnd <span class="op">=</span> np.random.default_rng()</span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a>n_trials <span class="op">=</span> <span class="dv">10_000</span></span>
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Results for each trial.</span></span>
<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a>results <span class="op">=</span> np.zeros(n_trials)</span>
<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Do 10,000 trials</span></span>
<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(n_trials):</span>
<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Generate 80 "flies" with 0.6 chance of male, 0.4 of female.</span></span>
<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a>    flies <span class="op">=</span> rnd.choice([<span class="st">'male'</span>, <span class="st">'female'</span>], size<span class="op">=</span><span class="dv">80</span>, p<span class="op">=</span>[<span class="fl">0.6</span>, <span class="fl">0.4</span>])</span>
<span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Count the males.</span></span>
<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a>    n_males <span class="op">=</span> np.<span class="bu">sum</span>(flies <span class="op">==</span> <span class="st">'male'</span>)</span>
<span id="cb6-16"><a href="#cb6-16" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Keep score.</span></span>
<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a>    results[i] <span class="op">=</span> n_males</span>
<span id="cb6-18"><a href="#cb6-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a><span class="co"># How many of the trials produced 48 or more males?</span></span>
<span id="cb6-20"><a href="#cb6-20" aria-hidden="true" tabindex="-1"></a>k <span class="op">=</span> np.<span class="bu">sum</span>(results <span class="op">&gt;=</span> <span class="dv">48</span>)</span>
<span id="cb6-21"><a href="#cb6-21" aria-hidden="true" tabindex="-1"></a><span class="co"># Convert to a proportion</span></span>
<span id="cb6-22"><a href="#cb6-22" aria-hidden="true" tabindex="-1"></a>kk <span class="op">=</span> k <span class="op">/</span> n_trials</span>
<span id="cb6-23"><a href="#cb6-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-24"><a href="#cb6-24" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">'Proportion of 60/40 trials giving &gt;= 48 males:'</span>, kk)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>Proportion of 60/40 trials giving &gt;= 48 males: 0.5534</code></pre>
</div>
</div>
<p>If the result <code>kk</code> is close to 1, we then know that samples of size 80 will almost always produce samples with enough males to avoid misleading us into thinking that they could have come from a universe in which males and females are split 50-50.</p>
<div class="nb-end">

</div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
End of notebook: Sample size for detecting fruitfly sex difference
</div>
</div>
<div class="callout-body-container callout-body">
<p><code>flies_sample_size</code> starts at <a href="#nte-flies_sample_size" class="quarto-xref">Note&nbsp;<span>30.3</span></a>.</p>
</div>
</div>
<!---
End of notebook.
-->
<section id="example-sample-size-for-an-internet-provider-poll" class="level3" data-number="30.3.1">
<h3 data-number="30.3.1" class="anchored" data-anchor-id="example-sample-size-for-an-internet-provider-poll"><span class="header-section-number">30.3.1</span> Example: sample size for an internet-provider poll</h3>
<p>Referring back to <a href="testing_counts_1.html#sec-contract-poll" class="quarto-xref"><span>Section 21.2.3</span></a>, on the internet provider poll, how large a sample <em>should</em> you have taken? Pretend that the data have not yet been collected. You need <em>some</em> estimate of how the results will turn out before you can select a sample size. But you have not the foggiest idea how the results will turn out. Therefore, go out and take a very small sample, maybe ten people, to give you some idea of whether people will split quite evenly or unevenly. Seven of your ten initial interviews say they are for the internet provider contract. How large a sample do you now need to provide an answer of which you can be fairly sure?</p>
<p>Using the techniques of the previous chapter, we can estimate that from a sample of fifty people at least thirty-two would have to vote the same way for you to believe that the odds are at least 19 to 1 that the sample does not misrepresent the universe, that is, that the sample does not show a majority different from that of the whole universe if you polled everyone.</p>
<p>We do this by repeating the experiment in <a href="testing_counts_1.html#sec-contract-poll" class="quarto-xref"><span>Section 21.2.3</span></a>. In that experiment, we were doing simulated trials in the 50:50 world, and looking at the proportion of simulated trials where the count of yes votes was &gt;= 30. We found the proportion of trials was about 10%. We are interested to know the count for which we get less than 5%. We can do this by checking the proportions for &gt;=31, &gt;= 32, and so on. It turns out that counts &gt;= 32 occur a bit less than 5% of the time in the 50:50 world.</p>
<div id="nte-contract_poll_32" class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note&nbsp;30.4: Notebook: Contract poll looking for &gt;=32
</div>
</div>
<div class="callout-body-container callout-body">
<div class="nb-links">
<p><a class="notebook-link" href="notebooks/contract_poll_32.ipynb">Download notebook</a> <a class="interact-button" href="./interact/lab/index.html?path=contract_poll_32.ipynb">Interact</a></p>
</div>
</div>
</div>
<div class="nb-start" name="contract_poll_32" title="Contract poll looking for >=32">

</div>
<p>This Python notebook generates samples of 50 simulated voters on the assumption that only 50 percent are in favor of the contract. Then it counts the number of generated samples where 32 or more of the 50 respondents said they were in favor of the contract.</p>
<div class="cell" data-layout-align="center">
<div class="sourceCode cell-code" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a>rnd <span class="op">=</span> np.random.default_rng()</span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a>n_trials <span class="op">=</span> <span class="dv">10_000</span></span>
<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a>yeses <span class="op">=</span> np.zeros(n_trials)</span>
<span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(n_trials):</span>
<span id="cb8-11"><a href="#cb8-11" aria-hidden="true" tabindex="-1"></a>    answers <span class="op">=</span> rnd.choice([<span class="st">'No'</span>, <span class="st">'Yes'</span>], size<span class="op">=</span><span class="dv">50</span>)</span>
<span id="cb8-12"><a href="#cb8-12" aria-hidden="true" tabindex="-1"></a>    yeses[i] <span class="op">=</span> np.<span class="bu">sum</span>(answers <span class="op">==</span> <span class="st">'Yes'</span>)</span>
<span id="cb8-13"><a href="#cb8-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-14"><a href="#cb8-14" aria-hidden="true" tabindex="-1"></a>k <span class="op">=</span> np.<span class="bu">sum</span>(yeses <span class="op">&gt;=</span> <span class="dv">32</span>)</span>
<span id="cb8-15"><a href="#cb8-15" aria-hidden="true" tabindex="-1"></a>kk <span class="op">=</span> k <span class="op">/</span> n_trials</span>
<span id="cb8-16"><a href="#cb8-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-17"><a href="#cb8-17" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">'Proportion &gt;= 32:'</span>, np.<span class="bu">round</span>(kk, <span class="dv">2</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>Proportion &gt;= 32: 0.03</code></pre>
</div>
</div>
<div class="nb-end">

</div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
End of notebook: Contract poll looking for &gt;=32
</div>
</div>
<div class="callout-body-container callout-body">
<p><code>contract_poll_32</code> starts at <a href="#nte-contract_poll_32" class="quarto-xref">Note&nbsp;<span>30.4</span></a>.</p>
</div>
</div>
<!---
End of notebook.
-->
<p>We know, therefore, that if we see a voter “yes” count &gt;= 32, there is only a small (&lt;5%) chance that it arose from the 50:50 world.</p>
<p>Therefore, designate numbers 1-30 as <em>no</em> and 31-00 as <em>yes</em> in the random-number table (that is, 70 percent, as in your estimate based on your presample of ten), work through a trial sample size of fifty, and count the number of <em>yeses</em>. Run through perhaps ten or fifteen trials, and reckon how often the observed number of <em>yeses</em> is &gt;= 32 (the number you must reach for a result you can rely on). In <a href="#tbl-cable-yes" class="quarto-xref">Table&nbsp;<span>30.9</span></a> we see that a sample of fifty respondents, from a universe split 70-30, will show that many <em>yeses</em> a preponderant proportion of the time — in fact, in fifteen of fifteen experiments; therefore, the sample size of fifty is large enough if the split is “really” 70-30.</p>
<div id="tbl-cable-yes" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-cable-yes-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table&nbsp;30.9: Number of “yes” votes out of 50 for 70% in favor universe
</figcaption>
<div aria-describedby="tbl-cable-yes-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<thead>
<tr class="header">
<th>Trial</th>
<th>No</th>
<th>Yes</th>
<th>Trial</th>
<th>No</th>
<th>Yes</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>1</td>
<td>13</td>
<td>37</td>
<td>9</td>
<td>15</td>
<td>35</td>
</tr>
<tr class="even">
<td>2</td>
<td>14</td>
<td>36</td>
<td>10</td>
<td>9</td>
<td>41</td>
</tr>
<tr class="odd">
<td>3</td>
<td>18</td>
<td>32</td>
<td>11</td>
<td>15</td>
<td>35</td>
</tr>
<tr class="even">
<td>4</td>
<td>10</td>
<td>40</td>
<td>12</td>
<td>15</td>
<td>35</td>
</tr>
<tr class="odd">
<td>5</td>
<td>13</td>
<td>37</td>
<td>13</td>
<td>9</td>
<td>41</td>
</tr>
<tr class="even">
<td>6</td>
<td>15</td>
<td>35</td>
<td>14</td>
<td>16</td>
<td>34</td>
</tr>
<tr class="odd">
<td>7</td>
<td>14</td>
<td>36</td>
<td>15</td>
<td>17</td>
<td>33</td>
</tr>
</tbody>
</table>
</div>
</figure>
</div>
<p>The following Python program takes samples of size 50 from a universe that is 70% “yes.” It then observes how often such samples produce more than 31 “yeses” — the number we must get if we are to be sure enough that the sample is not from a 50/50 universe.</p>
<div id="nte-poll_sample_size" class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note&nbsp;30.5: Notebook: Sample size for an internet contract poll
</div>
</div>
<div class="callout-body-container callout-body">
<div class="nb-links">
<p><a class="notebook-link" href="notebooks/poll_sample_size.ipynb">Download notebook</a> <a class="interact-button" href="./interact/lab/index.html?path=poll_sample_size.ipynb">Interact</a></p>
</div>
</div>
</div>
<div class="nb-start" name="poll_sample_size" title="Sample size for an internet contract poll">

</div>
<div class="cell" data-layout-align="center">
<div class="sourceCode cell-code" id="cb10"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a>rnd <span class="op">=</span> np.random.default_rng()</span>
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of trials.</span></span>
<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a>n_trials <span class="op">=</span> <span class="dv">10_000</span></span>
<span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb10-8"><a href="#cb10-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Make array to store results for each trial.</span></span>
<span id="cb10-9"><a href="#cb10-9" aria-hidden="true" tabindex="-1"></a>results <span class="op">=</span> np.zeros(n_trials)</span>
<span id="cb10-10"><a href="#cb10-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb10-11"><a href="#cb10-11" aria-hidden="true" tabindex="-1"></a><span class="co"># Do 10,000 trials</span></span>
<span id="cb10-12"><a href="#cb10-12" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(n_trials):</span>
<span id="cb10-13"><a href="#cb10-13" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Generate 50 voters with 70% chance of "yes".</span></span>
<span id="cb10-14"><a href="#cb10-14" aria-hidden="true" tabindex="-1"></a>    voters <span class="op">=</span> rnd.choice([<span class="st">"yes"</span>, <span class="st">"no"</span>], size<span class="op">=</span><span class="dv">50</span>, p<span class="op">=</span>[<span class="fl">0.7</span>, <span class="fl">0.3</span>])</span>
<span id="cb10-15"><a href="#cb10-15" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Count the "yeses".</span></span>
<span id="cb10-16"><a href="#cb10-16" aria-hidden="true" tabindex="-1"></a>    n_yes <span class="op">=</span> np.<span class="bu">sum</span>(voters <span class="op">==</span> <span class="st">'yes'</span>)</span>
<span id="cb10-17"><a href="#cb10-17" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Keep score of the result.</span></span>
<span id="cb10-18"><a href="#cb10-18" aria-hidden="true" tabindex="-1"></a>    results[i] <span class="op">=</span> n_yes</span>
<span id="cb10-19"><a href="#cb10-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb10-20"><a href="#cb10-20" aria-hidden="true" tabindex="-1"></a><span class="co"># Count how often the sample result &gt;= our 32 cutoff (recall that samples</span></span>
<span id="cb10-21"><a href="#cb10-21" aria-hidden="true" tabindex="-1"></a><span class="co"># with 31 or fewer "yeses" cannot be ruled out of a 50/50 universe).</span></span>
<span id="cb10-22"><a href="#cb10-22" aria-hidden="true" tabindex="-1"></a>k <span class="op">=</span> np.<span class="bu">sum</span>(results <span class="op">&gt;=</span> <span class="dv">32</span>)</span>
<span id="cb10-23"><a href="#cb10-23" aria-hidden="true" tabindex="-1"></a><span class="co"># Convert to a proportion</span></span>
<span id="cb10-24"><a href="#cb10-24" aria-hidden="true" tabindex="-1"></a>kk <span class="op">=</span> k <span class="op">/</span> n_trials</span>
<span id="cb10-25"><a href="#cb10-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb10-26"><a href="#cb10-26" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">'p of 50 voter samples in 70:30 universe &gt;= 32 "yes":'</span>, kk)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>p of 50 voter samples in 70:30 universe &gt;= 32 "yes": 0.859</code></pre>
</div>
</div>
<div class="nb-end">

</div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
End of notebook: Sample size for an internet contract poll
</div>
</div>
<div class="callout-body-container callout-body">
<p><code>poll_sample_size</code> starts at <a href="#nte-poll_sample_size" class="quarto-xref">Note&nbsp;<span>30.5</span></a>.</p>
</div>
</div>
<!---
End of notebook.
-->
<p>If <code>kk</code> is close to 1, we can be confident that this sample will be large enough to avoid a result that we might mistakenly think comes from a 50/50 universe (provided that the real universe is 70% favorable).</p>
</section>
<section id="example-how-large-a-sample-for-pig-rations" class="level3" data-number="30.3.2">
<h3 data-number="30.3.2" class="anchored" data-anchor-id="example-how-large-a-sample-for-pig-rations"><span class="header-section-number">30.3.2</span> Example: how large a sample for pig rations?</h3>
<p>How large a sample is needed to determine whether there is any difference between the two pig rations in <a href="testing_measured.html#sec-pig-rations-measured" class="quarto-xref"><span>Section 24.0.1</span></a>? The first step is to guess the results of the tests. You estimate that the average for ration A will be a weight gain of 32 pounds. You further guess that twelve pigs on ration A might gain 36, 35, 34, 33, 33, 32, 32, 31, 31, 30, 29 and 28 pounds. This set of guesses has an equal number of pigs above and below the average and more pigs close to the average than farther away. That is, there are more pigs at 33 and 31 pounds than at 36 and 28 pounds. This would seem to be a reasonable distribution of pigs around an average of 32 pounds. In similar fashion, you guess an average weight gain of 28 pounds for ration B and a distribution of 32, 31, 30, 29, 29, 28, 28, 27, 27, 26, 25, and 24 pounds.</p>
<p>Let us review the basic strategy. We want to find a sample size large enough so that a large proportion of the time it will reveal a difference between groups big enough to be accepted as not attributable to chance. First, then, we need to find out how big the difference must be to be accepted as evidence that the difference is not attributable to chance. We do so from trials with samples of the given size from the benchmark universe. We state that a difference larger than the benchmark universe will usually produce is not attributable to chance.</p>
<p>In this case, let us try samples of 12 pigs on each ration. First we draw two samples from a <em>combined</em> benchmark universe made up of the results that we have guessed will come from ration A and ration B. (The procedure is the same as was followed in <a href="testing_measured.html#sec-pig-rations-measured" class="quarto-xref"><span>Section 24.0.1</span></a>).</p>
<div id="nte-sample_size_rations" class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note&nbsp;30.6: Notebook: Sample size for pig rations with bootstrap
</div>
</div>
<div class="callout-body-container callout-body">
<div class="nb-links">
<p><a class="notebook-link" href="notebooks/sample_size_rations.ipynb">Download notebook</a> <a class="interact-button" href="./interact/lab/index.html?path=sample_size_rations.ipynb">Interact</a></p>
</div>
</div>
</div>
<div class="nb-start" name="sample_size_rations" title="Sample size for pig rations with bootstrap">

</div>
<p>First we set up the guessed weight gains for each ration as NumPy arrays, and find how large a difference between sample means arises by chance in the combined benchmark universe:</p>
<div class="cell" data-layout-align="center">
<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a><span class="co"># set up the random number generator</span></span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a>rnd <span class="op">=</span> np.random.default_rng()</span>
<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-7"><a href="#cb12-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Estimated weights for ration A.</span></span>
<span id="cb12-8"><a href="#cb12-8" aria-hidden="true" tabindex="-1"></a>a_weights <span class="op">=</span> np.array([<span class="dv">36</span>, <span class="dv">35</span>, <span class="dv">34</span>, <span class="dv">33</span>, <span class="dv">33</span>, <span class="dv">32</span>, <span class="dv">32</span>, <span class="dv">31</span>, <span class="dv">31</span>, <span class="dv">30</span>, <span class="dv">29</span>, <span class="dv">28</span>])</span>
<span id="cb12-9"><a href="#cb12-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Estimated weights for ration B.</span></span>
<span id="cb12-10"><a href="#cb12-10" aria-hidden="true" tabindex="-1"></a>b_weights <span class="op">=</span> np.array([<span class="dv">32</span>, <span class="dv">31</span>, <span class="dv">30</span>, <span class="dv">29</span>, <span class="dv">29</span>, <span class="dv">28</span>, <span class="dv">28</span>, <span class="dv">27</span>, <span class="dv">27</span>, <span class="dv">26</span>, <span class="dv">25</span>, <span class="dv">24</span>])</span>
<span id="cb12-11"><a href="#cb12-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-12"><a href="#cb12-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Make a combined (benchmark) universe from the weights.</span></span>
<span id="cb12-13"><a href="#cb12-13" aria-hidden="true" tabindex="-1"></a>both <span class="op">=</span> np.concatenate([a_weights, b_weights])</span>
<span id="cb12-14"><a href="#cb12-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-15"><a href="#cb12-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the number of trials</span></span>
<span id="cb12-16"><a href="#cb12-16" aria-hidden="true" tabindex="-1"></a>n_trials <span class="op">=</span> <span class="dv">10_000</span></span>
<span id="cb12-17"><a href="#cb12-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-18"><a href="#cb12-18" aria-hidden="true" tabindex="-1"></a><span class="co"># An empty array to store the trial results.</span></span>
<span id="cb12-19"><a href="#cb12-19" aria-hidden="true" tabindex="-1"></a>results <span class="op">=</span> np.zeros(n_trials)</span>
<span id="cb12-20"><a href="#cb12-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-21"><a href="#cb12-21" aria-hidden="true" tabindex="-1"></a><span class="co"># Do 10,000 experiments.</span></span>
<span id="cb12-22"><a href="#cb12-22" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(n_trials):</span>
<span id="cb12-23"><a href="#cb12-23" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Take a "resample" of 12 with replacement from both and put it in fake_a</span></span>
<span id="cb12-24"><a href="#cb12-24" aria-hidden="true" tabindex="-1"></a>    fake_a <span class="op">=</span> rnd.choice(both, size<span class="op">=</span><span class="dv">12</span>)</span>
<span id="cb12-25"><a href="#cb12-25" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Likewise to make fake_b</span></span>
<span id="cb12-26"><a href="#cb12-26" aria-hidden="true" tabindex="-1"></a>    fake_b <span class="op">=</span> rnd.choice(both, size<span class="op">=</span><span class="dv">12</span>)</span>
<span id="cb12-27"><a href="#cb12-27" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Mean of the first "resample" sample.</span></span>
<span id="cb12-28"><a href="#cb12-28" aria-hidden="true" tabindex="-1"></a>    fake_a_mean <span class="op">=</span> np.mean(fake_a)</span>
<span id="cb12-29"><a href="#cb12-29" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Mean of the second "resample" sample.</span></span>
<span id="cb12-30"><a href="#cb12-30" aria-hidden="true" tabindex="-1"></a>    fake_b_mean <span class="op">=</span> np.mean(fake_b)</span>
<span id="cb12-31"><a href="#cb12-31" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Calculate the difference between the two resamples.</span></span>
<span id="cb12-32"><a href="#cb12-32" aria-hidden="true" tabindex="-1"></a>    fake_diff <span class="op">=</span> fake_a_mean <span class="op">-</span> fake_b_mean</span>
<span id="cb12-33"><a href="#cb12-33" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Keep track of each trial result.</span></span>
<span id="cb12-34"><a href="#cb12-34" aria-hidden="true" tabindex="-1"></a>    results[i] <span class="op">=</span> fake_diff</span>
<span id="cb12-35"><a href="#cb12-35" aria-hidden="true" tabindex="-1"></a>    <span class="co"># End one experiment, go back and repeat until all trials are complete,</span></span>
<span id="cb12-36"><a href="#cb12-36" aria-hidden="true" tabindex="-1"></a>    <span class="co"># then proceed.</span></span>
<span id="cb12-37"><a href="#cb12-37" aria-hidden="true" tabindex="-1"></a><span class="co"># Produce a histogram of trial results.</span></span>
<span id="cb12-38"><a href="#cb12-38" aria-hidden="true" tabindex="-1"></a>plt.hist(results, bins<span class="op">=</span><span class="dv">25</span>)</span>
<span id="cb12-39"><a href="#cb12-39" aria-hidden="true" tabindex="-1"></a>plt.xlabel(<span class="st">'Second resample mean minus first'</span>)</span>
<span id="cb12-40"><a href="#cb12-40" aria-hidden="true" tabindex="-1"></a>plt.title(<span class="st">'Distribution difference in means of resamples'</span>)</span>
<span id="cb12-41"><a href="#cb12-41" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-42"><a href="#cb12-42" aria-hidden="true" tabindex="-1"></a><span class="co"># Get the 95% percentile.  Only 5% of results are above this value, by chance.</span></span>
<span id="cb12-43"><a href="#cb12-43" aria-hidden="true" tabindex="-1"></a>q_95 <span class="op">=</span> np.quantile(results, <span class="fl">0.95</span>)</span>
<span id="cb12-44"><a href="#cb12-44" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-45"><a href="#cb12-45" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">'95% quantile for resampled mean difference:'</span>, q_95)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>95% quantile for resampled mean difference: 2.0</code></pre>
</div>
<div class="sourceCode cell-code" id="cb14"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Check the quantile.</span></span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a>k <span class="op">=</span> np.<span class="bu">sum</span>(results <span class="op">&gt;</span> q_95)</span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a>kk <span class="op">=</span> k <span class="op">/</span> n_trials</span>
<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">'Proportion &gt; 95% quantile:'</span>, kk)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>Proportion &gt; 95% quantile: 0.0439</code></pre>
</div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
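<p><img src="how_big_sample_files/figure-html/unnamed-chunk-12-1.png" class="img-fluid quarto-figure quarto-figure-center figure-img" style="width:70.0%" alt="Histogram of the differences between the means of two 12-pig resamples drawn from the combined universe"></p>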
</figure>
</div>
</div>
</div>
<p>We find that in only about 4% of the trials (a proportion of 0.0439 in the run above) the difference between the two observed groups of 12 pigs was more than 2 pounds. Now we investigate how often samples of 12 pigs, drawn from the <em>separate</em> universes, will show a mean difference larger than 2 pounds. We do so by making up a deck of 25 or 50 cards for <em>each</em> of the 12 hypothesized A’s and each of the 12 B’s, with the ration name and the weight gain written on it — that is, a deck of, say, 300 cards for each ration. Then from each deck we draw a set of 12 cards at random, record the group averages, and find the difference.</p>
<p>Here is the same work done with more runs on the computer. In this version we are sampling from the separate A and B universes we have estimated.</p>
<div class="cell" data-layout-align="center">
<div class="sourceCode cell-code" id="cb16"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="co"># A new empty array to store the trial results.</span></span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a>results <span class="op">=</span> np.zeros(n_trials)</span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Do 10,000 experiments.</span></span>
<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(n_trials):</span>
<span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Take a "resample" of 12 with replacement from A and put it in fake_a.</span></span>
<span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Notice we are sampling from "a_weights" this time.</span></span>
<span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a>    fake_a <span class="op">=</span> rnd.choice(a_weights, size<span class="op">=</span><span class="dv">12</span>)</span>
<span id="cb16-9"><a href="#cb16-9" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Likewise to make fake_b</span></span>
<span id="cb16-10"><a href="#cb16-10" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Notice we are sampling from "b_weights" this time.</span></span>
<span id="cb16-11"><a href="#cb16-11" aria-hidden="true" tabindex="-1"></a>    fake_b <span class="op">=</span> rnd.choice(b_weights, size<span class="op">=</span><span class="dv">12</span>)</span>
<span id="cb16-12"><a href="#cb16-12" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Mean of the first "resample" sample.</span></span>
<span id="cb16-13"><a href="#cb16-13" aria-hidden="true" tabindex="-1"></a>    fake_a_mean <span class="op">=</span> np.mean(fake_a)</span>
<span id="cb16-14"><a href="#cb16-14" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Mean of the second "resample" sample.</span></span>
<span id="cb16-15"><a href="#cb16-15" aria-hidden="true" tabindex="-1"></a>    fake_b_mean <span class="op">=</span> np.mean(fake_b)</span>
<span id="cb16-16"><a href="#cb16-16" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Calculate the difference between the two resamples.</span></span>
<span id="cb16-17"><a href="#cb16-17" aria-hidden="true" tabindex="-1"></a>    fake_diff <span class="op">=</span> fake_a_mean <span class="op">-</span> fake_b_mean</span>
<span id="cb16-18"><a href="#cb16-18" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Keep track of each trial result.</span></span>
<span id="cb16-19"><a href="#cb16-19" aria-hidden="true" tabindex="-1"></a>    results[i] <span class="op">=</span> fake_diff</span>
<span id="cb16-20"><a href="#cb16-20" aria-hidden="true" tabindex="-1"></a>    <span class="co"># End one experiment, go back and repeat until all trials are complete,</span></span>
<span id="cb16-21"><a href="#cb16-21" aria-hidden="true" tabindex="-1"></a>    <span class="co"># then proceed.</span></span>
<span id="cb16-22"><a href="#cb16-22" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb16-23"><a href="#cb16-23" aria-hidden="true" tabindex="-1"></a><span class="co"># Produce a histogram of trial results.</span></span>
<span id="cb16-24"><a href="#cb16-24" aria-hidden="true" tabindex="-1"></a>plt.hist(results, bins<span class="op">=</span><span class="dv">25</span>)</span>
<span id="cb16-25"><a href="#cb16-25" aria-hidden="true" tabindex="-1"></a>plt.xlabel(<span class="st">'Second resample mean minus first, in separate universes'</span>)</span>
<span id="cb16-26"><a href="#cb16-26" aria-hidden="true" tabindex="-1"></a>plt.title(<span class="st">'Distribution difference in means of resamples'</span>)</span>
<span id="cb16-27"><a href="#cb16-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb16-28"><a href="#cb16-28" aria-hidden="true" tabindex="-1"></a><span class="co"># What proportion of the differences are implausible in the combined universe?</span></span>
<span id="cb16-29"><a href="#cb16-29" aria-hidden="true" tabindex="-1"></a>k <span class="op">=</span> np.<span class="bu">sum</span>(results <span class="op">&gt;=</span> q_95)</span>
<span id="cb16-30"><a href="#cb16-30" aria-hidden="true" tabindex="-1"></a>kk <span class="op">=</span> k <span class="op">/</span> n_trials</span>
<span id="cb16-31"><a href="#cb16-31" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb16-32"><a href="#cb16-32" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">'p for separate universe results &gt;= 95% quantile:'</span>, kk)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>p for separate universe results &gt;= 95% quantile: 0.9865</code></pre>
</div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
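<p><img src="how_big_sample_files/figure-html/unnamed-chunk-14-1.png" class="img-fluid quarto-figure quarto-figure-center figure-img" style="width:70.0%" alt="Histogram of the differences between the means of 12-pig resamples drawn from the separate ration A and ration B universes"></p>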
</figure>
</div>
</div>
</div>
<p>If <code>kk</code> is close to one, we know that the sample size is large enough that samples drawn from the universes we have hypothesized will not mislead us into thinking that they could come from the same universe.</p>
<p>Therefore, two samples of twelve pigs each are clearly large enough, and, in fact, even smaller samples might be sufficient if the universes are really like those we guessed at. If, on the other hand, the differences in the guessed universes had been smaller, then twelve-pig groups would have seemed too small and we would then have had to try out larger sample sizes, say forty-eight pigs in each group and perhaps 200 pigs in each group if forty-eight were not enough. And so on until the sample size is large enough to promise the accuracy we want. (In that case, the decks would also have to be much larger, of course.)</p>
<p>If we had guessed different universes for the two rations, then the sample sizes required would have been larger or smaller. If we had guessed the averages for the two samples to be closer together, then we would have needed larger samples. Also, if we had guessed the weight gains <em>within</em> each universe to be less spread out, the samples could have been smaller and vice versa.</p>
<div class="nb-end">

</div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
End of notebook: Sample size for pig rations with bootstrap
</div>
</div>
<div class="callout-body-container callout-body">
<p><code>sample_size_rations</code> starts at <a href="#nte-sample_size_rations" class="quarto-xref">Note&nbsp;<span>30.6</span></a>.</p>
</div>
</div>
<!---
End of notebook
-->
</section>
</section>
<section id="step-wise-sample-size-determination" class="level2" data-number="30.4">
<h2 data-number="30.4" class="anchored" data-anchor-id="step-wise-sample-size-determination"><span class="header-section-number">30.4</span> Step-wise sample-size determination</h2>
<p>Often it is wisest to determine the sample size as you go along, rather than fixing it firmly in advance. In sequential sampling, you <em>continue</em> sampling until the split is sufficiently uneven to make you believe you have a reliable answer.</p>
<p>Related techniques work in a series of jumps from sample size to sample size. Step-wise sampling makes it less likely that you will take a sample that is much larger than necessary. For example, in the internet contract survey case, if you took a sample of perhaps fifty you could see whether the split was as wide as 32-18, which you figure you need for 9 to 1 odds that your answer is right. If the split were not that wide, you would sample another fifty, another 100, or however large a sample you needed until you reached a split wide enough to satisfy you that your answer was reliable and that you really knew which way the entire universe would vote.</p>
<p>Step-wise sampling is not always practical, however, and the internet contract survey example is unusually favorable for its use. One major pitfall is that the <em>early</em> responses to a mail survey, for example, do <em>not</em> provide a random sample of the whole, and therefore it is a mistake simply to look at the early returns when the split is not wide enough to justify a verdict. If you have listened to early news reports of election returns, you know how misleading the reports from the first precincts can be if we regard them as a fair sample of the whole.<a href="#fn2" class="footnote-ref" id="fnref2" role="doc-noteref"><sup>2</sup></a></p>
<p>Stratified sampling is another device that helps reduce the sample size required, by balancing the amounts of information you obtain in the various strata. (Cluster sampling does not reduce the sample size. Rather, it aims to reduce the cost of obtaining a sample that will produce a given level of accuracy.)</p>
</section>
<section id="summary" class="level2" data-number="30.5">
<h2 data-number="30.5" class="anchored" data-anchor-id="summary"><span class="header-section-number">30.5</span> Summary</h2>
<p>Sample sizes are too often determined on the basis of convention or of the available budget. A more rational method of choosing the size of the sample is by balancing the diminution of error expected with a larger sample, and its value, against the cost of increasing the sample size. The relationship of various sample sizes to various degrees of accuracy can be estimated with resampling methods, which are illustrated here.</p>


<div id="refs" class="references csl-bib-body hanging-indent" data-entry-spacing="0" role="list" style="display: none">
<div id="ref-fussler1961patterns" class="csl-entry" role="listitem">
Fussler, Herman Howe, and Julian Lincoln Simon. 1961. <em>Patterns in the Use of Books in Large Research Libraries</em>. Chicago: University of Chicago Library.
</div>
<div id="ref-hansen1953sample" class="csl-entry" role="listitem">
Hansen, Morris H, William N Hurwitz, and William G Madow. 1953. <span>“Sample Survey Methods and Theory. Vol. I. Methods and Applications.”</span> <a href="https://archive.org/details/SampleSurveyMethodsAndTheoryVol1">https://archive.org/details/SampleSurveyMethodsAndTheoryVol1</a>.
</div>
<div id="ref-kinsey1948sexual" class="csl-entry" role="listitem">
Kinsey, Alfred C, Wardell B Pomeroy, and Clyde E Martin. 1948. <span>“Sexual Behavior in the Human Male.”</span> <em>W. B. Saunders Company</em>. <a href="https://books.google.co.uk/books?id=pfMKrY3VvigC">https://books.google.co.uk/books?id=pfMKrY3VvigC</a>.
</div>
<div id="ref-lorie1951basic" class="csl-entry" role="listitem">
Lorie, James Hirsch, and Harry V Roberts. 1951. <em>Basic Methods of Marketing Research</em>. McGraw-Hill.
</div>
<div id="ref-schlaifer1961introduction" class="csl-entry" role="listitem">
Schlaifer, Robert. 1961. <em>Introduction to Statistics for Business Decisions</em>. New York: McGraw-Hill. <a href="https://archive.org/details/introductiontost00schl">https://archive.org/details/introductiontost00schl</a>.
</div>
<div id="ref-sudman1976applied" class="csl-entry" role="listitem">
Sudman, Seymour. 1976. <em>Applied Sampling</em>. New <span>Y</span>ork: Academic Press. <a href="https://archive.org/details/appliedsampling0000unse">https://archive.org/details/appliedsampling0000unse</a>.
</div>
</div>
</section>
<section id="footnotes" class="footnotes footnotes-end-of-document" role="doc-endnotes">
<hr>
<ol>
<li id="fn1"><p>Schlaifer <span class="citation" data-cites="schlaifer1961introduction">(<a href="references.html#ref-schlaifer1961introduction" role="doc-biblioref">1961</a>)</span> attacks the sample-size problem in the wider context of decision making, costs, and benefits. The statistically knowledgeable reader can find an excellent discussion of sample size in Hansen <em>et al.</em> <span class="citation" data-cites="hansen1953sample">(<a href="references.html#ref-hansen1953sample" role="doc-biblioref">1953</a>)</span>.<a href="#fnref1" class="footnote-back" role="doc-backlink">↩︎</a></p></li>
<li id="fn2"><p>See J. Lorie and H. Roberts <span class="citation" data-cites="lorie1951basic">(<a href="references.html#ref-lorie1951basic" role="doc-biblioref">1951, 155–57</a>)</span> for more discussion of the limitations of sequential sampling. Hansen <em>et al.</em> <span class="citation" data-cites="hansen1953sample">(<a href="references.html#ref-hansen1953sample" role="doc-biblioref">1953, 78</a>)</span> warn against the danger of increasing the sample size in this fashion:</p>
<blockquote class="blockquote">
<p>A fairly obvious and flagrant way of arriving at biased results is to examine the returns from an initial sample to determine whether they appear acceptable to the investigator; if they do, he uses the results as they are; if they do not, he discards the sample results and draws a new sample, perhaps by a different method, in the hope that he will obtain a result more nearly like the one he expected. Such an approach can be utilized to obtain almost any results desired, or can “prove” any point even when unbiased or consistent methods of selecting the sample and making the individual estimates are used if the initial results are subject to relatively large sampling errors.</p>
</blockquote>
<a href="#fnref2" class="footnote-back" role="doc-backlink">↩︎</a></li>
</ol>
</section>

</main> <!-- /main -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
  // Mirror a stylesheet element's `data-mode` onto <body>: "dark" selects the
  // `quarto-dark` class, any other value selects `quarto-light`; the opposite
  // class is removed.
  const toggleBodyColorMode = (bsSheetEl) => {
    const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
    const bodyClasses = window.document.querySelector("body").classList;
    bodyClasses.add(isDark ? "quarto-dark" : "quarto-light");
    bodyClasses.remove(isDark ? "quarto-light" : "quarto-dark");
  }
  // Apply the body color class from the `link#quarto-bootstrap` stylesheet;
  // does nothing when that element is not in the document.
  const toggleBodyColorPrimary = () => {
    const primarySheet = window.document.querySelector("link#quarto-bootstrap");
    if (!primarySheet) {
      return;
    }
    toggleBodyColorMode(primarySheet);
  }
  // Set the body's light/dark class once, inside the DOMContentLoaded handler.
  toggleBodyColorPrimary();  
  // Icon text handed to AnchorJS through its options below.
  const icon = "";
  // Let AnchorJS decorate every `.anchored` element, placing the anchor on the right.
  const anchorJS = new window.AnchorJS();
  anchorJS.options = {
    placement: 'right',
    icon: icon
  };
  anchorJS.add('.anchored');
  // True when any of the element's classes starts with "code-annotation-".
  const isCodeAnnotation = (el) => {
    return Array.from(el.classList).some((className) => className.startsWith('code-annotation-'));
  }
  // ClipboardJS "success" handler: flash a "Copied!" state on the copy button
  // (with a Bootstrap tooltip when Bootstrap is loaded), restore the button
  // after one second, and clear the text selection made for the copy.
  const onCopySuccess = function(e) {
    // The button that triggered the copy.
    const copyBtn = e.trigger;
    // Drop focus so the button does not stay highlighted.
    copyBtn.blur();
    // Switch to the "checked" look and a temporary title.
    copyBtn.classList.add('code-copy-button-checked');
    const originalTitle = copyBtn.getAttribute("title");
    copyBtn.setAttribute("title", "Copied!");
    let tip;
    if (window.bootstrap) {
      const tooltipAttributes = [
        ["data-bs-toggle", "tooltip"],
        ["data-bs-placement", "left"],
        ["data-bs-title", "Copied!"],
      ];
      for (const [attrName, attrValue] of tooltipAttributes) {
        copyBtn.setAttribute(attrName, attrValue);
      }
      const tooltipOptions = {
        trigger: "manual",
        customClass: "code-copy-button-tooltip",
        offset: [0, -8]
      };
      tip = new bootstrap.Tooltip(copyBtn, tooltipOptions);
      tip.show();
    }
    // Undo the temporary state after one second.
    setTimeout(() => {
      if (tip) {
        tip.hide();
        for (const attrName of ["data-bs-title", "data-bs-toggle", "data-bs-placement"]) {
          copyBtn.removeAttribute(attrName);
        }
      }
      copyBtn.setAttribute("title", originalTitle);
      copyBtn.classList.remove('code-copy-button-checked');
    }, 1000);
    // Remove the selection ClipboardJS made while copying.
    e.clearSelection();
  }
  // Build the text ClipboardJS should copy for a copy button.
  //
  // trigger: the copy button; the element immediately before it holds the code.
  // Returns the `innerText` of a clone of that element after every child
  // recognised by isCodeAnnotation has been removed. The page itself is not
  // modified because all removals happen on the clone.
  const getTextToCopy = function(trigger) {
      const codeEl = trigger.previousElementSibling.cloneNode(true);
      // `children` is a live collection: removing an element while iterating
      // it directly shifts the remaining items and skips the one that follows
      // each removal. Iterate over a snapshot so adjacent annotations are all
      // removed.
      for (const childEl of Array.from(codeEl.children)) {
        if (isCodeAnnotation(childEl)) {
          childEl.remove();
        }
      }
      return codeEl.innerText;
  }
  // Attach ClipboardJS to every copy button that is not flagged as living in
  // a Quarto modal; the copied text is produced by getTextToCopy.
  const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
    text: getTextToCopy
  });
  clipboard.on('success', onCopySuccess);
  // Second instance, created only when the embedded-source-code modal exists.
  if (window.document.getElementById('quarto-embedded-source-code-modal')) {
    // For code content inside modals, clipBoardJS needs to be initialized with a container option
    // TODO: Check when it could be a function (https://github.com/zenorocha/clipboard.js/issues/860)
    const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
      text: getTextToCopy,
      container: window.document.getElementById('quarto-embedded-source-code-modal')
    });
    clipboardModal.on('success', onCopySuccess);
  }
    // Matches http/https URLs on localhost, with an optional port.
    var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
    // Matches mailto: links.
    var mailtoRegex = new RegExp(/^mailto:/);
      // Matches URLs containing "/<current host>/". The host is inserted
      // unescaped, so regex metacharacters in it (such as ".") act as patterns.
      var filterRegex = new RegExp('/' + window.location.host + '/');
    // A link counts as internal when its href matches the current-host,
    // localhost, or mailto: pattern.
    var isInternal = (href) => {
        const internalPatterns = [filterRegex, localhostRegex, mailtoRegex];
        return internalPatterns.some((pattern) => pattern.test(href));
    }
    // Walk every content link (navigation, sidebar, TOC and similar chrome
    // links are excluded by the selector) and, for external ones, put back
    // the href saved in `data-original-href`, in case quarto-nav.js rewrote it.
    var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
    for (var i=0; i<links.length; i++) {
      const anchor = links[i];
      if (isInternal(anchor.href)) {
        continue;
      }
      const savedHref = anchor.dataset.originalHref;
      if (savedHref !== undefined) {
        anchor.href = savedHref;
      }
    }
  // Attach a tippy popup to `el` using the fixed 'quarto' theme settings.
  //
  // contentFn, onTriggerFn, onUntriggerFn: optional callbacks; each one that
  // is supplied becomes the tippy `content`, `onTrigger` or `onUntrigger`
  // option respectively.
  function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
    const options = {
      allowHTML: true,
      maxWidth: 500,
      delay: 100,
      arrow: false,
      // Mount the popup inside the parent of the element it is attached to.
      appendTo: (target) => target.parentElement,
      interactive: true,
      interactiveBorder: 10,
      theme: 'quarto',
      placement: 'bottom-start',
    };
    const optionalCallbacks = [
      ['content', contentFn],
      ['onTrigger', onTriggerFn],
      ['onUntrigger', onUntriggerFn],
    ];
    for (const [optionName, callback] of optionalCallbacks) {
      if (callback) {
        options[optionName] = callback;
      }
    }
    window.tippy(el, options);
  }
  // Footnote references: each one gets a tooltip whose content is the HTML of
  // the footnote element it points at (or an empty string if none is found).
  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
  for (const ref of noterefs) {
    const footnoteHtml = function() {
      // use id or data attribute instead here
      let target = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
      // Absolute URLs are reduced to their hash; anything URL() rejects
      // (such as a bare "#fn1") is used as-is.
      try { target = new URL(target).hash; } catch {}
      const note = window.document.getElementById(target.replace(/^#\/?/, ""));
      return note ? note.innerHTML : "";
    };
    tippyHover(ref, footnoteHtml);
  }
  const xrefs = window.document.querySelectorAll('a.quarto-xref');
  /**
   * Produces the HTML shown in a cross-reference tooltip.
   *
   * @param id   id of the referenced element, or null when the whole page
   *             content is being previewed
   * @param note the element to preview; it is modified in place (classes
   *             stripped, anchor link removed), so callers pass a copy or a
   *             node from a separately parsed document
   * @returns an HTML string
   */
  const processXRef = (id, note) => {
    // Typeset math inside `el` when Quarto exposes a typesetting hook.
    const typeset = (el) => {
      if (window.Quarto?.typesetMath) {
        window.Quarto.typesetMath(el);
      }
    };

    // Strip column container classes from the note and all of its descendants
    const pending = [note];
    while (pending.length > 0) {
      const el = pending.pop();
      el.classList.remove("page-full", "page-columns");
      if (el.children) {
        pending.push(...el.children);
      }
    }

    const isSection = id === null || id.startsWith('sec-');
    if (!isSection) {
      // Remove any anchor links if they are present
      const anchorLink = note.querySelector('a.anchorjs-link');
      if (anchorLink) {
        anchorLink.remove();
      }
      typeset(note);
      // TODO in 1.5, we should make sure this works without a callout special case
      return note.classList.contains("callout") ? note.outerHTML : note.innerHTML;
    }

    // Short sections (two children or fewer) are shown in full.
    if (!(note.children && note.children.length > 2)) {
      typeset(note);
      return note.innerHTML;
    }

    // Special case sections, only their first couple elements: the first
    // child plus the first following child that is not an empty paragraph.
    const container = document.createElement("div");
    container.appendChild(note.children[0].cloneNode(true));
    const firstBodyChild = Array.from(note.children).slice(1).find(
      (child) => !(child.tagName === "P" && child.innerText === "")
    );
    if (firstBodyChild) {
      container.appendChild(firstBodyChild.cloneNode(true));
    }
    typeset(container);
    return container.innerHTML
  }
  // Cross-reference links: the tooltip content is resolved lazily each time
  // the tooltip is triggered, either from the current document or by fetching
  // the referenced page.
  for (var i=0; i<xrefs.length; i++) {
    const xref = xrefs[i];
    tippyHover(xref, undefined, function(instance) {
      // Disable the tooltip while its content is being resolved.
      instance.disable();
      // Re-enables the tooltip and displays it with whatever content is set.
      const reveal = () => {
        instance.enable();
        instance.show();
      };
      // Fetches `pageUrl`, parses the response as HTML and hands the parsed
      // document to `fill`; the tooltip is revealed whether or not that works.
      const previewFromPage = (pageUrl, fill) => {
        fetch(pageUrl)
        .then(res => res.text())
        .then(html => {
          const parser = new DOMParser();
          fill(parser.parseFromString(html, "text/html"));
        }).finally(reveal);
      };
      let url = xref.getAttribute('href');
      let hash = undefined; 
      if (url.startsWith('#')) {
        hash = url;
      } else {
        // Resolve against the document base so that relative hrefs such as
        // "page.html#sec-x" keep their hash; URL() without a base throws on
        // them, which used to discard the target id.
        try { hash = new URL(url, window.document.baseURI).hash; } catch {}
      }
      if (hash) {
        const id = hash.replace(/^#\/?/, "");
        const note = window.document.getElementById(id);
        if (note !== null) {
          // Target is in this document: preview a copy of it.
          try {
            const html = processXRef(id, note.cloneNode(true));
            instance.setContent(html);
          } finally {
            reveal();
          }
        } else {
          // See if we can fetch this
          previewFromPage(url.split('#')[0], (htmlDoc) => {
            const remoteNote = htmlDoc.getElementById(id);
            if (remoteNote !== null) {
              instance.setContent(processXRef(id, remoteNote));
            }
          });
        }
      } else {
        // See if we can fetch a full url (with no hash to target)
        // This is a special case and we should probably do some content thinning / targeting
        previewFromPage(url, (htmlDoc) => {
          const pageContent = htmlDoc.querySelector('main.content');
          if (pageContent !== null) {
            // This should only happen for chapter cross references
            // (since there is no id in the URL)
            // remove the first header
            if (pageContent.children.length > 0 && pageContent.children[0].tagName === "HEADER") {
              pageContent.children[0].remove();
            }
            instance.setContent(processXRef(null, pageContent));
          }
        });
      }
    }, function(instance) {
    });
  }
      // The annotation element whose code lines are currently highlighted;
      // undefined when nothing is selected.
      let selectedAnnoteEl;
      /**
       * Builds the CSS selector for the span that carries the given
       * data-code-cell / data-code-annotation attribute pair.
       */
      const selectorForAnnotation = ( cell, annotation) => {
        return `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
      }
      /**
       * Highlights the code lines that belong to an annotation.
       *
       * Reads data-target-cell / data-target-annotation from `annoteEl`, looks
       * up the matching annotation span and its data-code-lines list, then
       * positions (creating them if needed) two absolutely positioned divs:
       * one over the code lines and one inside the cell's annotation gutter.
       * On success `annoteEl` is remembered in `selectedAnnoteEl`.
       *
       * If the annotation span, its line list, or the referenced line
       * elements cannot be found, the function returns without changing
       * anything instead of throwing inside the event handler.
       *
       * @param annoteEl element carrying the data-target-* attributes
       */
      const selectCodeLines = (annoteEl) => {
        const doc = window.document;
        const targetCell = annoteEl.getAttribute("data-target-cell");
        const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
        const annoteSpan = doc.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
        const codeLines = annoteSpan ? annoteSpan.getAttribute("data-code-lines") : null;
        if (!codeLines) {
          // No annotation span, or no line list on it: nothing to highlight.
          return;
        }
        // Line element ids have the form "<cell id>-<line>".
        const lineIds = codeLines.split(",").map((line) => targetCell + "-" + line);

        // The highlight spans from the top of the first listed line to the
        // bottom of the last listed line.
        const firstEl = doc.getElementById(lineIds[0]);
        const lastEl = doc.getElementById(lineIds[lineIds.length - 1]);
        const parent = firstEl?.parentElement?.parentElement;
        if (!firstEl || !lastEl || !parent) {
          return;
        }
        const top = firstEl.offsetTop;
        const height = lineIds.length > 1
          ? lastEl.offsetTop + lastEl.offsetHeight - top
          : firstEl.offsetHeight;

        // cook up a div (if necessary) and position it 
        let div = doc.getElementById("code-annotation-line-highlight");
        if (div === null) {
          div = doc.createElement("div");
          div.setAttribute("id", "code-annotation-line-highlight");
          div.style.position = 'absolute';
          parent.appendChild(div);
        }
        div.style.top = top - 2 + "px";
        div.style.height = height + 4 + "px";
        div.style.left = 0;

        // Same for the marker in the annotation gutter; it is skipped when
        // the code cell or its gutter is not in the document.
        let gutterDiv = doc.getElementById("code-annotation-line-highlight-gutter");
        if (gutterDiv === null) {
          const codeCell = doc.getElementById(targetCell);
          const gutter = codeCell ? codeCell.querySelector('.code-annotation-gutter') : null;
          if (gutter) {
            gutterDiv = doc.createElement("div");
            gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
            gutterDiv.style.position = 'absolute';
            gutter.appendChild(gutterDiv);
          }
        }
        if (gutterDiv) {
          gutterDiv.style.top = top - 2 + "px";
          gutterDiv.style.height = height + 4 + "px";
        }
        selectedAnnoteEl = annoteEl;
      };
      /**
       * Removes the line-highlight and gutter-highlight elements (when they
       * exist) and clears the remembered selection.
       */
      const unselectCodeLines = () => {
        for (const highlightId of ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"]) {
          window.document.getElementById(highlightId)?.remove();
        }
        selectedAnnoteEl = undefined;
      };
        // Re-apply the code-annotation highlight when the window is resized
    // On resize, recompute the highlight for the currently selected
    // annotation (if any); the work is rate-limited through throttle().
    const refreshAnnotationHighlight = () => {
      // NOTE(review): elRect is not declared in the visible code; this
      // assignment is kept as-is — confirm whether it is still needed.
      elRect = undefined;
      if (selectedAnnoteEl) {
        selectCodeLines(selectedAnnoteEl);
      }
    };
    window.addEventListener("resize", throttle(refreshAnnotationHighlight, 10));
    /**
     * Wraps `fn` so that bursts of calls are rate-limited.
     *
     * The first call runs `fn` immediately. Every later call (re)starts a
     * timer of `ms` milliseconds; when the timer fires, `fn` runs with the
     * arguments of the most recent call and the wrapper goes back to its
     * initial state, so the next call runs immediately again.
     *
     * @param fn function to invoke
     * @param ms quiet period in milliseconds before the deferred invocation
     * @returns the wrapping function (it returns nothing itself)
     */
    function throttle(fn, ms) {
      let engaged = false;
      let pendingTimer;
      return (...args) => {
        if (!engaged) {
          // first call gets through
          fn.apply(this, args);
          engaged = true;
          return;
        }
        // all the others get throttled: drop the previously scheduled run
        if (pendingTimer) {
          clearTimeout(pendingTimer);
        }
        pendingTimer = setTimeout(() => {
          fn.apply(this, args);
          engaged = false;
          pendingTimer = false;
        }, ms);
      };
    }
      // Attach click handler to the DT: clicking an annotation term toggles
      // the highlight of its code lines, and selecting a different term moves
      // the highlight and the 'code-annotation-active' class to it.
      const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
      for (const annoteDlNode of annoteDls) {
        annoteDlNode.addEventListener('click', (event) => {
          // Use the <dt> the listener is bound to rather than event.target:
          // the target can be an element nested inside the <dt>, which does
          // not carry the data-target-* attributes.
          const clickedEl = event.currentTarget;
          if (clickedEl !== selectedAnnoteEl) {
            unselectCodeLines();
            const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
            if (activeEl) {
              activeEl.classList.remove('code-annotation-active');
            }
            selectCodeLines(clickedEl);
            clickedEl.classList.add('code-annotation-active');
          } else {
            // Unselect the line
            unselectCodeLines();
            clickedEl.classList.remove('code-annotation-active');
          }
        });
      }
  /**
   * Walks up from `el` looking for the nearest ancestor that has a non-empty
   * data-cites attribute.
   *
   * @param el element to start from
   * @returns `{ el, cites }` where `el` is the child of that ancestor on the
   *          path from the starting element and `cites` is the attribute
   *          value split on spaces; undefined when no ancestor has one
   */
  const findCites = (el) => {
    let node = el;
    let parentEl = node.parentElement;
    while (parentEl) {
      const cites = parentEl.dataset.cites;
      if (cites) {
        return { el: node, cites: cites.split(' ') };
      }
      node = parentEl;
      parentEl = node.parentElement;
    }
    return undefined;
  };
  // Bibliography references: the element returned by findCites gets a tooltip
  // that lists one entry per cited key, copied from the matching
  // "ref-<key>" element when it exists.
  var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
  for (const ref of bibliorefs) {
    const citeInfo = findCites(ref);
    if (!citeInfo) {
      continue;
    }
    tippyHover(citeInfo.el, function() {
      const popup = window.document.createElement('div');
      for (const cite of citeInfo.cites) {
        const entry = window.document.createElement('div');
        entry.classList.add('hanging-indent', 'csl-entry');
        const biblioEntry = window.document.getElementById('ref-' + cite);
        if (biblioEntry) {
          entry.innerHTML = biblioEntry.innerHTML;
        }
        // An entry is appended even when no bibliography element was found.
        popup.appendChild(entry);
      }
      return popup.innerHTML;
    });
  }
});
</script>
<!-- Previous / next chapter links. The aria-label gives this nav landmark an
     accessible name so it can be told apart from any other nav on the page. -->
<nav class="page-navigation" aria-label="Page navigation">
  <div class="nav-page nav-page-previous">
      <a href="./correlation_causation.html" class="pagination-link" aria-label="Correlation and Causation">
        <i class="bi bi-arrow-left-short"></i> <span class="nav-page-text"><span class="chapter-number">29</span>&nbsp; <span class="chapter-title">Correlation and Causation</span></span>
      </a>
  </div>
  <div class="nav-page nav-page-next">
      <a href="./bayes_simulation.html" class="pagination-link" aria-label="Bayesian Analysis by Simulation">
        <span class="nav-page-text"><span class="chapter-number">31</span>&nbsp; <span class="chapter-title">Bayesian Analysis by Simulation</span></span> <i class="bi bi-arrow-right-short"></i>
      </a>
  </div>
</nav>
</div> <!-- /content -->


</body></html>