From 60fdfdae2c66be734bcfd7a74618c67117e31bfd Mon Sep 17 00:00:00 2001 From: Yanay Rosen Date: Mon, 26 Nov 2018 14:35:17 -0800 Subject: [PATCH 1/8] Updated labs to fa18 standard- first pass. --- .DS_Store | Bin 0 -> 6148 bytes materials/.DS_Store | Bin 0 -> 6148 bytes materials/x18/.DS_Store | Bin 0 -> 6148 bytes materials/x18/lab/.DS_Store | Bin 0 -> 6148 bytes .../.ipynb_checkpoints/lab00-checkpoint.ipynb | 264 +++ materials/x18/lab/1/lab00/lab00.ipynb | 31 +- .../.ipynb_checkpoints/lab01-checkpoint.ipynb | 1084 ++++++++++++ materials/x18/lab/1/lab01/lab01.ipynb | 105 +- .../.ipynb_checkpoints/lab02-checkpoint.ipynb | 1548 +++++++++++++++++ materials/x18/lab/1/lab02/lab02.ipynb | 159 +- materials/x18/lab/1/lab02/tests/q11.py | 2 +- materials/x18/lab/1/lab02/tests/q441.py | 22 - materials/x18/lab/1/lab02/tests/q442.py | 22 - materials/x18/lab/1/lab02/tests/q443.py | 22 - materials/x18/lab/1/lab02/tests/q444.py | 22 - .../.ipynb_checkpoints/lab03-checkpoint.ipynb | 1071 ++++++++++++ materials/x18/lab/1/lab03/lab03.ipynb | 207 ++- .../.ipynb_checkpoints/lab04-checkpoint.ipynb | 1128 ++++++++++++ materials/x18/lab/1/lab04/lab04.ipynb | 267 ++- materials/x18/lab/2/.DS_Store | Bin 0 -> 6148 bytes materials/x18/lab/2/lab01/.DS_Store | Bin 0 -> 6148 bytes .../.ipynb_checkpoints/lab01-checkpoint.ipynb | 729 ++++++++ materials/x18/lab/2/lab01/comparisons.png | Bin 0 -> 55882 bytes materials/x18/lab/2/lab01/lab01.ipynb | 276 ++- materials/x18/lab/2/lab01/tests/q1_6.py | 22 - materials/x18/lab/2/lab01/tests/q1_7.py | 44 - materials/x18/lab/2/lab01/tests/q2_2.py | 2 +- materials/x18/lab/2/lab01/tests/q2_3.py | 2 +- materials/x18/lab/2/lab01/tests/q2_4.py | 22 - materials/x18/lab/2/lab01/tests/q2_5.py | 22 - .../.ipynb_checkpoints/lab02-checkpoint.ipynb | 664 +++++++ materials/x18/lab/2/lab02/lab02.ipynb | 114 +- .../.ipynb_checkpoints/lab05-checkpoint.ipynb | 667 +++++++ materials/x18/lab/2/lab05/lab05.ipynb | 75 +- .../.ipynb_checkpoints/lab06-checkpoint.ipynb | 881 
++++++++++ materials/x18/lab/2/lab06/lab06.ipynb | 7 +- .../.ipynb_checkpoints/lab01-checkpoint.ipynb | 772 ++++++++ materials/x18/lab/3/lab01/lab01.ipynb | 111 +- .../.ipynb_checkpoints/lab02-checkpoint.ipynb | 716 ++++++++ materials/x18/lab/3/lab02/lab02.ipynb | 100 +- .../.ipynb_checkpoints/lab03-checkpoint.ipynb | 601 +++++++ materials/x18/lab/3/lab03/lab03.ipynb | 82 +- 42 files changed, 11097 insertions(+), 766 deletions(-) create mode 100644 .DS_Store create mode 100644 materials/.DS_Store create mode 100644 materials/x18/.DS_Store create mode 100644 materials/x18/lab/.DS_Store create mode 100644 materials/x18/lab/1/lab00/.ipynb_checkpoints/lab00-checkpoint.ipynb create mode 100644 materials/x18/lab/1/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb create mode 100644 materials/x18/lab/1/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb delete mode 100644 materials/x18/lab/1/lab02/tests/q441.py delete mode 100644 materials/x18/lab/1/lab02/tests/q442.py delete mode 100644 materials/x18/lab/1/lab02/tests/q443.py delete mode 100644 materials/x18/lab/1/lab02/tests/q444.py create mode 100644 materials/x18/lab/1/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb create mode 100644 materials/x18/lab/1/lab04/.ipynb_checkpoints/lab04-checkpoint.ipynb create mode 100644 materials/x18/lab/2/.DS_Store create mode 100644 materials/x18/lab/2/lab01/.DS_Store create mode 100644 materials/x18/lab/2/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb create mode 100644 materials/x18/lab/2/lab01/comparisons.png delete mode 100644 materials/x18/lab/2/lab01/tests/q1_6.py delete mode 100644 materials/x18/lab/2/lab01/tests/q1_7.py delete mode 100644 materials/x18/lab/2/lab01/tests/q2_4.py delete mode 100644 materials/x18/lab/2/lab01/tests/q2_5.py create mode 100644 materials/x18/lab/2/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb create mode 100644 materials/x18/lab/2/lab05/.ipynb_checkpoints/lab05-checkpoint.ipynb create mode 100644 
materials/x18/lab/2/lab06/.ipynb_checkpoints/lab06-checkpoint.ipynb create mode 100644 materials/x18/lab/3/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb create mode 100644 materials/x18/lab/3/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb create mode 100644 materials/x18/lab/3/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f9ad8639c2efffdf647cfbffce82547c18915f3f GIT binary patch literal 6148 zcmeHKO>fgM7=GOrmKFgOLK=cgE=U{-YUnhO(4Z3>9B+n^qC8qgtD?b-U>i}@gkd4P7YIXpnR z^<5e~q751$)~67wJ)u!4`co;oDtClIF^X-LqUv;n9y9qA5xkhv665`PMlegq`nf*` z<2XvQcKa)W%lvGbcKyjqQF`KM2Z8pVaUP%fe(*dR&ic)( zk9d;#Q8JuJauf~`^6EvDgnZWJ(sRm2xo;n$KE7~c``z)VBT03kZuB0Z!$ZxVcv$_j1WTByG>V!vJgUnBIb%Pd?PrIIwm>tG$J!qfc(?O74hh!gW9e>$DcoemI=jDZ6ouomOwAv;)YPKNc^f*eI zx>wiTq}eg+_;y06qAH%3m-~HptF~Ej+_jBC#p%0lty*!`Ya4?>Q7x{xJNp+m50Cw) z!SjpV3kjd8k$J))yn;am-udk$*2xukhS9?)MrME+UDT$ zMtAF2m3Y@WpckMh7?)Z6N&&-M#Sn{E@e-&J@H=P#dW4xp@PP1-fT4j0X5dd5_yiOU BSWo}} literal 0 HcmV?d00001 diff --git a/materials/x18/.DS_Store b/materials/x18/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..732df4c51a33dd343999c518772416a1966ef230 GIT binary patch literal 6148 zcmeHK!AiqG5Z!I7ZYyFBg5oI`!9x#?l`4o3svf)v5k08b#1PzF_`?pulc+tsEHCZpBn_gZ-80Eis|_LNr%}?< z!-gItt-e{uw-ZVgRq?XCIvTm#wXKTdZq&yWXXLuIYQ@>C)yLzaT3&N^51Ti4_oIjL zL0RRi&HUhT!`3H2+0q7BC7Qq7|Z7QHmmDFNL>K&v_hk1|UGmAD| zNb_b`hu$oyLy^>aNXLW==~?8F8DIuJGl0DxNEz?{gP;3<6T|~Ezzj?$1G2F1?>C?} zx?8_giFd67+5|O5Z!|}VMGrqI%9$kjMI`#3~P+cd+ffz zK8p{qkF!>FSAy~9szIut`m3(4>Y4s}=$;Zn$VRv62~iP3C{V;g37U5V$5G!%fjrZI z66Y`xSUA--jWA3)18(oU{lr{G$=Lp_{>Y&Jbwu`|G0DgdqocUhJFooMHE9;cX{&9M z<7Nv&PLJcXX?hLQO`9D%Pv8cWDyh03@`%| z$N(LAYHkAGjbDiwUsV9?$D(RNyUexCrheOqY2bz#_*4d-0nxH(vj6}9 literal 0 HcmV?d00001 diff --git a/materials/x18/lab/1/lab00/.ipynb_checkpoints/lab00-checkpoint.ipynb b/materials/x18/lab/1/lab00/.ipynb_checkpoints/lab00-checkpoint.ipynb new file mode 100644 index 
0000000..c75742c --- /dev/null +++ b/materials/x18/lab/1/lab00/.ipynb_checkpoints/lab00-checkpoint.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 0: Introduction and Practice with Jupyter Notebooks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Lab 0, you will learn how to navigate a Jupyter Notebook (like this one). All of the required lab assignments in this course are published as jupyter notebooks. You follow the instructions in the notebook to complete the assignment.\n", + "\n", + "This one isn't graded, but you should complete it anyway for practice. Let's get started!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## 1. Jupyter notebooks\n", + "This webpage is called a Jupyter notebook. A notebook is a place to write programs and view their results.\n", + "\n", + "### 1.1. Text cells\n", + "In a notebook, each rectangle containing text or code is called a *cell*.\n", + "\n", + "Text cells (like this one) can be edited by double-clicking on them. They're written in a simple format called [Markdown](http://daringfireball.net/projects/markdown/syntax) to add formatting and section headings. You don't need to learn Markdown, but you might want to.\n", + "\n", + "After you edit a text cell, select the \"run cell\" button at the top that looks like ▶| to confirm any changes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.1.1.**
\n", + "This paragraph is in its own text cell. Try editing it so that **this** sentence is the last sentence in the paragraph, and then select the \"run cell\" ▶| button on the top. This sentence, for example, should be deleted. So should this one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"Hello, World!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And this one:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"\\N{WAVING HAND SIGN}, \\N{EARTH GLOBE ASIA-AUSTRALIA}!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The fundamental building block of Python code is an expression. Cells can contain multiple lines with multiple expressions. When you run a cell, the lines of code are executed in the order in which they appear. Every `print` expression prints a line. Run the next cell and notice the order of the output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"First this line is printed,\")\n", + "print(\"and then this one.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.2.1.**
\n", + "Change the cell above so that it prints out:\n", + "\n", + " First this line,\n", + " then the whole 🌏,\n", + " and then this one.\n", + "\n", + "*Hint:* If you're stuck on how to print the Earth symbol, try looking at the print expressions above." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3. Writing Jupyter notebooks\n", + "You can use Jupyter notebooks for your own projects or documents. They are among the world's most popular programming environments for data science. When you make your own notebook, you'll need to create your own cells for text and code.\n", + "\n", + "To add a cell, select the + button in the menu bar. A new cell starts out as text. You can change it to a code cell by selecting it so that it's highlighted, then selecting the drop-down box next to the restart (⟳) button in the menu bar, and choosing Code instead of Markdown.\n", + "\n", + "**Question 1.3.1.**
\n", + "Add a code cell below this one. Write code in it that prints out:\n", + " \n", + " A whole new cell! \n", + " ♪🌏♪\n", + "\n", + "(That musical note symbol is like the Earth symbol. Its long-form name is `\\N{EIGHTH NOTE}`.)\n", + "\n", + "Run your cell to verify that it works." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.4. Errors\n", + "Python is a language, and like natural human languages, it has rules. It differs from natural language in two important ways:\n", + "1. The rules are *simple*. You can learn most of them in a few weeks and gain reasonable proficiency with the language in a semester.\n", + "2. The rules are *rigid*. If you're proficient in a natural language, you can understand a non-proficient speaker, glossing over small mistakes. A computer running Python code is not smart enough to do that.\n", + "\n", + "Whenever you write code, you'll make mistakes. When you run a code cell that has errors, Python will sometimes produce error messages to tell you what you did wrong.\n", + "\n", + "Errors are okay; even experienced programmers make many errors. When you make an error, you just have to find the source of the problem, fix it, and move on.\n", + "\n", + "We have made an error in the next cell. Run it and see what happens." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"This line is missing something.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should see something like this (minus our annotations):\n", + "\n", + "\"\"/\n", + "\n", + "The last line of the error output attempts to tell you what went wrong. The *syntax* of a language is its structure, and this `SyntaxError` tells you that you have created an illegal structure. 
\"`EOF`\" means \"end of file,\" so the message is saying Python expected you to write something more (in this case, a right parenthesis) before finishing the cell.\n", + "\n", + "There's a lot of terminology in programming languages. You'll learn as you go. If you are ever having trouble understanding an error message, search the discussion forum. If you don't find an answer, post a question about the error yourself.\n", + "\n", + "Try to fix the code above so that you can run the cell and see the intended message instead of an error." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.5. The Kernel\n", + "The kernel is a program that executes the code inside your notebook and outputs the results. In the top right of your window, you can see a circle that indicates the status of your kernel. If the circle is empty (⚪), the kernel is idle and ready to execute code. If the circle is filled in (⚫), the kernel is busy running some code. \n", + "\n", + "You may run into problems where your kernel is stuck for an excessive amount of time, your notebook is very slow and unresponsive, or your kernel loses its connection. If this happens, try the following steps:\n", + "1. At the top of your screen, select **Kernel**, then **Interrupt**.\n", + "2. If that doesn't help, select **Kernel**, then **Restart**. If you do this, you will have to run your code cells from the start of your notebook up until where you paused your work.\n", + "3. If that doesn't help, restart your server. First, save your work by selecting **File** at the top left of your screen, then **Save and Checkpoint**. Next, select **Control Panel** at the top right. Choose **Stop My Server** to shut it down, then **My Server** to start it back up. Then, navigate back to the notebook you were working on." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.6. 
Completing a lab\n", + "All assignments in the course will be distributed as notebooks like this one. At the top of each assignment, you'll see a cell like the one below that imports autograder tests. Run it to import the autograder tests.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Don't change this cell, just run it\n", + "# Import autograder tests\n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you finish a question, you need to check your answer by running the check command below. It's OK to grade multiple times; Gofer will only try to grade your final submission for each question. There are no hidden autograder tests. If you pass all the given autograder tests for a question, you will receive full credit for that question." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check(\"tests/q0.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The notebook resides on a server that is run by the course staff, and so we have access to it as well. Once you're finished with a lab, use the File menu within the notebook page (below the Jupyter logo) to \"Save and Checkpoint\" and you're done. You may also check your notebook in its entirety with the following command." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab00.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/1/lab00/lab00.ipynb b/materials/x18/lab/1/lab00/lab00.ipynb index 7c18b56..c75742c 100644 --- a/materials/x18/lab/1/lab00/lab00.ipynb +++ b/materials/x18/lab/1/lab00/lab00.ipynb @@ -44,7 +44,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "print(\"Hello, World!\")" @@ -60,7 +62,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "print(\"\\N{WAVING HAND SIGN}, \\N{EARTH GLOBE ASIA-AUSTRALIA}!\")" @@ -76,7 +80,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "print(\"First this line is printed,\")\n", @@ -136,7 +142,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "print(\"This line is missing something.\"" @@ -182,7 +190,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Don't change this cell, just run it\n", @@ -200,7 +210,9 @@ { "cell_type": 
"code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check(\"tests/q0.py\")" @@ -216,7 +228,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import glob\n", @@ -242,10 +256,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/1/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb b/materials/x18/lab/1/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb new file mode 100644 index 0000000..75b2c77 --- /dev/null +++ b/materials/x18/lab/1/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb @@ -0,0 +1,1084 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 1: Introduction to Python\n", + "\n", + "Welcome to Lab 1! Each week you will complete a lab assignment like this one. In this lab, you'll get started with the Python programming language through numbers, names, and expressions.\n", + "\n", + "As you go, please regularly select **Save and Checkpoint** from the `File` menu below the Jupyter logo to save your work." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Numbers\n", + "\n", + "Quantitative information arises everywhere in data science. In addition to representing commands to print out lines, expressions can represent numbers and methods of combining numbers. The expression `3.2500` evaluates to the number 3.25. (Run the cell and see.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "3.2500" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that we didn't have to `print`. 
When you run a notebook cell, if the last line has a value, then Jupyter helpfully prints out that value for you. However, it won't print out prior lines automatically. If you want to print out a prior line, you need to add the `print` statement. Run the cell below to check." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "print(2)\n", + "3\n", + "4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Above, you should see that 4 is the value of the last expression, 2 is printed, but 3 is lost forever because it was neither printed nor last.\n", + "\n", + "You don't want to print everything all the time anyway. But if you feel sorry for 3, change the cell above to print it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1. Arithmetic\n", + "The line in the next cell subtracts. Its value is what you'd expect. Run it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "3.25 - 1.5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Many basic arithmetic operations are built in to Python. The textbook section on [Expressions](http://www.inferentialthinking.com/chapters/03/1/expressions.html) describes all the arithmetic operators used in the course. The common operator that differs from typical math notation is `**`, which raises one number to the power of the other. So, `2**3` stands for $2^3$ and evaluates to 8. \n", + "\n", + "The order of operations is what you learned in elementary school, and Python also has parentheses. For example, compare the outputs of the cells below. Use parentheses for a happy new year!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "-724.0" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "3+6*5-6*3**2*2**3/4*7" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2018.0" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "3+(6*5-(6*3))**2*((2**3)/4*7)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In standard math notation, the first expression is\n", + "\n", + "$$3 + 6 \\times 5 - 6 \\times 3^2 \\times \\frac{2^3}{4} \\times 7,$$\n", + "\n", + "while the second expression is\n", + "\n", + "$$3 + (6 \\times 5 - (6 \\times 3))^2 \\times (\\frac{(2^3)}{4} \\times 7).$$\n", + "\n", + "**Question 1.1.1.**
Write a Python expression in this next cell that's equal to $5 \\times (3 \\frac{10}{11}) - 49 \\frac{1}{3} + 2^{.5 \\times 22} - \\frac{7}{33}$. That's five times three and ten elevenths, minus 49 and a third, plus two to the power of half of 22, minus 7 33rds. By \"$3 \\frac{10}{11}$\" we mean $3+\\frac{10}{11}$, not $3 \\times \\frac{10}{11}$.\n", + "\n", + "Replace the ellipses (`...`) with your expression. Try to use parentheses only when necessary.\n", + "\n", + "*Hint:* The correct output should be a familiar number." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Names\n", + "In natural language, we have terminology that lets us quickly reference very complicated concepts. We don't say, \"That's a large mammal with brown fur and sharp teeth!\" Instead, we just say, \"Bear!\"\n", + "\n", + "Similarly, an effective strategy for writing code is to define names for data as we compute it, like a lawyer would define terms for complex ideas at the start of a legal document to simplify the rest of the writing.\n", + "\n", + "In Python, we do this with *assignment statements*. An assignment statement has a name on the left side of an `=` sign and an expression to be evaluated on the right." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "ten = 3 * 2 + 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you run that cell, Python first evaluates the first line. It computes the value of the expression `3 * 2 + 4`, which is the number 10. Then it gives that value the name `ten`. 
At that point, the code in the cell is done running.\n", + "\n", + "After you run that cell, the value 10 is bound to the name `ten`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "ten" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The statement `ten = 3 * 2 + 4` is not asserting that `ten` is already equal to `3 * 2 + 4`, as we might expect by analogy with math notation. Rather, that line of code changes what `ten` means; it now refers to the value 10, whereas before it meant nothing at all.\n", + "\n", + "If the designers of Python had been ruthlessly pedantic, they might have made us write\n", + "\n", + " define the name ten to hereafter have the value of 3 * 2 + 4 \n", + "\n", + "instead. You will probably appreciate the brevity of \"`=`\"! But keep in mind that this is the real meaning.\n", + "\n", + "**Question 2.1.**
Try writing code that uses a name (like `eleven`) that hasn't been assigned to anything. You'll see an error!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A common pattern in Jupyter notebooks is to assign a value to a name and then immediately evaluate the name in the last line in the cell so that the value is displayed as output. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "close_to_pi = 355/113\n", + "close_to_pi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another common pattern is that a series of lines in a single cell will build up a complex computation in stages, naming the intermediate results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "bimonthly_salary = 840\n", + "monthly_salary = 2 * bimonthly_salary\n", + "number_of_months_in_a_year = 12\n", + "yearly_salary = number_of_months_in_a_year * monthly_salary\n", + "yearly_salary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Names in Python can have letters (upper- and lower-case letters are both okay and count as different letters), underscores, and numbers. The first character can't be a number (otherwise a name might look like a number). And names can't contain spaces, since spaces are used to separate pieces of code from each other.\n", + "\n", + "Other than those rules, what you name something doesn't matter *to Python*. 
For example, this cell does the same thing as the above cell, except everything has a different name:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "a = 840\n", + "b = 2 * a\n", + "c = 12\n", + "d = c * b\n", + "d" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**However**, names are very important for making your code *readable* to yourself and others. The cell above is shorter, but it's totally useless without an explanation of what it does.\n", + "\n", + "According to a famous joke among computer scientists, naming things is one of the two hardest problems in computer science. (The other two are cache invalidation and \"off-by-one\" errors. And people say computer scientists have an odd sense of humor...)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.2.**
Assign the name `seconds_in_a_decade` to the number of seconds between midnight January 1, 2010 and midnight January 1, 2020. Use Python to perform any required arithmetic.\n", + "\n", + "*Hint:* If you're stuck, the next section shows you how to get hints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# Change the next line so that it computes the number of\n", + "# seconds in a decade and assigns that number the name\n", + "# seconds_in_a_decade.\n", + "seconds_in_a_decade = ...\n", + "\n", + "# We've put this line in this cell so that it will print\n", + "# the value you've given to seconds_in_a_decade when you\n", + "# run it. You don't need to change this.\n", + "seconds_in_a_decade" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1. Checking your code\n", + "Now that you know how to name things, you can start using the built-in *tests* to check whether your work is correct. Try not to change the contents of the test cells. \n", + "\n", + "The cell below appears only once in the notebook and loads all of the tests so that they can be run later. You can load all of the tests before you answer all questions in the notebook. You will run tests as you go to check your work along the way, and you can also run all of the tests at the end to make sure that you will receive full credit on the lab." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# These lines load the tests.\n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running the following cell will test whether you have assigned `seconds_in_a_decade` correctly in Question 2.2. \n", + "\n", + "Sometimes the tests will give hints about what went wrong. 
If the test doesn't pass, read the output, adjust your answer to the question, run the answer cell again to update the name `seconds_in_a_decade`, then run this test cell again.\n", + "\n", + "Sometimes the tests will tell you the answer. Rather than copying the answer, try to understand how it was reached. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# Test cell; please do not change!\n", + "check('tests/q22.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2. Comments\n", + "You may have noticed this line in the cell above:\n", + "\n", + " # Test cell; please do not change!\n", + "\n", + "That is called a *comment*. It doesn't make anything happen in Python; Python ignores anything on a line after a #. Instead, it's there to communicate something about the code to you, the human reader. Comments are extremely useful.\n", + "\n", + "\"comic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3. Application: A physics experiment\n", + "\n", + "On the Apollo 15 mission to the Moon, astronaut David Scott famously replicated Galileo's physics experiment in which he showed that gravity accelerates objects of different mass at the same rate. Because there is no air resistance for a falling object on the surface of the Moon, even two objects with very different masses and densities should fall at the same rate. David Scott compared a feather and a hammer.\n", + "\n", + "You can run the following cell to watch a video of the experiment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "from IPython.display import YouTubeVideo\n", + "# The original URL is:\n", + "# https://www.youtube.com/watch?v=U7db6ZeLR5s\n", + "YouTubeVideo(\"U7db6ZeLR5s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's the transcript of the video:\n", + "\n", + "**167:22:06 Scott**: Well, in my left hand, I have a feather; in my right hand, a hammer. And I guess one of the reasons we got here today was because of a gentleman named Galileo, a long time ago, who made a rather significant discovery about falling objects in gravity fields. And we thought where would be a better place to confirm his findings than on the Moon. And so we thought we'd try it here for you. The feather happens to be, appropriately, a falcon feather for our Falcon. And I'll drop the two of them here and, hopefully, they'll hit the ground at the same time. \n", + "\n", + "**167:22:43 Scott**: How about that!\n", + "\n", + "**167:22:45 Allen**: How about that! (Applause in Houston)\n", + "\n", + "**167:22:46 Scott**: Which proves that Mr. Galileo was correct in his findings." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Newton's Law.** Using this footage, we can also attempt to confirm another famous bit of physics: Newton's law of universal gravitation. Newton's laws predict that any object dropped near the surface of the Moon should fall\n", + "\n", + "$$\\frac{1}{2} G \\frac{M}{R^2} t^2 \\text{ meters}$$\n", + "\n", + "after $t$ seconds, where $G$ is a universal constant, $M$ is the moon's mass in kilograms, and $R$ is the moon's radius in meters. 
So if we know $G$, $M$, and $R$, then Newton's laws let us predict how far an object will fall over any amount of time.\n", + "\n", + "To verify the accuracy of this law, we will calculate the difference between the predicted distance the hammer drops and the actual distance. (If they are different, it might be because Newton's laws are wrong, or because our measurements are imprecise, or because there are other factors affecting the hammer for which we haven't accounted.)\n", + "\n", + "Someone studied the video and estimated that the hammer was dropped 113 cm from the surface. Counting frames in the video, the hammer falls for 1.2 seconds (36 frames)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.3.1.**
Complete the code in the next cell to fill in the *data* from the experiment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# t, the duration of the fall in the experiment, in seconds.\n", + "# Fill this in.\n", + "time = ...\n", + "\n", + "# The estimated distance the hammer actually fell, in meters.\n", + "# Fill this in.\n", + "estimated_distance_m = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "check('tests/q231.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.3.2.**
Now, complete the code in the next cell to compute the difference between the predicted and estimated distances (in meters) that the hammer fell in this experiment.\n", + "\n", + "This just means translating the formula above ($\\frac{1}{2}G\\frac{M}{R^2}t^2$) into Python code. You'll have to replace each variable in the math formula with the name we gave that number in Python code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# First, we've written down the values of the 3 universal\n", + "# constants that show up in Newton's formula.\n", + "\n", + "# G, the universal constant measuring the strength of gravity.\n", + "gravity_constant = 6.674 * 10**-11\n", + "\n", + "# M, the moon's mass, in kilograms.\n", + "moon_mass_kg = 7.34767309 * 10**22\n", + "\n", + "# R, the radius of the moon, in meters.\n", + "moon_radius_m = 1.737 * 10**6\n", + "\n", + "# The distance the hammer should have fallen over the\n", + "# duration of the fall, in meters, according to Newton's\n", + "# law of gravity. The text above describes the formula\n", + "# for this distance given by Newton's law.\n", + "# **YOU FILL THIS PART IN.**\n", + "predicted_distance_m = ...\n", + "\n", + "# Here we've computed the difference between the predicted\n", + "# fall distance and the distance we actually measured.\n", + "# If you've filled in the above code, this should just work.\n", + "difference = predicted_distance_m - estimated_distance_m\n", + "difference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "check('tests/q232.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Calling functions\n", + "\n", + "The most common way to combine or manipulate values in Python is by calling functions. 
Python comes with many built-in functions that perform common operations.\n", + "\n", + "For example, the `abs` function takes a single number as its argument and returns the absolute value of that number. The absolute value of a number is its distance from 0 on the number line, so `abs(5)` is 5 and `abs(-5)` is also 5." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "abs(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "abs(-5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1. Application: Computing walking distances\n", + "Chunhua is on the corner of 7th Avenue and 42nd Street in Midtown Manhattan, and she wants to know how far she'd have to walk to get to Gramercy School on the corner of 10th Avenue and 34th Street.\n", + "\n", + "She can't cut across blocks diagonally, since there are buildings in the way. She has to walk along the sidewalks. Using the map below, she sees she'd have to walk 3 avenues (long blocks) and 8 streets (short blocks). In terms of the given numbers, she computed 3 as the difference between 7 and 10, *in absolute value*, and 8 similarly. \n", + "\n", + "Chunhua also knows that blocks in Manhattan are all about 80m by 274m (avenues are farther apart than streets). So in total, she'd have to walk $(80 \\times |42 - 34| + 274 \\times |7 - 10|)$ meters to get to the school.\n", + "\n", + "\"visual\n", + "\n", + "**Question 3.1.1.**
Finish the line `num_avenues_away = ...` in the next cell so that the cell calculates the distance Chunhua must walk and gives it the name `manhattan_distance`. Everything else has been filled in for you. **Use the `abs` function.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# Here's the number of streets away:\n", + "num_streets_away = abs(42-34)\n", + "\n", + "# Compute the number of avenues away in a similar way:\n", + "num_avenues_away = ...\n", + "\n", + "street_length_m = 80\n", + "avenue_length_m = 274\n", + "\n", + "# Now we compute the total distance Chunhua must walk.\n", + "manhattan_distance = street_length_m*num_streets_away + avenue_length_m*num_avenues_away\n", + "\n", + "# We've included this line so that you see the distance\n", + "# you've computed when you run this cell. You don't need\n", + "# to change it, but you can if you want.\n", + "manhattan_distance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Be sure to run the next cell to test your code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "check('tests/q311.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Multiple arguments\n", + "Some functions take multiple arguments, separated by commas. For example, the built-in `max` function returns the maximum argument passed to it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "max(2, -3, 4, -5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Understanding nested expressions\n", + "Function calls and arithmetic expressions can themselves contain expressions. 
You saw an example in the last question:\n", + "\n", + " abs(42-34)\n", + "\n", + "has 2 number expressions in a subtraction expression in a function call expression. And you probably wrote something like `abs(7-10)` to compute `num_avenues_away`.\n", + "\n", + "Nested expressions can turn into complicated-looking code. However, the way in which complicated expressions break down is very regular.\n", + "\n", + "Suppose we are interested in heights that are very unusual. We'll say that a height is unusual to the extent that it's far away on the number line from the average human height. [An estimate](http://press.endocrine.org/doi/full/10.1210/jcem.86.9.7875?ck=nck&) of the average adult human height (averaging, we hope, over all humans on Earth today) is 1.688 meters.\n", + "\n", + "So if Aditya is 1.21 meters tall, then his height is $|1.21 - 1.688|$, or $.478$, meters away from the average. Here's a picture of that:\n", + "\n", + "\"number\n", + "\n", + "And here's how we'd write that in one line of Python code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "abs(1.21 - 1.688)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What's going on here? `abs` takes just one argument, so the stuff inside the parentheses is all part of that *single argument*. Specifically, the argument is the value of the expression `1.21 - 1.688`. The value of that expression is `-.478`. That value is the argument to `abs`. The absolute value of that is `.478`, so `.478` is the value of the full expression `abs(1.21 - 1.688)`.\n", + "\n", + "Picture simplifying the expression in several steps:\n", + "\n", + "1. `abs(1.21 - 1.688)`\n", + "2. `abs(-.478)`\n", + "3. `.478`\n", + "\n", + "In fact, that's basically what Python does to compute the value of the expression." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 4.1.**
Say that Botan's height is 1.85 meters. In the next cell, use `abs` to compute the absolute value of the difference between Botan's height and the average human height. Give that value the name `botan_distance_from_average_m`.\n", + "\n", + "\"number" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# Replace the ... with an expression to compute the absolute\n", + "# value of the difference between Botan's height (1.85m) and\n", + "# the average human height.\n", + "botan_distance_from_average_m = ...\n", + "\n", + "# Again, we've written this here so that the distance you\n", + "# compute will get printed when you run this cell.\n", + "botan_distance_from_average_m" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "check('tests/q41.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.1. More nesting\n", + "Now say that we want to compute the most unusual height among Aditya's and Botan's heights. We'll use the function `max`, which (again) takes two numbers as arguments and returns the larger of the two arguments. 
Combining that with the `abs` function, we can compute the biggest distance from the average among the two heights:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# Just read and run this cell.\n", + "\n", + "aditya_height_m = 1.21\n", + "botan_height_m = 1.85\n", + "average_adult_human_height_m = 1.688\n", + "\n", + "# The biggest distance from the average human height, among the two heights:\n", + "biggest_distance_m = max(abs(aditya_height_m - average_adult_human_height_m), abs(botan_height_m - average_adult_human_height_m))\n", + "\n", + "# Print out our results in a nice readable format:\n", + "print(\"The biggest distance from the average height among these two people is\", biggest_distance_m, \"meters.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The line where `biggest_distance_m` is computed looks complicated, but we can break it down into simpler components just like we did before.\n", + "\n", + "The basic recipe is repeated simplification of small parts of the expression:\n", + "* We start with the simplest components whose values we know, like plain names or numbers. (Examples: `aditya_height_m` or `5`.)\n", + "* **Find a simple-enough group of expressions:** We look for a group of simple expressions that are directly connected to each other in the code, for example by arithmetic or as arguments to a function call.\n", + "* **Evaluate that group:** We evaluate the arithmetic expressions or function calls they're part of, and replace the whole group with whatever we compute. (Example: `aditya_height_m - average_adult_human_height_m` becomes `-.478`.)\n", + "* **Repeat:** We continue this process, using the values of the glommed-together stuff as our new basic components. 
(Example: `abs(-.478)` becomes `.478`, and `max(.478, .162)` later becomes `.478`.)\n", + "* We keep doing that until we've evaluated the whole expression.\n", + "\n", + "You can run the next cell to see a slideshow of that process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "from IPython.display import IFrame\n", + "IFrame('https://docs.google.com/presentation/d/1urkX-nRsD8VJvcOnJsjmCy0Jpv752Ssn5Pphg2sMC-0/embed?start=false&loop=false&delayms=3000', 800, 600)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ok, your turn. \n", + "\n", + "**Question 4.1.1.**
Given the heights of the Splash Triplets from the Golden State Warriors, write an expression that computes the smallest difference between any of the three heights. Your expression shouldn't have any numbers in it, only function calls and the names `klay`, `steph`, and `kevin`. Give the value of your expression the name `min_height_difference`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# The three players' heights, in meters:\n", + "klay = 2.01 # Klay Thompson is 6'7\"\n", + "steph = 1.91 # Steph Curry is 6'3\"\n", + "kevin = 2.06 # Kevin Durant is officially 6'9\", but many suspect that he is taller.\n", + " # (Further complicating matters, membership of the \"Splash Triplets\" \n", + " # is disputed, since it was originally used in reference to \n", + " # Klay Thompson, Steph Curry, and Draymond Green.)\n", + "\n", + "# We'd like to look at all 3 pairs of heights, compute the absolute\n", + "# difference between each pair, and then find the smallest of those\n", + "# 3 absolute differences. This is left to you! If you're stuck,\n", + "# try computing the value for each step of the process (like the\n", + "# difference between Klay's height and Steph's height) on a separate\n", + "# line and giving it a name (like klay_steph_height_diff).\n", + "min_height_difference = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "check('tests/q411.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Tables" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A website called [Gapminder](https://www.gapminder.org/) collects a large variety of measurements of human health, education, and progress. 
Each measurement is published in a table that has one row per country and one column per year, describing how the measurement varies over time and place.\n", + "\n", + "For example, [this table](https://docs.google.com/spreadsheets/d/1kmnYQzXLGVF9RbKB3Y-WuUsJFumnE4s2UWdmlskv6r4/pub#) describes the average number of years of school attended by all women 25 and older. The table has a row for each of 175 countries and a column for each year from 1970 through 2009. The data were estimated for a study by the [Institute for Health Metrics and Evaluation](http://www.healthmetricsandevaluation.org/) called \"Increased educational attainment and its impact on child mortality: a systematic analysis in 175 countries from 1970 to 2009\" ([link](http://www.healthmetricsandevaluation.org/resources/datasets/2010/education_attainment/education_attainment.html)).\n", + "\n", + "To load tables into Python, you must first import the `datascience` module. The second line below makes sure that charts appear on the screen when you create them. You only need to execute these lines once per notebook (and each time you restart your kernel)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Don't change this cell\n", + "from datascience import *\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, run the next cell in order to load the table describing years of school attended by women around the world and over time. Only the first 10 rows of the table will be displayed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "school = Table.read_table('school.csv')\n", + "school" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.1.**\n", + "Assign the name `top_1970` to a two-column table that has the column of country names (labeled `\"Row Labels\"`) and the years in school in 1970, sorted by the second column in decreasing order. Notice the large difference between the country with the most average years of school and the rest in the top 10.\n", + "\n", + "*Hint*: Even though 1970 is a number, treat it as text by placing it within quotation marks when using it as a label. For example, `school.select(\"1970\")` rather than `school.select(1970)`. Column labels are always text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "top_1970 = ...\n", + "top_1970" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q51.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can create a bar chart of all the countries in the data set using the expression below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "top_1970.barh('Row Labels')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.2** Now, to see how much these numbers have changed, assign `top_1970_with_2009` to a table with the rows in the same order, but include a third column for 2009 as well. The differences between countries are much smaller in 2009." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "top_1970_with_2009 = ...\n", + "top_1970_with_2009" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q52.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A bar chart for this three-column table will compare 1970 to 2009 for each country. Everywhere in the world, the average number of years that women attend school has increased, in some cases dramatically!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "top_1970_with_2009.barh('Row Labels')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The [Gapminder data browser](https://www.gapminder.org/data/) includes many other tables that you can explore as well. For more information on how to load a table from the web, try the course discussion forum." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Completion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congratulations, you're done with lab 1! You can check that all tests pass by running the next cell. If all the tests are passing in your notebook when we score everybody's assignment, then you will receive full credit." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab01.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, select **Save and Checkpoint** from the `File` menu below the Jupyter logo to save your work." + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/1/lab01/lab01.ipynb b/materials/x18/lab/1/lab01/lab01.ipynb index ea1b5b9..76b9247 100644 --- a/materials/x18/lab/1/lab01/lab01.ipynb +++ b/materials/x18/lab/1/lab01/lab01.ipynb @@ -24,6 +24,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -42,6 +43,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -72,6 +74,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -90,24 +93,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "-723.0" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - 
"2+6*5-6*3**2*2**3/4*7" + "3+6*5-6*3**2*2**3/4*7" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "2019.0" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "2+(6*5-(6*3))**2*((2**3)/4*7)" + "3+(6*5-(6*3))**2*((2**3)/4*7)" ] }, { @@ -116,13 +141,13 @@ "source": [ "In standard math notation, the first expression is\n", "\n", - "$$2 + 6 \\times 5 - 6 \\times 3^2 \\times \\frac{2^3}{4} \\times 7,$$\n", + "$$3 + 6 \\times 5 - 6 \\times 3^2 \\times \\frac{2^3}{4} \\times 7,$$\n", "\n", "while the second expression is\n", "\n", - "$$2 + (6 \\times 5 - (6 \\times 3))^2 \\times (\\frac{(2^3)}{4} \\times 7).$$\n", + "$$3 + (6 \\times 5 - (6 \\times 3))^2 \\times (\\frac{(2^3)}{4} \\times 7).$$\n", "\n", - "**Question 1.1.1.**
Write a Python expression in this next cell that's equal to $5 \\times (3 \\frac{10}{11}) - 49 \\frac{1}{3} + 2^{.5 \\times 22} - \\frac{7}{33}$. That's five times three and ten elevenths, minus 49 and a third, plus two to the power of half of 22, minus 7 33rds. By \"$3 \\frac{10}{11}$\" we mean $3+\\frac{10}{11}$, not $3 \\times \\frac{10}{11}$.\n", + "**Question 1.1.1.**
Write a Python expression in this next cell that's equal to $5 \\times (3 \\frac{10}{11}) - 49 \\frac{1}{3} + 2^{.5 \\times 22} + \\frac{26}{33}$. That's five times three and ten elevenths, minus 49 and a third, plus two to the power of half of 22, plus 26 33rds. By \"$3 \\frac{10}{11}$\" we mean $3+\\frac{10}{11}$, not $3 \\times \\frac{10}{11}$.\n", "\n", "Replace the ellipses (`...`) with your expression. Try to use parentheses only when necessary.\n", "\n", @@ -133,11 +158,12 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], "source": [ - "..." + "..." ] }, { @@ -156,6 +182,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -176,6 +203,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -202,6 +230,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -218,6 +247,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -237,6 +267,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -261,6 +292,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -294,6 +326,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -322,7 +355,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# These lines load the tests.\n", @@ -344,6 +379,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -381,6 +417,7 @@ "cell_type": "code", "execution_count": 
null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -432,6 +469,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -449,6 +487,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -469,6 +508,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -503,6 +543,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -525,6 +566,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -536,6 +578,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -563,6 +606,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -596,6 +640,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -615,6 +660,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -648,6 +694,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -683,6 +730,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -701,6 +749,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -720,6 +769,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -757,6 +807,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -778,6 +829,7 @@ 
"cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -803,6 +855,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -831,7 +884,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Don't change this cell\n", @@ -849,7 +904,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "school = Table.read_table('school.csv')\n", @@ -869,7 +926,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "top_1970 = ...\n", @@ -879,7 +938,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q51.py')" @@ -896,6 +957,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": false }, "outputs": [], @@ -913,7 +975,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "top_1970_with_2009 = ...\n", @@ -923,7 +987,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q52.py')" @@ -940,6 +1006,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": false }, "outputs": [], @@ -972,6 +1039,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": false }, "outputs": [], @@ -1008,10 +1076,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git 
a/materials/x18/lab/1/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb b/materials/x18/lab/1/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb new file mode 100644 index 0000000..d5184af --- /dev/null +++ b/materials/x18/lab/1/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb @@ -0,0 +1,1548 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 2: Data Types, Arrays, and Tables\n", + "Welcome to Lab 2! \n", + "\n", + "Last time, we had our first look at Python and Jupyter notebooks. So far, we've only used Python to manipulate numbers. There's a lot more to life than numbers, so Python lets us represent many other types of data in programs.\n", + "\n", + "In this lab, you'll first see how to represent and manipulate another fundamental type of data: text. A piece of text is called a *string* in Python.\n", + "\n", + "You'll also see how to invoke *methods*. A method is very similar to a function. Calling a method looks different because the method is tied to a particular piece of data.\n", + "\n", + "Last, you'll learn more about working with datasets in Python." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, initialize the grader. Each time you come back to this site to work on the lab, you will need to run this cell again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Review: The building blocks of Python code\n", + "\n", + "The two building blocks of Python code are *expressions* and *statements*. 
An **expression** is a piece of code that\n", + "\n", + "* is self-contained, meaning it would make sense to write it on a line by itself, and\n", + "* usually has a value.\n", + "\n", + "\n", + "Here are two expressions that both evaluate to 3\n", + "\n", + " 3\n", + " 5 - 2\n", + " \n", + "One important form of an expression is the **call expression**, which first names a function and then describes its arguments. The function returns some value, based on its arguments. Some important mathematical functions are\n", + "\n", + "| Function | Description |\n", + "|----------|---------------------------------------------------------------|\n", + "| `abs` | Returns the absolute value of its argument |\n", + "| `max` | Returns the maximum of all its arguments |\n", + "| `min` | Returns the minimum of all its arguments |\n", + "| `pow` | Raises its first argument to the power of its second argument |\n", + "| `round` | Rounds its argument to the nearest integer |\n", + "\n", + "Here are two call expressions that both evaluate to 3\n", + "\n", + " abs(2 - 5)\n", + " max(round(2.8), min(pow(2, 10), -1 * pow(2, 10)))\n", + "\n", + "All these expressions but the first are **compound expressions**, meaning that they are actually combinations of several smaller expressions. `5 - 2` combines the expressions `5` and `2` by subtraction. In this case, `5` and `2` are called **subexpressions** because they're expressions that are part of a larger expression. Any expression can be used as part of a larger expression.\n", + "\n", + "A **statement** is a piece of code that *makes something happen* rather than *having a value*. For example, an **assignment statement** assigns a value to a name. \n", + "\n", + "Every assignment statement has one `=` sign. The whole statement is executed by **evaluating the expression on the right-hand side** of the equals sign and then **assigning its value to the name on the left-hand side**. 
Here are some assignment statements:\n", + " \n", + " height = 1.3\n", + " the_number_five = abs(-5)\n", + " absolute_height_difference = abs(height - 1.688)\n", + "\n", + "A key idea in programming is that large, interesting things can be built by combining many simple, uninteresting things. The key to understanding a complicated piece of code is breaking it down into its simple components.\n", + "\n", + "For example, a lot is going on in the last statement above, but it's really just a combination of a few things. This picture describes what's going on.\n", + "\n", + "\"Explanation\n", + "\n", + "Any names that you assign in one cell are available in later cells and can be used in place of the value assigned to them." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.1.**
In the next cell, assign the name `new_year` to the larger number among the following two numbers:\n", + "\n", + "1. the absolute value of $2^{5}-2^{11}-2^{1}-2^{0}$, and \n", + "2. $5 \\times 13 \\times 31 + 4$.\n", + "\n", + "Try to use just one statement (one line of code)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "new_year = ...\n", + "new_year" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check your work by executing the next cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q11.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Text\n", + "Programming doesn't just concern numbers. Text is one of the most common types of values used in programs. \n", + "\n", + "A snippet of text is represented by a **string value** in Python. The word \"*string*\" is a programming term for a sequence of characters. A string might contain a single character, a word, a sentence, or a whole book.\n", + "\n", + "To distinguish text data from actual code, we demarcate strings by putting quotation marks around them. Single quotes (`'`) and double quotes (`\"`) are both valid, but the types of opening and closing quotation marks must match. The contents can be any sequence of characters, including numbers and symbols. \n", + "\n", + "We've seen strings before in `print` statements. Below, two different strings are passed as arguments to the `print` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"I <3\", 'Data Science')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just like names can be given to numbers, names can be given to string values. 
The names and strings aren't required to be similar in any way. Any name can be assigned to any string." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "one = 'two'\n", + "plus = '*'\n", + "print(one, plus, one)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.1.**
Yuri Gagarin was the first person to travel through outer space. When he emerged from his capsule upon landing on Earth, he [reportedly](https://en.wikiquote.org/wiki/Yuri_Gagarin) had the following conversation with a woman and girl who saw the landing:\n", + "\n", + " The woman asked: \"Can it be that you have come from outer space?\"\n", + " Gagarin replied: \"As a matter of fact, I have!\"\n", + "\n", + "The cell below contains unfinished code. Fill in the `...`s so that it prints out this conversation *exactly* as it appears above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "woman_asking = ...\n", + "woman_quote = '\"Can it be that you have come from outer space?\"'\n", + "gagarin_reply = 'Gagarin replied:'\n", + "gagarin_quote = ...\n", + "\n", + "print(woman_asking, woman_quote)\n", + "print(gagarin_reply, gagarin_quote)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q21.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1. String Methods" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Strings can be transformed using **methods**, which are functions that involve an existing string and some other arguments. One example is the `replace` method, which replaces all instances of some part of a string with some alternative. \n", + "\n", + "A method is invoked on a string by placing a `.` after the string value, then the name of the method, and finally parentheses containing the arguments. 
Here's a sketch, where the `<` and `>` symbols aren't part of the syntax; they just mark the boundaries of sub-expressions.\n", + "\n", + "    <expression that evaluates to a string>.<method name>(<argument>, <argument>, ...)\n", + "\n", + "Try to predict the output of these examples, then execute them."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Replace one letter\n", + "'Hello'.replace('H', 'C')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Replace a sequence of letters, which appears twice\n", + "'hitchhiker'.replace('hi', 'ma')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once a name is bound to a string value, methods can be invoked on that name as well. The name is still bound to the original string, so a new name is needed to capture the result. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sharp = 'edged'\n", + "hot = sharp.replace('ed', 'ma')\n", + "print('sharp:', sharp)\n", + "print('hot:', hot)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can call functions on the results of other functions. For example,\n", + "\n", + " max(abs(-5), abs(3))\n", + "\n", + "has value 5. Similarly, you can invoke methods on the results of other method (or function) calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Calling replace on the output of another call to replace\n", + "'train'.replace('t', 'ing').replace('in', 'de')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's a picture of how Python evaluates a \"chained\" method call like that:\n", + "\n", + "\"In" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.1.1.**
Assign strings to the names `you` and `this` so that the final expression evaluates to a 10-letter English word with three double letters in a row.\n", + "\n", + "*Hint:* The call to `print` is there to print out the intermediate result called `the`. This should be an English word with two double letters in a row.\n", + "\n", + "*Hint 2:* Run the tests if you're stuck. They'll give you some hints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "you = ...\n", + "this = ...\n", + "a = 'beeper'\n", + "the = a.replace('p', you) \n", + "print('the:', the)\n", + "the.replace('bee', this)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q211.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Other string methods do not take any arguments at all, because the original string is all that's needed to compute the result. In these cases, parentheses are still needed, but there's nothing in between the parentheses. Here are some methods that take no arguments:\n", + "\n", + "|Method name|Value|\n", + "|-|-|\n", + "|`lower`|a lowercased version of the string|\n", + "|`upper`|an uppercased version of the string|\n", + "|`capitalize`|a version with the first letter capitalized|\n", + "|`title`|a version with the first letter of every word capitalized||\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "'unIverSITy of caliFORnia'.title()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All these string methods are useful, but most programmers don't memorize their names or how to use them. Instead, people usually just search the internet for documentation and examples. 
A complete [list of string methods](https://docs.python.org/3/library/stdtypes.html#string-methods) appears in the Python language documentation. [Stack Overflow](http://stackoverflow.com) has a huge database of answered questions that often demonstrate how to use these methods to achieve various ends." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2. Converting to and from Strings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Strings and numbers are different *types* of values, even when a string contains the digits of a number. For example, evaluating the following cell causes an error because an integer cannot be added to a string." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "8 + \"8\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, there are built-in functions to convert numbers to strings and strings to numbers. \n", + "\n", + "|Function name|Effect|Example|\n", + "|-|-|-|\n", + "|`int` |Converts a string of digits and perhaps a negative sign to an integer (`int`) value|`int(\"42\")`|\n", + "|`float`|Converts a string of digits and perhaps a negative sign and decimal point to a decimal (`float`) value|`float(\"4.2\")`|\n", + "|`str` | Converts any value to a string (`str`) value|`str(42)`|" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Try to predict what the following cell will evaluate to, then evaluate it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "8 + int(\"8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose you're writing a program that looks for dates in a text, and you want your program to find the amount of time that elapsed between two years it has identified. 
It doesn't make sense to subtract two texts, but you can first convert the text containing the years into numbers.\n", + "\n", + "**Question 2.2.1.**
Finish the code below to compute the number of years that elapsed between `one_year` and `another_year`. Don't just write the numbers `1618` and `1648` (or `30`); use a conversion function to turn the given text data into numbers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Some text data:\n", + "one_year = \"1618\"\n", + "another_year = \"1648\"\n", + "\n", + "# Complete the next line. Note that we can't just write:\n", + "# another_year - one_year\n", + "# If you don't see why, try seeing what happens when you\n", + "# write that here.\n", + "difference = ...\n", + "difference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q221.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.2.2.** Use `replace` and `int` together to compute the difference between the year 753 BC ([the founding of Rome](https://en.wikipedia.org/wiki/Ancient_Rome)) and the year 410 AD ([the sack of Rome](https://en.wikipedia.org/wiki/Sack_of_Rome_(410))). Try not to use any numbers in your solution, but instead manipulate the strings that are provided.\n", + "\n", + "*Hint*: It's ok to be off by one year. In historical calendars, there is no year zero, but astronomical calendars do include [year zero](https://en.wikipedia.org/wiki/Year_zero) to simplify calculations."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "founded = 'BC 753'\n", + "sacked = 'AD 410'\n", + "start = ...\n", + "end = ...\n", + "print('Ancient Rome lasted for about', end-start, 'years from', founded, 'to', sacked)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q222.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3. Strings as function arguments\n", + "\n", + "String values, like numbers, can be arguments to functions and can be returned by functions. The function `len` takes a single string as its argument and returns the number of characters in the string: its **len**gth. \n", + "\n", + "Note that it doesn't count *words*. `len(\"one small step for man\")` is 22, not 5.\n", + "\n", + "**Question 2.3.1.**
Use `len` to find out the number of characters in the very long string in the next cell. (It's the first sentence of the English translation of the French [Declaration of the Rights of Man](http://avalon.law.yale.edu/18th_century/rightsof.asp).) The length of a string is the total number of characters in it, including things like spaces and punctuation. Assign `sentence_length` to that number." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "a_very_long_sentence = \"The representatives of the French people, organized as a National Assembly, believing that the ignorance, neglect, or contempt of the rights of man are the sole cause of public calamities and of the corruption of governments, have determined to set forth in a solemn declaration the natural, unalienable, and sacred rights of man, in order that this declaration, being constantly before all the members of the Social body, shall remind them continually of their rights and duties; in order that the acts of the legislative power, as well as those of the executive power, may be compared at any moment with the objects and purposes of all political institutions and may thus be more respected, and, lastly, in order that the grievances of the citizens, based hereafter upon simple and incontestable principles, shall tend to the maintenance of the constitution and redound to the happiness of all.\"\n", + "sentence_length = ...\n", + "sentence_length" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q231.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Importing code\n", + "\n", + "> What has been will be again, \n", + "> what has been done will be done again; \n", + "> there is nothing new under the sun.\n", + "\n", + "Most programming involves work that is very similar to work that has been done before. Since writing code is time consuming, it's good to rely on others' published code when you can. Rather than copy-pasting, Python allows us to **import** other code, creating a **module** that contains all of the names created by that code.\n", + "\n", + "Python includes many useful modules that are just an `import` away. We'll look at the `math` module as a first example. The `math` module is extremely useful in computing mathematical expressions in Python. \n", + "\n", + "Suppose we want to very accurately compute the area of a circle with radius 5 meters. For that, we need the constant $\\pi$, which is roughly 3.14. Conveniently, the `math` module has `pi` defined for us:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import math\n", + "radius = 5\n", + "area_of_circle = radius**2 * math.pi\n", + "area_of_circle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`pi` is defined inside `math`, and the way that we access names that are inside modules is by writing the module's name, then a dot, then the name of the thing we want:\n", + "\n", + "    <module name>.<name>\n", + " \n", + "In order to use a module at all, we must first write the statement `import <module name>`. That statement creates a module object with things like `pi` in it and then assigns the name `math` to that module. Above we have done that for `math`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.1.**
`math` also provides the name `e` for the base of the natural logarithm, which is roughly 2.71. Compute $e^{\\pi}-\\pi$, giving it the name `near_twenty`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "near_twenty = ...\n", + "near_twenty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q31.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![XKCD](http://imgs.xkcd.com/comics/e_to_the_pi_minus_pi.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1. Importing functions\n", + "\n", + "**Modules** can provide other named things, including **functions**. For example, `math` provides the name `sin` for the sine function. Having imported `math` already, we can write `math.sin(3)` to compute the sine of 3. (Note that this sine function considers its argument to be in [radians](https://en.wikipedia.org/wiki/Radian), not degrees. 180 degrees are equivalent to $\\pi$ radians.)\n", + "\n", + "**Question 3.1.1.**
A $\\frac{\\pi}{4}$-radian (45-degree) angle forms a right triangle with equal base and height, pictured below. If the hypotenuse (the radius of the circle in the picture) is 1, then the height is $\\sin(\\frac{\\pi}{4})$. Compute that using `sin` and `pi` from the `math` module. Give the result the name `sine_of_pi_over_four`.\n", + "\n", + "\n", + "(Source: [Wolfram MathWorld](http://mathworld.wolfram.com/images/eps-gif/TrigonometryAnglesPi4_1000.gif))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sine_of_pi_over_four = ...\n", + "sine_of_pi_over_four" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q311.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For your reference, here are some more examples of functions from the `math` module.\n", + "\n", + "Note how different methods take in different numbers of arguments. Often, the documentation of the module will provide information on how many arguments are required for each method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Calculating factorials.\n", + "math.factorial(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Calculating logarithms (the logarithm of 8 in base 2).\n", + "# The result is 3 because 2 to the power of 3 is 8.\n", + "math.log(8, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Calculating square roots.\n", + "math.sqrt(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are many variations of how we can import methods from outside sources. 
For example, we can import just a specific method from an outside source, we can rename a library we import, and we can import every single method from a whole library. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Importing just cos and pi from math.\n", + "# Now, we don't have to use \"math.\" before these names.\n", + "from math import cos, pi\n", + "print(cos(pi))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# We can nickname math as something else, if we don't want to type the name math\n", + "import math as m\n", + "m.log(m.pi)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Lastly, we can import everything from math and use all of its names without \"math.\"\n", + "from math import *\n", + "log(pi)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### A function that displays a picture\n", + "People have written Python functions that do very cool and complicated things, like crawling web pages for data, transforming videos, or learning functions from data. Now that you can import things, when you want to do something with code, first check to see if someone else has done it for you.\n", + "\n", + "Let's see an example of a function that's used for downloading and displaying pictures.\n", + "\n", + "The module `IPython.display` provides a function called `Image`. The `Image` function takes a single argument, a string that is the URL of the image on the web. It returns an *image* value that this Jupyter notebook understands how to display. To display an image, make it the value of the last expression in a cell, just like you'd display a number or a string.\n", + "\n", + "**Question 3.1.2.**
In the next cell, import the module `IPython.display` and use its `Image` function to display the image at this URL:\n", + "\n", + " https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/David_-_The_Death_of_Socrates.jpg/1024px-David_-_The_Death_of_Socrates.jpg\n", + "\n", + "Give the name `art` to the output of the call to `Image`. (It might take a few seconds to load the image. It's a painting called *The Death of Socrates* by Jacques-Louis David, depicting events from a philosophical text by Plato.)\n", + "\n", + "*Hint*: A link isn't any special type of data type in Python. You can't just write a link into Python and expect it to work; you need to type the link in as a specific data type. Which one makes the most sense?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Import the module IPython.display. Watch out for capitalization.\n", + "import IPython.display\n", + "# Replace the ... with a call to the Image function\n", + "# in the IPython.display module, which should produce\n", + "# a picture.\n", + "art = ...\n", + "art" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q312.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Arrays\n", + "\n", + "Up to now, we haven't done much that you couldn't do yourself by hand, without going through the trouble of learning Python. Computers are most useful when a small amount of code performs a lot of work by *performing the same action* to *many different things*.\n", + "\n", + "For example, in the time it takes you to calculate the 18% tip on a restaurant bill, a laptop can calculate 18% tips for every restaurant bill paid by every human on Earth that day. 
(That's if you're pretty fast at doing arithmetic in your head!)\n", + "\n", + "**Arrays** are how we put many values in one place so that we can operate on them as a group. For example, if `billions_of_numbers` is an array of numbers, the expression\n", + "\n", + " .18 * billions_of_numbers\n", + "\n", + "gives a new array of numbers that's the result of multiplying each number in `billions_of_numbers` by .18 (18%). Arrays are not limited to numbers; we can also put all the words in a book into an array of strings.\n", + "\n", + "Concretely, an array is a **collection of values of the same type**, like a column in an Excel spreadsheet. \n", + "\n", + "\"In" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.1. Making arrays\n", + "You can type in the data that goes in an array yourself, but that's not typically how programs work. Normally, we create arrays by loading them from an external source, like a data file.\n", + "\n", + "First, though, let's learn how to start from scratch. Execute the following cell so that all the names from the `datascience` module are available to you. The documentation for this module is available at [http://data8.org/datascience](http://data8.org/datascience/)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from datascience import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, to create an array, call the function `make_array`. Each argument you pass to `make_array` will be in the array it returns. 
Run this cell to see an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "make_array(0.125, 4.75, -1.3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each value in an array (in the above case, the numbers 0.125, 4.75, and -1.3) is called an *element* or *item* of that array.\n", + "\n", + "Arrays themselves are also values, just like numbers and strings. That means you can assign them names or use them as arguments to functions.\n", + "\n", + "**Question 4.1.1.**
Make an array containing the numbers 1, 2, and 3, in that order. Name it `small_numbers`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "small_numbers = ...\n", + "small_numbers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q411.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 4.1.2.**
Make an array containing the numbers 0, 1, -1, $\\pi$, and $e$, in that order. Name it `interesting_numbers`. *Hint:* How did you get the values $\\pi$ and $e$ earlier? You can refer to them in exactly the same way here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "interesting_numbers = ...\n", + "interesting_numbers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q412.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 4.1.3.**
Make an array containing the five strings `\"Hello\"`, `\",\"`, `\" \"`, `\"world\"`, and `\"!\"`. (The third one is a single space inside quotes.) Name it `hello_world_components`.\n", + "\n", + "*Note:* If you print `hello_world_components`, you'll notice some extra information in addition to its contents: `dtype=' Assign `separator` to a string so that the name `hello` is bound to the string `'Hello, world!'` in the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "separator = ...\n", + "hello = separator.join(hello_world_components)\n", + "hello" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q414.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.1.1. `np.arange`\n", + "Arrays are provided by a package called [NumPy](http://www.numpy.org/) (pronounced \"NUM-pie\" or, if you prefer to pronounce things incorrectly, \"NUM-pee\"). The package is called `numpy`, but it's standard to rename it `np` for brevity. You can do that with:\n", + "\n", + " import numpy as np\n", + "\n", + "Very often in data science, we want to work with many numbers that are evenly spaced within some range. NumPy provides a special function for this called `arange`. `np.arange(start, stop, space)` produces an array with all the numbers starting at `start` and counting up by `space`, stopping before `stop` is reached.\n", + "\n", + "For example, the value of `np.arange(1, 6, 2)` is an array with elements 1, 3, and 5 -- it starts at 1 and counts up by 2, then stops before 6. In other words, it's equivalent to `make_array(1, 3, 5)`.\n", + "\n", + "`np.arange(4, 9, 1)` is an array with elements 4, 5, 6, 7, and 8. (It doesn't contain 9 because `np.arange` stops *before* the stop value is reached.)\n", + "\n", + "**Question 4.1.1.1.**
Import `numpy` as `np` and then use `np.arange` to create an array with the multiples of 99 from 0 up to (**and including**) 9999. (So its elements are 0, 99, 198, 297, etc.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "...\n", + "multiples_of_99 = ...\n", + "multiples_of_99" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q4111.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Temperature readings\n", + "NOAA (the US National Oceanic and Atmospheric Administration) operates weather stations that measure surface temperatures at different sites around the United States. The hourly readings are [publicly available](http://www.ncdc.noaa.gov/qclcd/QCLCD?prior=N).\n", + "\n", + "Suppose we download all the hourly data from the Oakland, California site for the month of December 2015. To analyze the data, we want to know when each reading was taken, but we find that the data don't include the timestamps of the readings (the time at which each one was taken).\n", + "\n", + "However, we know the first reading was taken at the first instant of December 2015 (midnight on December 1st) and each subsequent reading was taken exactly 1 hour after the last.\n", + "\n", + "**Question 4.1.1.2.**
Create an array of the *time, in seconds, since the start of the month* at which each hourly reading was taken. Name it `collection_times`.\n", + "\n", + "*Hint 1:* There were 31 days in December, which is equivalent to ($31 \\times 24$) hours or ($31 \\times 24 \\times 60 \\times 60$) seconds. So your array should have $31 \\times 24$ elements in it.\n", + "\n", + "*Hint 2:* The `len` function works on arrays, too. If your `collection_times` isn't passing the tests, check its length and make sure it has $31 \\times 24$ elements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "collection_times = ...\n", + "collection_times" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q4112.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2. Working with single elements of arrays (\"indexing\")\n", + "Let's work with a more interesting dataset. The next cell creates an array called `population` that includes estimated world populations in every year from **1950** to roughly the present. (The estimates come from the [US Census Bureau website](http://www.census.gov/population/international/data/worldpop/table_population.php).)\n", + "\n", + "Rather than type in the data manually, we've loaded them from a file on your computer called `world_population.csv`. You'll learn how to do that next week." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Don't worry too much about what goes on in this cell.\n", + "from datascience import *\n", + "population = Table.read_table(\"world_population.csv\").column(\"Population\")\n", + "population" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's how we get the first element of `population`, which is the world population in the first year in the dataset, 1950." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "population.item(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The value of that expression is the number 2557628654 (around 2.5 billion), because that's the first thing in the array `population`.\n", + "\n", + "Notice that we wrote `.item(0)`, not `.item(1)`, to get the first element. This is a weird convention in computer science. 0 is called the *index* of the first item. It's the number of elements that appear *before* that item. So 3 is the index of the 4th item.\n", + "\n", + "Here are some more examples. In the examples, we've given names to the things we get out of `population`. Read and run each cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# The third element in the array is the population\n", + "# in 1952.\n", + "population_1952 = population.item(2)\n", + "population_1952" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# The thirteenth element in the array is the population\n", + "# in 1962 (which is 1950 + 12).\n", + "population_1962 = population.item(12)\n", + "population_1962" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# The 66th element is the population in 2015.\n", + "population_2015 = population.item(65)\n", + "population_2015" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# The array has only 66 elements, so this doesn't work.\n", + "# (There's no element with 66 other elements before it.)\n", + "population_2016 = population.item(66)\n", + "population_2016" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Since make_array returns an array, we can call .item(3)\n", + "# on its output to get its 4th element, just like we\n", + "# \"chained\" together calls to the method \"replace\" earlier.\n", + "make_array(-1, -3, 4, -2).item(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 4.2.1.**
Set `population_1973` to the world population in 1973, by getting the appropriate element from `population` using `item`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "population_1973 = ...\n", + "population_1973" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q421.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.3. Doing something to every element of an array\n", + "Arrays are primarily useful for doing the same operation many times, so we don't often have to use `.item` and work with single elements.\n", + "\n", + "##### Logarithms\n", + "Here is one simple question we might ask about world population:\n", + "\n", + "> How big was the population in *orders of magnitude* in each year?\n", + "\n", + "The logarithm function is one way of measuring how big a number is. The logarithm (base 10) of a number increases by 1 every time we multiply the number by 10. It's like a measure of how many decimal digits the number has, or how big it is in orders of magnitude.\n", + "\n", + "We could try to answer our question like this, using the `log10` function from the `math` module and the `item` method you just saw:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import math\n", + "\n", + "population_1950_magnitude = math.log10(population.item(0))\n", + "population_1951_magnitude = math.log10(population.item(1))\n", + "population_1952_magnitude = math.log10(population.item(2))\n", + "population_1953_magnitude = math.log10(population.item(3))\n", + "..." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But this is tedious and doesn't really take advantage of the fact that we are using a computer.\n", + "\n", + "Instead, NumPy provides its own version of `log10` that takes the logarithm of each element of an array. It takes a single array of numbers as its argument. It returns an array of the same length, where the first element of the result is the logarithm of the first element of the argument, and so on.\n", + "\n", + "**Question 4.3.1.**
Use it to compute the logarithms of the world population in every year. Give the result (an array of 66 numbers) the name `population_magnitudes`. Your code should be very short." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "population_magnitudes = ...\n", + "population_magnitudes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q431.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Elementwise\n", + "\n", + "This is called *elementwise* application of the function, since it operates separately on each element of the array it's called on. The textbook's section on arrays has a useful list of NumPy functions that are designed to work elementwise, like `np.log10`.\n", + "\n", + "##### Arithmetic\n", + "Arithmetic also works elementwise on arrays. For example, you can divide all the population numbers by 1 billion to get numbers in billions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "population_in_billions = population / 1000000000\n", + "population_in_billions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can do the same with addition, subtraction, multiplication, and exponentiation (`**`). 
For example, you can calculate a tip on several restaurant bills at once (in this case just 3):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "restaurant_bills = make_array(20.12, 39.90, 31.01)\n", + "print(\"Restaurant bills:\\t\", restaurant_bills)\n", + "tips = .2 * restaurant_bills\n", + "print(\"Tips:\\t\\t\\t\", tips)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Elementwise\n", + "\n", + "**Question 4.3.2.**
Suppose the total charge at a restaurant is the original bill plus the tip. That means we can multiply the original bill by 1.2 to get the total charge. Compute the total charge for each bill in `restaurant_bills`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "total_charges = ...\n", + "total_charges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q432.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 4.3.3.**
`more_restaurant_bills.csv` contains 100,000 bills! Compute the total charge for each one. How is your code different?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "more_restaurant_bills = Table.read_table(\"more_restaurant_bills.csv\").column(\"Bill\")\n", + "more_total_charges = ...\n", + "more_total_charges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q433.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The function `sum` takes a single array of numbers as its argument. It returns the sum of all the numbers in that array (so it returns a single number, not an array).\n", + "\n", + "**Question 4.3.4.**
What was the sum of all the bills in `more_restaurant_bills`, *including tips*?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sum_of_bills = ...\n", + "sum_of_bills" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q434.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 4.3.5.**
The powers of 2 ($2^0 = 1$, $2^1 = 2$, $2^2 = 4$, etc.) arise frequently in computer science. (For example, you may have noticed that storage on smartphones or USBs comes in powers of 2, like 16 GB, 32 GB, or 64 GB.) Use `np.arange` and the exponentiation operator `**` to compute the first 15 powers of 2, starting from $2^0$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "powers_of_2 = ...\n", + "powers_of_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q435.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Success!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congratulations, you're done with lab 2! Be sure to \n", + "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", + "- **Review the notebook one last time, we will be grading the final state of your notebook after the deadline**,\n", + "- **Save and Checkpoint** from the `File` menu," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab02.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", 
+ "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/1/lab02/lab02.ipynb b/materials/x18/lab/1/lab02/lab02.ipynb index 2ce444e..d5184af 100644 --- a/materials/x18/lab/1/lab02/lab02.ipynb +++ b/materials/x18/lab/1/lab02/lab02.ipynb @@ -91,8 +91,8 @@ "source": [ "**Question 1.1.**
In the next cell, assign the name `new_year` to the larger number among the following two numbers:\n", "\n", - "1. the absolute value of $2^{5}-2^{11}-2^1$, and \n", - "2. $5 \\times 13 \\times 31 + 2$.\n", + "1. the absolute value of $2^{5}-2^{11}-2^{1}-2^{0}$, and \n", + "2. $5 \\times 13 \\times 31 + 4$.\n", "\n", "Try to use just one statement (one line of code)." ] @@ -1483,145 +1483,6 @@ "check('tests/q435.py')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4.4 Example: Growth Rates" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A natural example of how we can use arrays to reduce large amounts of computation is growth rates. \n", - "\n", - "**Question 4.4.1**
Let's say we are investing in stocks, and we initially invest 10.23 dollars into the market. We check back in one year later, and we see that our total money in the market is now 14.32 dollars. What was our annual growth rate?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "annual_growth_rate = ...\n", - "annual_growth_rate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q441.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.4.2**
If we wanted to see multiple people's annual stock growth rates, we could continue the above process per person. However, this can become tedious. \n", - "\n", - "Let's use the power of arrays! Assume that `initials` contains the initial amount of money for 5 different people, and `changed` contains the amount of money after one year for the same corresponding people. Assign `annual_growth_rates` to an array of all of the different growth rates for the 5 people. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "initials = make_array(10.21, 11.32, 15.21, 13.22, 19.10)\n", - "changed = make_array(14.20, 35.44, 10.43, 9.62, 20.10)\n", - "annual_growth_rates = ...\n", - "annual_growth_rates" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q442.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.4.3**
Now, let's use an array arithmetic to deduce the annual growth rate on peoples stocks given the amount of money in their market 10 years from now, found in the variable `ten_years`. Assuming everyone initially started with 10 dollars in their market, calculate the annual growth rate per person over these 10 years and assign this array of values to `annual_rates_over_ten_years`. \n", - "\n", - "*Hint*: If you don't remember this formula, check out the textbook!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "ten_years = make_array(50.32, 1.04, 0.40, 14.50, 11.12)\n", - "annual_rates_over_ten_years = ...\n", - "annual_rates_over_ten_years" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q443.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.4.4**
Lastly, let's use array arithmetic to figure the final amount of money in people's market 10 years from now, assuming they all invested different amounts of money (`invested`) in the same stock, DS8. The annual growth rate for DS8 was .045. Assign `money_in_ten_years` to an array of the money people ended with in the DS8 stock based on how much they initially invested.\n", - "\n", - "*Hint*: If you don't remember this formula, check out the textbook!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "invested = make_array(10,11,15,20,25)\n", - "money_in_ten_years = ...\n", - "money_in_ten_years" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q444.py')" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1642,9 +1503,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# For your convenience, you can run this cell to run all the tests at once!\n", @@ -1653,6 +1512,15 @@ "if not globals().get('__GOFER_GRADER__', False):\n", " display(grade_notebook('lab02.ipynb', sorted(glob.glob('tests/q*.py'))))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { @@ -1672,10 +1540,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/1/lab02/tests/q11.py b/materials/x18/lab/1/lab02/tests/q11.py index 9bbef9e..d639059 100644 --- a/materials/x18/lab/1/lab02/tests/q11.py +++ b/materials/x18/lab/1/lab02/tests/q11.py @@ -7,7 +7,7 @@ { 'code': r""" >>> new_year - 2018 + 2019 """, 'hidden': False, 'locked': False diff --git a/materials/x18/lab/1/lab02/tests/q441.py 
b/materials/x18/lab/1/lab02/tests/q441.py deleted file mode 100644 index c88a80f..0000000 --- a/materials/x18/lab/1/lab02/tests/q441.py +++ /dev/null @@ -1,22 +0,0 @@ -test = { - 'name': '', - 'points': 1, - 'suites': [ - { - 'cases': [ - { - 'code': r""" - >>> round(annual_growth_rate,4) == .3998 - True - """, - 'hidden': False, - 'locked': False - }, - ], - 'scored': True, - 'setup': '', - 'teardown': '', - 'type': 'doctest' - } - ] -} diff --git a/materials/x18/lab/1/lab02/tests/q442.py b/materials/x18/lab/1/lab02/tests/q442.py deleted file mode 100644 index 569aa2b..0000000 --- a/materials/x18/lab/1/lab02/tests/q442.py +++ /dev/null @@ -1,22 +0,0 @@ -test = { - 'name': '', - 'points': 1, - 'suites': [ - { - 'cases': [ - { - 'code': r""" - >>> all(np.round(annual_growth_rates, 3) == np.round((changed/initials)-1, 3)) - True - """, - 'hidden': False, - 'locked': False - }, - ], - 'scored': True, - 'setup': '', - 'teardown': '', - 'type': 'doctest' - } - ] -} diff --git a/materials/x18/lab/1/lab02/tests/q443.py b/materials/x18/lab/1/lab02/tests/q443.py deleted file mode 100644 index d30179e..0000000 --- a/materials/x18/lab/1/lab02/tests/q443.py +++ /dev/null @@ -1,22 +0,0 @@ -test = { - 'name': '', - 'points': 1, - 'suites': [ - { - 'cases': [ - { - 'code': r""" - >>> all(annual_rates_over_ten_years == (ten_years/10)**(1/10) - 1) - True - """, - 'hidden': False, - 'locked': False - }, - ], - 'scored': True, - 'setup': '', - 'teardown': '', - 'type': 'doctest' - } - ] -} diff --git a/materials/x18/lab/1/lab02/tests/q444.py b/materials/x18/lab/1/lab02/tests/q444.py deleted file mode 100644 index e9403b9..0000000 --- a/materials/x18/lab/1/lab02/tests/q444.py +++ /dev/null @@ -1,22 +0,0 @@ -test = { - 'name': '', - 'points': 1, - 'suites': [ - { - 'cases': [ - { - 'code': r""" - >>> all(money_in_ten_years == invested * (1.045)**10) - True - """, - 'hidden': False, - 'locked': False - }, - ], - 'scored': True, - 'setup': '', - 'teardown': '', - 'type': 'doctest' - } - 
] -} diff --git a/materials/x18/lab/1/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb b/materials/x18/lab/1/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb new file mode 100644 index 0000000..db5a783 --- /dev/null +++ b/materials/x18/lab/1/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb @@ -0,0 +1,1071 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 3: Tables\n", + "\n", + "Welcome to lab 3! \n", + "\n", + "This week, we will focus on manipulating tables. Tables are described in [Chapter 6](http://www.inferentialthinking.com/chapters/06/tables.html) of the text.\n", + "\n", + "First, set up the tests and imports by running the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from datascience import *\n", + "\n", + "# These lines load the tests.\n", + "\n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Introduction\n", + "\n", + "For a collection of things in the world, an array is useful for describing a single attribute of each thing. For example, among the collection of US States, an array could describe the land area of each. Tables extend this idea by describing multiple attributes for each element of a collection.\n", + "\n", + "In most data science applications, we have data about many entities, but we also have several kinds of data about each entity.\n", + "\n", + "For example, in the cell below we have two arrays. The first one contains the world population in each year (estimated by the US Census Bureau), and the second contains the years themselves. These elements are in order, so the year and the world population for that year have the same index in their corresponding arrays." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "population_amounts = Table.read_table(\"world_population.csv\").column(\"Population\")\n", + "years = np.arange(1950, 2015+1)\n", + "print(\"Population column:\", population_amounts)\n", + "print(\"Years column:\", years)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose we want to answer this question:\n", + "\n", + "> When did world population cross 6 billion?\n", + "\n", + "You could technically answer this question just from staring at the arrays, but it's a bit convoluted, since you would have to count the position where the population first crossed 6 billion, then find the corresponding element in the years array. In cases like these, it might be easier to put the data into a *`Table`*, a 2-dimensional type of dataset. \n", + "\n", + "The expression below:\n", + "\n", + "- creates an empty table using the expression `Table()`,\n", + "- adds two columns by calling `with_columns` with four arguments,\n", + "- assigns the result to the name `population`, and finally\n", + "- evaluates `population` so that we can see the table.\n", + "\n", + "The strings `\"Year\"` and `\"Population\"` are column labels that we have chosen. The names `population_amounts` and `years` were assigned above to two arrays of the same length. The function `with_columns` (you can find the documentation [here](http://data8.org/datascience/tables.html)) takes in alternating strings (to represent column labels) and arrays (representing the data in those columns), which are all separated by commas." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "population = Table().with_columns(\n", + " \"Population\", population_amounts,\n", + " \"Year\", years\n", + ")\n", + "population" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the data are all together in a single table! It's much easier to parse this data--if you need to know what the population was in 1959, for example, you can tell from a single glance. We'll revisit this table later." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Creating Tables\n", + "\n", + "**Question 2.1.**
In the cell below, we've created 2 arrays. Using the steps above, assign `top_10_movies` to a table that has two columns called \"Rating\" and \"Name\", which hold `top_10_movie_ratings` and `top_10_movie_names` respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "top_10_movie_ratings = make_array(9.2, 9.2, 9., 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.8)\n", + "top_10_movie_names = make_array(\n", + " 'The Shawshank Redemption (1994)',\n", + " 'The Godfather (1972)',\n", + " 'The Godfather: Part II (1974)',\n", + " 'Pulp Fiction (1994)',\n", + " \"Schindler's List (1993)\",\n", + " 'The Lord of the Rings: The Return of the King (2003)',\n", + " '12 Angry Men (1957)',\n", + " 'The Dark Knight (2008)',\n", + " 'Il buono, il brutto, il cattivo (1966)',\n", + " 'The Lord of the Rings: The Fellowship of the Ring (2001)')\n", + "\n", + "top_10_movies = ...\n", + "# We've put this next line here so your table will get printed out when you\n", + "# run this cell.\n", + "top_10_movies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Loading a table from a file\n", + "In most cases, we aren't going to go through the trouble of typing in all the data manually. Instead, we can use our `Table` functions.\n", + "\n", + "`Table.read_table` takes one argument, a path to a data file (a string) and returns a table. There are many formats for data files, but CSV (\"comma-separated values\") is the most common.\n", + "\n", + "**Question 2.2.**
The file `imdb.csv` contains a table of information about the 250 highest-rated movies on IMDb. Load it as a table called `imdb`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "imdb = ...\n", + "imdb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice the part about \"... (240 rows omitted).\" This table is big enough that only a few of its rows are displayed, but the others are still there. 10 are shown, so there are 250 movies total.\n", + "\n", + "Where did `imdb.csv` come from? Take a look at [this lab's folder](./). You should see a file called `imdb.csv`.\n", + "\n", + "Open up the `imdb.csv` file in that folder and look at the format. What do you notice? The `.csv` filename ending says that this file is in the [CSV (comma-separated value) format](http://edoceo.com/utilitas/csv-file-format)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Using lists\n", + "\n", + "A *list* is another Python sequence type, similar to an array. It's different than an array because the values it contains can all have different types. A single list can contain `int` values, `float` values, and strings. Elements in a list can even be other lists! A list is created by giving a name to the list of values enclosed in square brackets and separated by commas. For example, `values_with_different_types = ['data', 8, 8.1]`\n", + "\n", + "Lists can be useful when working with tables because they can describe the contents of one row in a table, which often corresponds to a sequence of values with different types. A list of lists can be used to describe multiple rows.\n", + "\n", + "Each column in a table is a collection of values with the same type (an array). 
If you create a table column from a list, it will automatically be converted to an array. A row, on the other hand, mixes types.\n", + "\n", + "Here's a table from Chapter 5. (Run the cell below.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell to recreate the table\n", + "flowers = Table().with_columns(\n", + " 'Number of petals', make_array(8, 34, 5),\n", + " 'Name', make_array('lotus', 'sunflower', 'rose')\n", + ")\n", + "flowers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.1.**
Create a list that describes a new fourth row of this table. The details can be whatever you want, but the list must contain two values: the number of petals (an `int` value) and the name of the flower (a string). For example, your flower could be \"pondweed\"! (A flower with zero petals)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "my_flower = ...\n", + "my_flower" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.2.**
`my_flower` fits right in to the table from chapter 5. Complete the cell below to create a table of seven flowers that includes your flower as the fourth row followed by `other_flowers`. You can use `with_row` to create a new table with one extra row by passing a list of values and `with_rows` to create a table with multiple extra rows by passing a list of lists of values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Use the method .with_row(...) to create a new table that includes my_flower \n", + "\n", + "four_flowers = ...\n", + "\n", + "# Use the method .with_rows(...) to create a table that \n", + "# includes four_flowers followed by other_flowers\n", + "\n", + "other_flowers = [[10, 'lavender'], [3, 'birds of paradise'], [6, 'tulip']]\n", + "\n", + "seven_flowers = ...\n", + "seven_flowers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Analyzing datasets\n", + "With just a few table methods, we can answer some interesting questions about the IMDb dataset.\n", + "\n", + "If we want just the ratings of the movies, we can get an array that contains the data in that column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "imdb.column(\"Rating\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The value of that expression is an array, exactly the same kind of thing you'd get if you typed in `make_array(8.4, 8.3, 8.3, [etc])`.\n", + "\n", + "**Question 4.1.**
Find the rating of the highest-rated movie in the dataset.\n", + "\n", + "*Hint:* Think back to the functions you've learned about for working with arrays of numbers. Ask for help if you can't remember one that's useful for this." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "highest_rating = ...\n", + "highest_rating" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q4_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's not very useful, though. You'd probably want to know the *name* of the movie whose rating you found! To do that, we can sort the entire table by rating, which ensures that the ratings and titles will stay together. Note that calling sort creates a copy of the table and leaves the original table unsorted." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "imdb.sort(\"Rating\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Well, that actually doesn't help much, either -- we sorted the movies from lowest -> highest ratings. To look at the highest-rated movies, sort in reverse order:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "imdb.sort(\"Rating\", descending=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(The `descending=True` bit is called an *optional argument*. 
It has a default value of `False`, so when you explicitly tell the function `descending=True`, then the function will sort in descending order.)\n", + "\n", + "So there are actually 2 highest-rated movies in the dataset: *The Shawshank Redemption* and *The Godfather*.\n", + "\n", + "Some details about sort:\n", + "\n", + "1. The first argument to `sort` is the name of a column to sort by.\n", + "2. If the column has strings in it, `sort` will sort alphabetically; if the column has numbers, it will sort numerically.\n", + "3. The value of `imdb.sort(\"Rating\")` is a *copy of `imdb`*; the `imdb` table doesn't get modified. For example, if we called `imdb.sort(\"Rating\")`, then running `imdb` by itself would still return the unsorted table.\n", + "4. Rows always stick together when a table is sorted. It wouldn't make sense to sort just one column and leave the other columns alone. For example, in this case, if we sorted just the \"Rating\" column, the movies would all end up with the wrong ratings.\n", + "\n", + "**Question 4.2.**
Create a version of `imdb` that's sorted chronologically, with the earliest movies first. Call it `imdb_by_year`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "imdb_by_year = ...\n", + "imdb_by_year" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q4_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 4.3.**
What's the title of the earliest movie in the dataset? You could just look this up from the output of the previous cell. Instead, write Python code to find out.\n", + "\n", + "*Hint:* Starting with `imdb_by_year`, extract the Title column to get an array, then use `item` to get its first item." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "earliest_movie_title = ...\n", + "earliest_movie_title" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q4_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Finding pieces of a dataset\n", + "Suppose you're interested in movies from the 1940s. Sorting the table by year doesn't help you, because the 1940s are in the middle of the dataset.\n", + "\n", + "Instead, we use the table method `where`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "forties = imdb.where('Decade', are.equal_to(1940))\n", + "forties" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ignore the syntax for the moment. Instead, try to read that line like this:\n", + "\n", + "> Assign the name **`forties`** to a table whose rows are the rows in the **`imdb`** table **`where`** the **`'Decade'`**s **`are` `equal` `to` `1940`**.\n", + "\n", + "**Question 5.1.**
Compute the average rating of movies from the 1940s.\n", + "\n", + "*Hint:* The function `np.average` computes the average of an array of numbers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "average_rating_in_forties = ...\n", + "average_rating_in_forties" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's dive into the details a bit more. `where` takes 2 arguments:\n", + "\n", + "1. The name of a column. `where` finds rows where that column's values meet some criterion.\n", + "2. Something that describes the criterion that the column needs to meet, called a predicate.\n", + "\n", + "To create our predicate, we called the function `are.equal_to` with the value we wanted, 1940. We'll see other predicates soon.\n", + "\n", + "`where` returns a table that's a copy of the original table, but with only the rows that meet the given predicate.\n", + "\n", + "**Question 5.2.**
Create a table called `ninety_nine` containing the movies that came out in the year 1999. Use `where`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "ninety_nine = ...\n", + "ninety_nine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So far we've only been finding where a column is *exactly* equal to a certain value. However, there are many other predicates. Here are a few:\n", + "\n", + "|Predicate|Example|Result|\n", + "|-|-|-|\n", + "|`are.equal_to`|`are.equal_to(50)`|Find rows with values equal to 50|\n", + "|`are.not_equal_to`|`are.not_equal_to(50)`|Find rows with values not equal to 50|\n", + "|`are.above`|`are.above(50)`|Find rows with values above (and not equal to) 50|\n", + "|`are.above_or_equal_to`|`are.above_or_equal_to(50)`|Find rows with values above 50 or equal to 50|\n", + "|`are.below`|`are.below(50)`|Find rows with values below 50|\n", + "|`are.between`|`are.between(2, 10)`|Find rows with values above or equal to 2 and below 10|\n", + "\n", + "The textbook section on selecting rows has more examples.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.3.**
Using `where` and one of the predicates from the table above, find all the movies with a rating higher than 8.5. Put their data in a table called `really_highly_rated`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "really_highly_rated = ...\n", + "really_highly_rated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.4.**
Find the average rating for movies released in the 20th century and the average rating for movies released in the 21st century for the movies in `imdb`.\n", + "\n", + "*Hint*: Think of the steps you need to do (take the average, find the ratings, find movies released in 20th/21st centuries), and try to put them in an order that makes sense." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "average_20th_century_rating = ...\n", + "average_21st_century_rating = ...\n", + "print(\"Average 20th century rating:\", average_20th_century_rating)\n", + "print(\"Average 21st century rating:\", average_21st_century_rating)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The property `num_rows` tells you how many rows are in a table. (A \"property\" is just a method that doesn't need to be called by adding parentheses.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "num_movies_in_dataset = imdb.num_rows\n", + "num_movies_in_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.5.**
Use `num_rows` (and arithmetic) to find the *proportion* of movies in the dataset that were released in the 20th century, and the proportion from the 21st century.\n", + "\n", + "*Hint:* The *proportion* of movies released in the 20th century is the *number* of movies released in the 20th century, divided by the *total number* of movies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "proportion_in_20th_century = ...\n", + "proportion_in_21st_century = ...\n", + "print(\"Proportion in 20th century:\", proportion_in_20th_century)\n", + "print(\"Proportion in 21st century:\", proportion_in_21st_century)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.6.**
Here's a challenge: Find the number of movies that came out in *even* years.\n", + "\n", + "*Hint:* The operator `%` computes the remainder when dividing by a number. So `5 % 2` is 1 and `6 % 2` is 0. A number is even if the remainder is 0 when you divide by 2.\n", + "\n", + "*Hint 2:* `%` can be used on arrays, operating elementwise like `+` or `*`. So `make_array(5, 6, 7) % 2` is `array([1, 0, 1])`.\n", + "\n", + "*Hint 3:* Create a column called \"Year Remainder\" that's the remainder when each movie's release year is divided by 2. Make a copy of `imdb` that includes that column. Then use `where` to find rows where that new column is equal to 0. Then use `num_rows` to count the number of such rows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "num_even_year_movies = ...\n", + "num_even_year_movies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_6.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.7.**
Check out the `population` table from the introduction to this lab. Compute the year when the world population first went above 6 billion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "year_population_crossed_6_billion = ...\n", + "year_population_crossed_6_billion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_7.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Miscellanea\n", + "There are a few more table methods you'll need to fill out your toolbox. The first 3 have to do with manipulating the columns in a table.\n", + "\n", + "The table `farmers_markets.csv` contains data on farmers' markets in the United States (data collected [by the USDA](https://apps.ams.usda.gov/FarmersMarketsExport/ExcelExport.aspx)). Each row represents one such market.\n", + "\n", + "**Question 6.1.**
Load the dataset into a table. Call it `farmers_markets`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "farmers_markets = ...\n", + "farmers_markets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q6_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You'll notice that it has a large number of columns in it!\n", + "\n", + "### `num_columns`\n", + "\n", + "**Question 6.2.**
The table property `num_columns` (example call: `tbl.num_columns`) produces the number of columns in a table. Use it to find the number of columns in our farmers' markets dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "num_farmers_markets_columns = ...\n", + "print(\"The table has\", num_farmers_markets_columns, \"columns in it!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q6_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most of the columns are about particular products -- whether the market sells tofu, pet food, etc. If we're not interested in that stuff, it just makes the table difficult to read. This comes up more than you might think.\n", + "\n", + "### `select`\n", + "\n", + "In such situations, we can use the table method `select` to pare down the columns of a table. It takes any number of arguments. Each should be the name or index of a column in the table. It returns a new table with only those columns in it.\n", + "\n", + "For example, the value of `imdb.select(\"Year\", \"Decade\")` is a table with only the years and decades of each movie in `imdb`.\n", + "\n", + "**Question 6.3.**
Use `select` to create a table with only the name, city, state, latitude ('y'), and longitude ('x') of each market. Call that new table `farmers_markets_locations`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "farmers_markets_locations = ...\n", + "farmers_markets_locations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q6_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `select` is not `column`!\n", + "\n", + "The method `select` is **definitely not** the same as the method `column`.\n", + "\n", + "`farmers_markets.column('y')` is an *array* of the latitudes of all the markets. `farmers_markets.select('y')` is a *table* that happens to contain only 1 column, the latitudes of all the markets.\n", + "\n", + "**Question 6.4.**
Below, we tried using the function `np.average` to find the average latitude ('y') and average longitude ('x') of the farmers' markets in the table, but we screwed something up. Run the cell to see the (somewhat inscrutable) error message that results from calling `np.average` on a table. Then, fix our code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "average_latitude = np.average(farmers_markets.select('y'))\n", + "average_longitude = np.average(farmers_markets.select('x'))\n", + "print(\"The average of US farmers' markets' coordinates is located at (\", average_latitude, \",\", average_longitude, \")\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q6_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `drop`\n", + "\n", + "`drop` serves the same purpose as `select`, but it takes away the columns you list instead of the ones you don't list, leaving all the rest of the columns.\n", + "\n", + "**Question 6.5.**
Suppose you just didn't want the \"FMID\" or \"updateTime\" columns in `farmers_markets`. Create a table that's a copy of `farmers_markets` but doesn't include those columns. Call that table `farmers_markets_without_fmid`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "farmers_markets_without_fmid = ...\n", + "farmers_markets_without_fmid" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q6_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `take`\n", + "Let's find the 5 northernmost farmers' markets in the US. You already know how to sort by latitude ('y'), but we haven't seen how to get the first 5 rows of a table. That's what `take` is for.\n", + "\n", + "The table method `take` takes as its argument an array of numbers. Each number should be the index of a row in the table. It returns a new table with only those rows.\n", + "\n", + "Most often you'll want to use `take` in conjunction with `np.arange` to take the first few rows of a table.\n", + "\n", + "**Question 6.6.**
Make a table of the 5 northernmost farmers' markets in `farmers_markets_locations`. Call it `northern_markets`. (It should include the same columns as `farmers_markets_locations`.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "northern_markets = ...\n", + "northern_markets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q6_6.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 6.7.**
Make a table of the farmers' markets in Berkeley, California. (It should include the same columns as `farmers_markets_locations`.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "berkeley_markets = ...\n", + "berkeley_markets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q6_7.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Summary\n", + "\n", + "For your reference, here's a table of all the functions and methods we saw in this lab.\n", + "\n", + "|Name|Example|Purpose|\n", + "|-|-|-|\n", + "|`Table`|`Table()`|Create an empty table, usually to extend with data|\n", + "|`Table.read_table`|`Table.read_table(\"my_data.csv\")`|Create a table from a data file|\n", + "|`with_columns`|`tbl = Table().with_columns(\"N\", np.arange(5), \"2*N\", np.arange(0, 10, 2))`|Create a copy of a table with more columns|\n", + "|`column`|`tbl.column(\"N\")`|Create an array containing the elements of a column|\n", + "|`sort`|`tbl.sort(\"N\")`|Create a copy of a table sorted by the values in a column|\n", + "|`where`|`tbl.where(\"N\", are.above(2))`|Create a copy of a table with only the rows that match some *predicate*|\n", + "|`num_rows`|`tbl.num_rows`|Compute the number of rows in a table|\n", + "|`num_columns`|`tbl.num_columns`|Compute the number of columns in a table|\n", + "|`select`|`tbl.select(\"N\")`|Create a copy of a table with only some of the columns|\n", + "|`drop`|`tbl.drop(\"2*N\")`|Create a copy of a table without some of the columns|\n", + "|`take`|`tbl.take(np.arange(0, 6, 2))`|Create a copy of the table with only the rows whose indices are in the given array|\n", + "\n", + "
\n", + "\n", + "Congratulations, you're done with lab 3! Be sure to \n", + "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", + "- **Review the notebook one last time, we will be grading the final state of your notebook after the deadline**,\n", + "- **Save and Checkpoint** from the `File` menu," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab03.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/1/lab03/lab03.ipynb b/materials/x18/lab/1/lab03/lab03.ipynb index b6c7bc4..db5a783 100644 --- a/materials/x18/lab/1/lab03/lab03.ipynb +++ b/materials/x18/lab/1/lab03/lab03.ipynb @@ -16,7 +16,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -37,13 +39,15 @@ "\n", "In most data science applications, we have data about many entities, but we also have several kinds of data about each entity.\n", "\n", - "For example, in the cell below we have two arrays. 
The first one contains the world population in each year (as [estimated](http://www.census.gov/population/international/data/worldpop/table_population.php) by the US Census Bureau), and the second contains the years themselves (in order, so the first elements in the population and the years arrays correspond)." + "For example, in the cell below we have two arrays. The first one contains the world population in each year (estimated by the US Census Bureau), and the second contains the years themselves. These elements are in order, so the year and the world population for that year have the same index in their corresponding arrays." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "population_amounts = Table.read_table(\"world_population.csv\").column(\"Population\")\n", @@ -75,7 +79,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "population = Table().with_columns(\n", @@ -104,7 +110,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "top_10_movie_ratings = make_array(9.2, 9.2, 9., 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.8)\n", @@ -129,7 +137,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_1.py')" @@ -150,7 +160,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "imdb = ...\n", @@ -160,7 +172,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_2.py')" @@ -195,7 +209,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Run this 
cell to recreate the table\n", @@ -216,7 +232,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "my_flower = ...\n", @@ -226,7 +244,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_1.py')" @@ -242,7 +262,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Use the method .with_row(...) to create a new table that includes my_flower \n", @@ -261,7 +283,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_2.py')" @@ -280,7 +304,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "imdb.column(\"Rating\")" @@ -300,7 +326,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "highest_rating = ...\n", @@ -310,7 +338,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q4_1.py')" @@ -326,7 +356,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "imdb.sort(\"Rating\")" @@ -342,7 +374,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "imdb.sort(\"Rating\", descending=True)" @@ -369,7 +403,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "imdb_by_year = ...\n", @@ -379,7 +415,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + 
"collapsed": true + }, "outputs": [], "source": [ "check('tests/q4_2.py')" @@ -397,7 +435,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "earliest_movie_title = ...\n", @@ -407,7 +447,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q4_3.py')" @@ -426,7 +468,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "forties = imdb.where('Decade', are.equal_to(1940))\n", @@ -449,7 +493,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "average_rating_in_forties = ...\n", @@ -459,7 +505,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_1.py')" @@ -484,7 +532,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "ninety_nine = ...\n", @@ -494,7 +544,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_2.py')" @@ -528,7 +580,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "really_highly_rated = ...\n", @@ -538,7 +592,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_3.py')" @@ -556,7 +612,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "average_20th_century_rating = ...\n", @@ -568,7 +626,9 @@ { "cell_type": "code", "execution_count": 
null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_4.py')" @@ -584,7 +644,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "num_movies_in_dataset = imdb.num_rows\n", @@ -603,7 +665,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "proportion_in_20th_century = ...\n", @@ -615,7 +679,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_5.py')" @@ -638,6 +704,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -649,7 +716,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_6.py')" @@ -665,7 +734,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "year_population_crossed_6_billion = ...\n", @@ -675,7 +746,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_7.py')" @@ -696,7 +769,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "farmers_markets = ...\n", @@ -706,7 +781,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q6_1.py')" @@ -726,7 +803,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "num_farmers_markets_columns = ...\n", @@ -736,7 +815,9 @@ { "cell_type": "code", 
"execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q6_2.py')" @@ -760,7 +841,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "farmers_markets_locations = ...\n", @@ -770,7 +853,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q6_3.py')" @@ -793,6 +878,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -805,7 +891,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q6_4.py')" @@ -825,7 +913,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "farmers_markets_without_fmid = ...\n", @@ -835,7 +925,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q6_5.py')" @@ -858,7 +950,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "northern_markets = ...\n", @@ -868,7 +962,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q6_6.py')" @@ -884,7 +980,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "berkeley_markets = ...\n", @@ -894,7 +992,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q6_7.py')" @@ -933,7 +1033,9 @@ { "cell_type": "code", "execution_count": 
null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For your convenience, you can run this cell to run all the tests at once!\n", @@ -961,10 +1063,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/1/lab04/.ipynb_checkpoints/lab04-checkpoint.ipynb b/materials/x18/lab/1/lab04/.ipynb_checkpoints/lab04-checkpoint.ipynb new file mode 100644 index 0000000..956c7c6 --- /dev/null +++ b/materials/x18/lab/1/lab04/.ipynb_checkpoints/lab04-checkpoint.ipynb @@ -0,0 +1,1128 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Functions and Visualizations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Welcome to lab 4! This week, we'll learn about functions and the table method `apply` from [Section 8.1](https://www.inferentialthinking.com/chapters/08/1/applying-a-function-to-a-column.html). We'll also learn about visualization from [Chapter 7](https://www.inferentialthinking.com/chapters/07/visualization.html).\n", + "\n", + "First, set up the tests and imports by running the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from datascience import *\n", + "\n", + "# These lines set up graphing capabilities.\n", + "import matplotlib\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "plt.style.use('fivethirtyeight')\n", + "import warnings\n", + "warnings.simplefilter('ignore', FutureWarning)\n", + "\n", + "from ipywidgets import interact, interactive, fixed, interact_manual\n", + "import ipywidgets as widgets\n", + "\n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Functions and CEO Incomes\n", + "\n", + "Let's start with a real data analysis task. We'll look at the 2015 compensation of CEOs at the 100 largest companies in California. The data were compiled for a Los Angeles Times analysis [here](http://spreadsheets.latimes.com/california-ceo-compensation/), and ultimately came from [filings](https://www.sec.gov/answers/proxyhtf.htm) mandated by the SEC from all publicly-traded companies. Two companies have two CEOs, so there are 102 CEOs in the dataset.\n", + "\n", + "We've copied the data in raw form from the LA Times page into a file called `raw_compensation.csv`. (The page notes that all dollar amounts are in millions of dollars.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "raw_compensation = Table.read_table('raw_compensation.csv')\n", + "raw_compensation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.1.**
We want to compute the average of the CEOs' pay. Try running the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "np.average(raw_compensation.column(\"Total Pay\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should see an error. Let's examine why this error occurred by looking at the values in the \"Total Pay\" column. Use the `type` function and set `total_pay_type` to the type of the first value in the \"Total Pay\" column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "total_pay_type = ...\n", + "total_pay_type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.2.**
You should have found that the values in the \"Total Pay\" column are strings (text). It doesn't make sense to take the average of the text values, so we need to convert them to numbers if we want to do this. Extract the first value in the \"Total Pay\" column. It's Mark Hurd's pay in 2015, in *millions* of dollars. Call it `mark_hurd_pay_string`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mark_hurd_pay_string = ...\n", + "mark_hurd_pay_string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.3.**
Convert `mark_hurd_pay_string` to a number of *dollars*. The string method `strip` will be useful for removing the dollar sign; it removes a specified character from the start or end of a string. For example, the value of `\"100%\".strip(\"%\")` is the string `\"100\"`. You'll also need the function `float`, which converts a string that looks like a number to an actual number. Last, remember that the answer should be in dollars, not millions of dollars." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mark_hurd_pay = ...\n", + "mark_hurd_pay" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To compute the average pay, we need to do this for every CEO. But that looks like it would involve copying this code 102 times.\n", + "\n", + "This is where functions come in. First, we'll define a new function, giving a name to the expression that converts \"total pay\" strings to numeric values. Later in this lab we'll see the payoff: we can call that function on every pay string in the dataset at once.\n", + "\n", + "**Question 1.4.**
Copy the expression you used to compute `mark_hurd_pay` as the `return` expression of the function below, but replace the specific `mark_hurd_pay_string` with the generic `pay_string` name specified in the first line of the `def` statement.\n", + "\n", + "*Hint*: When dealing with functions, you should generally not be referencing any variable outside of the function. Usually, you want to be working with the arguments that are passed into it, such as `pay_string` for this function. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "def convert_pay_string_to_number(pay_string):\n", + " \"\"\"Converts a pay string like '$100' (in millions) to a number of dollars.\"\"\"\n", + " return ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running that cell doesn't convert any particular pay string. Instead, it creates a function called `convert_pay_string_to_number` that can convert any pay string with the right format (an amount in millions of dollars) to a number of dollars.\n", + "\n", + "We can call our function just like we call the built-in functions we've seen. It takes one argument, a string, and it returns a number."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "convert_pay_string_to_number('$42')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "convert_pay_string_to_number(mark_hurd_pay_string)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# We can also compute Safra Catz's pay in the same way:\n", + "convert_pay_string_to_number(raw_compensation.where(\"Name\", are.containing(\"Safra\")).column(\"Total Pay\").item(0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So, what have we gained by defining the `convert_pay_string_to_number` function? \n", + "Well, without it, we'd have to copy that `10**6 * float(pay_string.strip(\"$\"))` stuff each time we wanted to convert a pay string. Now we just call a function whose name says exactly what it's doing.\n", + "\n", + "Soon, we'll see how to apply this function to every pay string in a single expression. First, let's take a brief detour and introduce `interact`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using `interact`\n", + "\n", + "We've included a nifty function called `interact` that allows you to\n", + "call a function with different arguments.\n", + "\n", + "To use it, call `interact` with the function you want to interact with as the\n", + "first argument, then specify a default value for each argument of the original\n", + "function like so:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "_ = interact(convert_pay_string_to_number, pay_string='$42')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can now change the value in the textbox to automatically call\n", + "`convert_pay_string_to_number` with the argument you enter in the `pay_string`\n", + "textbox. For example, entering in `'$49'` in the textbox will display the result of\n", + "running `convert_pay_string_to_number('$49')`. Neat!\n", + "\n", + "Note that we'll never ask you to write the `interact` function calls yourself as\n", + "part of a question. However, we'll include it here and there where it's helpful\n", + "and you'll probably find it useful to use yourself.\n", + "\n", + "Now, let's continue on and write more functions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Defining functions\n", + "\n", + "Let's write a very simple function that converts a proportion to a percentage by multiplying it by 100. For example, the value of `to_percentage(.5)` should be the number 50. (No percent sign.)\n", + "\n", + "A function definition has a few parts.\n", + "\n", + "##### `def`\n", + "It always starts with `def` (short for **def**ine):\n", + "\n", + " def\n", + "\n", + "##### Name\n", + "Next comes the name of the function. 
Let's call our function `to_percentage`.\n", + " \n", + " def to_percentage\n", + "\n", + "##### Signature\n", + "Next comes something called the *signature* of the function. This tells Python how many arguments your function should have, and what names you'll use to refer to those arguments in the function's code. `to_percentage` should take one argument, and we'll call that argument `proportion` since it should be a proportion.\n", + "\n", + " def to_percentage(proportion)\n", + "\n", + "We put a colon after the signature to tell Python it's over.\n", + "\n", + " def to_percentage(proportion):\n", + "\n", + "##### Documentation\n", + "Functions can do complicated things, so you should write an explanation of what your function does. For small functions, this is less important, but it's a good habit to learn from the start. Conventionally, Python functions are documented by writing a triple-quoted string:\n", + "\n", + " def to_percentage(proportion):\n", + " \"\"\"Converts a proportion to a percentage.\"\"\"\n", + " \n", + " \n", + "##### Body\n", + "Now we start writing code that runs when the function is called. This is called the *body* of the function. We can write anything we could write anywhere else. First let's give a name to the number we multiply a proportion by to get a percentage.\n", + "\n", + " def to_percentage(proportion):\n", + " \"\"\"Converts a proportion to a percentage.\"\"\"\n", + " factor = 100\n", + "\n", + "##### `return`\n", + "The special instruction `return` in a function's body tells Python to make the value of the function call equal to whatever comes right after `return`. 
We want the value of `to_percentage(.5)` to be the proportion .5 times the factor 100, so we write:\n", + "\n", + " def to_percentage(proportion):\n", + " \"\"\"Converts a proportion to a percentage.\"\"\"\n", + " factor = 100\n", + " return proportion * factor\n", + "Note that `return` inside a function gives the function a value, while `print`, which we have used before, is a function which has no `return` value and just prints a certain value out to the console. The two are very different. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.1.**
Define `to_percentage` in the cell below. Call your function to convert the proportion .2 to a percentage. Name that percentage `twenty_percent`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "def ...\n", + " \"\"\" ... \"\"\"\n", + " ... = ...\n", + " return ...\n", + "\n", + "twenty_percent = ...\n", + "twenty_percent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like the built-in functions, you can use named values as arguments to your function.\n", + "\n", + "**Question 2.2.**
Use `to_percentage` again to convert the proportion named `a_proportion` (defined below) to a percentage called `a_percentage`.\n", + "\n", + "*Note:* You don't need to define `to_percentage` again! Just like other named things, functions stick around after you define them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "a_proportion = 2**(.5) / 2\n", + "a_percentage = ...\n", + "a_percentage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's something important about functions: the names assigned within a function body are only accessible within the function body. Once the function has returned, those names are gone. So even though you defined `factor = 100` inside `to_percentage` above and then called `to_percentage`, you cannot refer to `factor` anywhere except inside the body of `to_percentage`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# You should see an error when you run this. (If you don't, you might\n", + "# have defined factor somewhere above.)\n", + "factor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we've seen with the built-in functions, functions can also take strings (or arrays, or tables) as arguments, and they can return those things, too.\n", + "\n", + "**Question 2.3.**
Define a function called `disemvowel`. It should take a single string as its argument. (You can call that argument whatever you want.) It should return a copy of that string, but with all the characters that are vowels removed. (In English, the vowels are the characters \"a\", \"e\", \"i\", \"o\", and \"u\".)\n", + "\n", + "*Hint:* To remove all the \"a\"s from a string, you can use `that_string.replace(\"a\", \"\")`. The `.replace` method for strings returns another string, so you can call `replace` multiple times, one after the other. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def disemvowel(a_string):\n", + " ...\n", + " ...\n", + "\n", + "# An example call to your function. (It's often helpful to run\n", + "# an example call from time to time while you're writing a function,\n", + "# to see how it currently works.)\n", + "disemvowel(\"Can you read this without vowels?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Alternatively, you can use interact to call your function\n", + "_ = interact(disemvowel, a_string='Hello world')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Calls on calls on calls\n", + "Just as you write a series of lines to build up a complex computation, it's useful to define a series of small functions that build on each other. Since you can write any code inside a function's body, you can call other functions you've written.\n", + "\n", + "If a function is like a recipe, defining a function in terms of other functions is like having a recipe for cake telling you to follow another recipe to make the frosting, and another to make the sprinkles.
This makes the cake recipe shorter and clearer, and it avoids having a bunch of duplicated frosting recipes. It's a foundation of productive programming.\n", + "\n", + "For example, suppose you want to count the number of characters *that aren't vowels* in a piece of text. One way to do that is to remove all the vowels and count the size of the remaining string.\n", + "\n", + "**Question 2.4.**
Write a function called `num_non_vowels`. It should take a string as its argument and return a number. The number should be the number of characters in the argument string that aren't vowels.\n", + "\n", + "*Hint:* The function `len` takes a string as its argument and returns the number of characters in it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def num_non_vowels(a_string):\n", + " \"\"\"The number of characters in a string, minus the vowels.\"\"\"\n", + " ...\n", + "\n", + "# Try calling your function yourself to make sure the output is what\n", + "# you expect. You can also use the interact function if you'd like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Functions can also encapsulate code that *does things* rather than just computing values. For example, if you call `print` inside a function, and then call that function, something will get printed.\n", + "\n", + "The `movies_by_year` dataset in the textbook has information about movie sales in recent years. Suppose you'd like to display the year with the 5th-highest total gross movie sales, printed in a human-readable way. 
You might do this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "movies_by_year = Table.read_table(\"movies_by_year.csv\")\n", + "rank = 5\n", + "fifth_from_top_movie_year = movies_by_year.sort(\"Total Gross\", descending=True).column(\"Year\").item(rank-1)\n", + "print(\"Year number\", rank, \"for total gross movie sales was:\", fifth_from_top_movie_year)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After writing this, you realize you also wanted to print out the 2nd and 3rd-highest years. Instead of copying your code, you decide to put it in a function. Since the rank varies, you make that an argument to your function.\n", + "\n", + "**Question 2.5.**
Write a function called `print_kth_top_movie_year`. It should take a single argument, the rank of the year (like 2, 3, or 5 in the above examples). It should print out a message like the one above. It shouldn't have a `return` statement." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "def print_kth_top_movie_year(k):\n", + " # Our solution used 2 lines.\n", + " ...\n", + " ...\n", + "\n", + "# Example calls to your function:\n", + "print_kth_top_movie_year(2)\n", + "print_kth_top_movie_year(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# interact also allows you to pass in an array for a function argument. It will\n", + "# then present a dropdown menu of options.\n", + "_ = interact(print_kth_top_movie_year, k=np.arange(1, 10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Print is not the same as Return\n", + "The `print_kth_top_movie_year(k)` function prints the year with the `k`th-highest total gross movie sales! However, since we did not return any value in this function, we cannot use that value after we call it. Let's look at an example of a function that prints a value but does not return it."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def print_number_five():\n", + " print(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_number_five()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, if we try to use the output of `print_number_five()`, we see that we get an error when we try to add the number 5 to it!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_number_five_output = print_number_five()\n", + "print_number_five_output + 5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It may seem that `print_number_five()` is returning a value, 5. In reality, it just displays the number 5 to you without giving you the actual value! If your function prints out a value without returning it and you try to use it, you will run into errors so be careful!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. `apply`ing functions\n", + "\n", + "Defining a function is a lot like giving a name to a value with `=`. 
In fact, a function is a value just like the number 1 or the text \"the\"!\n", + "\n", + "For example, we can make a new name for the built-in function `max` if we want:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "our_name_for_max = max\n", + "our_name_for_max(2, 6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The old name for `max` is still around:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "max(2, 6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Try just writing `max` or `our_name_for_max` (or the name of any other function) in a cell, and run that cell. Python will print out a (very brief) description of the function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "max" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why is this useful? Since functions are just values, it's possible to pass them as arguments to other functions. Here's a simple but not-so-practical example: we can make an array of functions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "make_array(max, np.average, are.equal_to)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.1.**
Make an array containing any 3 other functions you've seen. Call it `some_functions`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "some_functions = ...\n", + "some_functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Working with functions as values can lead to some funny-looking code. For example, see if you can figure out why this works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "make_array(max, np.average, are.equal_to).item(0)(4, -2, 7)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's a simpler example that's actually useful: the table method `apply`.\n", + "\n", + "`apply` calls a function many times, once on *each* element in a column of a table. It produces an array of the results. Here we use `apply` to convert every CEO's pay to a number, using the function you defined:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "raw_compensation.apply(convert_pay_string_to_number, \"Total Pay\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's an illustration of what that did:\n", + "\n", + "\"For\n", + "\n", + "Note that we didn't write something like `convert_pay_string_to_number()` or `convert_pay_string_to_number(\"Total Pay\")`. The job of `apply` is to call the function we give it, so instead of calling `convert_pay_string_to_number` ourselves, we just write its name as an argument to `apply`.\n", + "\n", + "**Question 3.2.**
Using `apply`, make a table that's a copy of `raw_compensation` with one more column called \"Total Pay (\\$)\". It should be the result of applying `convert_pay_string_to_number` to the \"Total Pay\" column, as we did above, and creating a new table which is the old one, but with the \"Total Pay\" column redone. Call the new table `compensation`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "compensation = raw_compensation.with_column(\n", + " \"Total Pay ($)\",\n", + " ...\n", + "compensation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have the pay in numbers, we can compute things about them.\n", + "\n", + "**Question 3.3.**
Compute the average total pay of the CEOs in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "average_total_pay = ...\n", + "average_total_pay" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.4.**
Companies pay executives in a variety of ways: directly in cash; by granting stock or other \"equity\" in the company; or with ancillary benefits (like private jets). Compute the proportion of each CEO's pay that was cash. (Your answer should be an array of numbers, one for each CEO in the dataset.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "cash_proportion = ...\n", + "cash_proportion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check out the \"% Change\" column in `compensation`. It shows the percentage increase in the CEO's pay from the previous year. For CEOs with no previous year on record, it instead says \"(No previous year)\". The values in this column are *strings*, not numbers, so like the \"Total Pay\" column, it's not usable without a bit of extra work.\n", + "\n", + "Given your current pay and the percentage increase from the previous year, you can compute your previous year's pay. For example, if your pay is \\$100 this year, and that's an increase of 50% from the previous year, then your previous year's pay was $\\frac{\\$100}{1 + \\frac{50}{100}}$, or around \\$66.66.\n", + "\n", + "**Question 3.5.**
Create a new table called `with_previous_compensation`. It should be a copy of `compensation`, but with the \"(No previous year)\" CEOs filtered out, and with an extra column called \"2014 Total Pay ($)\". That column should have each CEO's pay in 2014.\n", + "\n", + "*Hint:* This question takes several steps, but each one is still something you've seen before. Take it one step at a time, using as many lines as you need. You can print out your results after each step to make sure you're on the right track.\n", + "\n", + "*Hint 2:* You'll need to define a function. You can do that just above your other code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "# Definition to turn percent to number\n", + "def percent_string_to_num(percent_string):\n", + " return ...\n", + "\n", + "# Compensation table where there is a previous year\n", + "having_previous_year = ...\n", + "\n", + "# Get the percent changes as numbers instead of strings\n", + "percent_changes = ...\n", + "\n", + "# Calculate the previous years pay\n", + "previous_pay = ...\n", + "\n", + "# Put the previous pay column into the compensation table\n", + "with_previous_compensation = ...\n", + "\n", + "with_previous_compensation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.6.**
What was the average pay of these CEOs in 2014?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "average_pay_2014 = ...\n", + "average_pay_2014" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_6.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Histograms\n", + "Earlier, we computed the average pay among the CEOs in our 102-CEO dataset. The average doesn't tell us everything about the amounts CEOs are paid, though. Maybe just a few CEOs make the bulk of the money, even among these 102.\n", + "\n", + "We can use a *histogram* to display more information about a set of numbers. The table method `hist` takes a single argument, the name of a column of numbers. It produces a histogram of the numbers in that column.\n", + "\n", + "**Question 4.1.**
Make a histogram of the pay of the CEOs in `compensation`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 4.2.**
Looking at the histogram, how many CEOs made more than \\$30 million? (Answer the question by filling in your answer manually. You'll have to do a bit of arithmetic; feel free to use Python as a calculator.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "num_ceos_more_than_30_million = ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 4.3.**
Answer the same question with code. *Hint:* Use the table method `where` and the property `num_rows`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "num_ceos_more_than_30_million_2 = ...\n", + "num_ceos_more_than_30_million_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q4_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great job! :D You're finished with lab 4! Be sure to...\n", + "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", + "- **Review the notebook one last time, we will be grading the final state of your notebook after the deadline**,\n", + "- **Save and Checkpoint** from the `File` menu," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab04.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/1/lab04/lab04.ipynb b/materials/x18/lab/1/lab04/lab04.ipynb index 4be7dd6..956c7c6 100644 --- 
a/materials/x18/lab/1/lab04/lab04.ipynb +++ b/materials/x18/lab/1/lab04/lab04.ipynb @@ -19,7 +19,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -53,7 +55,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "raw_compensation = Table.read_table('raw_compensation.csv')\n", @@ -70,7 +74,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "np.average(raw_compensation.column(\"Total Pay\"))" @@ -86,7 +92,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "total_pay_type = ...\n", @@ -96,7 +104,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_1.py')" @@ -112,7 +122,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "mark_hurd_pay_string = ...\n", @@ -122,7 +134,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_2.py')" @@ -138,7 +152,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "mark_hurd_pay = ...\n", @@ -148,7 +164,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_3.py')" @@ -171,6 +189,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -183,7 +202,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, 
+ "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_4.py')" @@ -201,7 +222,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "convert_pay_string_to_number('$42')" @@ -210,7 +233,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "convert_pay_string_to_number(mark_hurd_pay_string)" @@ -219,7 +244,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# We can also compute Safra Catz's pay in the same way:\n", @@ -253,7 +280,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "_ = interact(convert_pay_string_to_number, pay_string='$42')" @@ -339,6 +368,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -355,7 +385,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_1.py')" @@ -375,7 +407,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "a_proportion = 2**(.5) / 2\n", @@ -386,7 +420,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_2.py')" @@ -402,7 +438,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# You should see an error when you run this. 
(If you don't, you might\n", @@ -424,7 +462,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def disemvowel(a_string):\n", @@ -440,7 +480,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Alternatively, you can use interact to call your function\n", @@ -450,7 +492,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_3.py')" @@ -475,7 +519,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def num_non_vowels(a_string):\n", @@ -489,7 +535,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_4.py')" @@ -507,7 +555,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "movies_by_year = Table.read_table(\"movies_by_year.csv\")\n", @@ -529,6 +579,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -546,7 +597,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# interact also allows you to pass in an array for a function argument. It will\n", @@ -557,12 +610,67 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_5.py')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Print is not the same as Return\n", + "The `print_kth_top_movie_year(k)` function prints the total gross movie sales for the year that was provided! 
However, since we did not return any value in this function, we cannot use its result after we call it. Let's look at an example of a function that prints a value but does not return it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def print_number_five():\n", + " print(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_number_five()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, if we try to use the output of `print_number_five()`, we get an error when we try to add the number 5 to it, because a function without a `return` statement gives back `None`!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_number_five_output = print_number_five()\n", + "print_number_five_output + 5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It may seem that `print_number_five()` is returning a value, 5. In reality, it just displays the number 5 to you without giving you the actual value! If your function prints out a value without returning it and you try to use it, you will run into errors, so be careful!"
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -577,7 +685,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "our_name_for_max = max\n", @@ -594,7 +704,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "max(2, 6)" @@ -610,7 +722,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "max" @@ -626,7 +740,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "make_array(max, np.average, are.equal_to)" @@ -642,7 +758,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "some_functions = ...\n", @@ -652,7 +770,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_1.py')" @@ -668,7 +788,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "make_array(max, np.average, are.equal_to).item(0)(4, -2, 7)" @@ -686,7 +808,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "raw_compensation.apply(convert_pay_string_to_number, \"Total Pay\")" @@ -708,7 +832,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "compensation = raw_compensation.with_column(\n", @@ -720,7 +846,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_2.py')" @@ -738,7 +866,9 @@ { "cell_type": "code", 
"execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "average_total_pay = ...\n", @@ -748,7 +878,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_3.py')" @@ -764,7 +896,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "cash_proportion = ...\n", @@ -774,7 +908,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_4.py')" @@ -803,17 +939,31 @@ }, "outputs": [], "source": [ - "# For reference, our solution involved more than just this one line of code\n", - "...\n", + "# Definition to turn percent to number\n", + "def percent_string_to_num(percent_string):\n", + " return ...\n", + "\n", + "# Compensation table where there is a previous year\n", + "having_previous_year = ...\n", "\n", + "# Get the percent changes as numbers instead of strings\n", + "percent_changes = ...\n", + "\n", + "# Calculate the previous years pay\n", + "previous_pay = ...\n", + "\n", + "# Put the previous pay column into the compensation table\n", "with_previous_compensation = ...\n", + "\n", "with_previous_compensation" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_5.py')" @@ -829,7 +979,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "average_pay_2014 = ...\n", @@ -839,7 +991,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_6.py')" @@ -860,7 +1014,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + 
"collapsed": true + }, "outputs": [], "source": [ "..." @@ -876,7 +1032,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "num_ceos_more_than_30_million = ..." @@ -892,7 +1050,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "num_ceos_more_than_30_million_2 = ...\n", @@ -902,7 +1062,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q4_3.py')" @@ -928,7 +1090,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For your convenience, you can run this cell to run all the tests at once!\n", @@ -956,10 +1120,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/2/.DS_Store b/materials/x18/lab/2/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..894b1eedb037356b79646dcd96d3c97eb5981c21 GIT binary patch literal 6148 zcmeHK%}yIJ5Vk`ACkQnMB;e*tB@Q|4W+_6dkX9)@aI3Ucd!SX=ZKAT$vPQ|K5rkBG z;U&rw@G4w-?1S_HV8$LGe|xG_p~xeRzp>|=-S|^%uNh;!*>Agy)fi&}9I;%5W{zPV zea1>OQUT=p8O~9nzD0?OX1I31Dqt0Oc?#g#U1kXjS;QPRzJ75_cniLBuTB5-XQn&0 z(vf*v%!~%TbJ`Q`qVpu~WsdWQmzK)QE2~0OMRmRUH8@p+APcj8uNiiavG*a&P9usx zDwRCNVe~2P3|h5~cPh=oIPG+Gaop|z<;&+dZL2|3_0x7wKPNB=Au6KMs%;O4jmExL zm!3DO%c1LfPF?PLd!tcBY`%66j!w=lE{9j6pTG2c8HasM+bhCHxPf3yjUPcbO;q|F zMzP{K)0otWZ$fsyqTUqqx{$d$#r#{yd^5=`g@w$$Ddz7fCUmVWm;&_Q7HbPOFFPHp zfK}k%6~O&Lf+PBbxk0^kK%=h!fMpog1~GmPV2vd733G#x0pq3=(3EnzVsKLq=dq$2!2Hcne$`IRT0CFgj$j!{oX6mPH}bB|4pBh6QeMg;&hCvMul* z6_B$#!3ZlZDB zC7$tudwnA+juu>Jz{mm*O?EUQ<70!}LqT4KGYrX?lk-ki)?~RGvNqR@8>|vft^Chc 
zeph!bd#{?A(RnMALo-vr6fgz$L;==pvAE~aT2sIjFa_2M$oCcSPnc{<}s$V&s(9z7i{Ts~Y_*@YX5)73eDqUms{ zM{7+1Q=qB9fgkpz{$Kol|8J6PWD1x9d!>NuOs*$mZYk8(t<6cTP3U)YG0Cevt|{zj gDQ2#e;$ylRLX~5beBQ*OFP#LT-1^!flA9mSTe*gdg literal 0 HcmV?d00001 diff --git a/materials/x18/lab/2/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb b/materials/x18/lab/2/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb new file mode 100644 index 0000000..69c51bb --- /dev/null +++ b/materials/x18/lab/2/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb @@ -0,0 +1,729 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 1: Simulations\n", + "\n", + "Welcome to Lab 1 of Data 8.2x! \n", + "\n", + "We will go over [iteration and simulations](https://www.inferentialthinking.com/chapters/10/sampling-and-empirical-distributions.html), as well as introduce the concept of [randomness](https://www.inferentialthinking.com/chapters/09/randomness.html).\n", + "\n", + "The data used in this lab will contain salary data and other statistics for basketball players from the 2014-2015 NBA season. This data was collected from the following sports analytic sites: [Basketball Reference](http://www.basketball-reference.com) and [Spotrac](http://www.spotrac.com).\n", + "\n", + "First, set up the tests and imports by running the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from datascience import *\n", + "\n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Nachos and Conditionals" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Python, Boolean values can either be `True` or `False`. We get Boolean values when using comparison operators such as `<` (less than), `>` (greater than), and `==` (equal to). 
A list of common comparison operators can be found below!\n", + "\n", + "<img src=\"comparisons.png\">" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "3 > 1 + 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can even assign the result of a comparison operation to a variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "result = 10 / 2 == 5\n", + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Arrays are compatible with comparison operators. The output is an array of Boolean values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "make_array(1, 5, 7, 8, 3, -1) > 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Waiting on the dining table just for you is a hot bowl of nachos! Let's say that whenever you take a nacho, it will have cheese, salsa, both, or neither (just a plain tortilla chip). \n", + "\n", + "Using the function call `np.random.choice(array_name)`, let's simulate taking nachos from the bowl at random. Start by running the cell below several times, and observe how the results change." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "nachos = make_array('cheese', 'salsa', 'both', 'neither')\n", + "np.random.choice(nachos)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.1**
Assume we took ten nachos at random, and stored the results in an array called `ten_nachos` as done below. Find the number of nachos with only cheese using code (do not hardcode the answer). \n", + "\n", + "*Hint:* Our solution involves a comparison operator and the `np.count_nonzero` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "ten_nachos = make_array('neither', 'cheese', 'both', 'both', 'cheese', 'salsa', 'both', 'neither', 'cheese', 'both')\n", + "number_cheese = ...\n", + "number_cheese" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Conditional Statements**\n", + "\n", + "A conditional statement is made up of many lines that allow Python to choose from different alternatives based on whether some condition is true.\n", + "\n", + "Here is a basic example.\n", + "\n", + "```\n", + "def sign(x):\n", + "    if x > 0:\n", + "        return 'Positive'\n", + "```\n", + "\n", + "The function works as follows: if the input `x` is greater than `0`, it returns the string `'Positive'`.\n", + "\n", + "If we want to test multiple conditions at once, we use the following general format.\n", + "\n", + "```\n", + "if <if expression>:\n", + "    <if body>\n", + "elif <elif expression 0>:\n", + "    <elif body 0>\n", + "elif <elif expression 1>:\n", + "    <elif body 1>\n", + "...\n", + "else:\n", + "    <else body>\n", + "```\n", + "\n", + "Only one of the bodies will ever be executed. Each `if` and `elif` expression is evaluated and considered in order, starting at the top. As soon as a true value is found, the corresponding body is executed, and the rest of the statement is skipped. If none of the `if` or `elif` expressions are true, then the `else` body is executed. For more examples and explanation, refer to [Section 9.1](https://www.inferentialthinking.com/chapters/09/1/conditional-statements.html)."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.2**
Complete the following conditional statement so that the string `'More please'` is assigned to `say_please` if the number of nachos with cheese in `ten_nachos` is less than `5`.\n", + "*Hint*: You should not have to reference the variable `ten_nachos`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "say_please = '?'\n", + "\n", + "if ...:\n", + " say_please = 'More please'\n", + " \n", + "say_please" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.3**
Write a function called `nacho_reaction` that returns a string based on the type of nacho passed in as an argument. From top to bottom, the conditions should correspond to: `'cheese'`, `'salsa'`, `'both'`, `'neither'`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "def nacho_reaction(nacho):\n", + " if ...:\n", + " return 'Cheesy!'\n", + " # next condition should return 'Spicy!'\n", + " ...\n", + " # next condition should return 'Wow!'\n", + " ...\n", + " # next condition should return 'Meh.'\n", + " ...\n", + "\n", + "spicy_nacho = nacho_reaction('salsa')\n", + "spicy_nacho" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.4**
Add a column `'Reactions'` to the table `ten_nachos_reactions` that consists of reactions for each of the nachos in `ten_nachos`. \n", + "\n", + "*Hint:* Use the `apply` method. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "ten_nachos_reactions = Table().with_column('Nachos', ten_nachos)\n", + "...\n", + "ten_nachos_reactions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.5**
Using code, find the number of `'Wow!'` reactions for the nachos in `ten_nachos_reactions`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "number_wow_reactions = ...\n", + "number_wow_reactions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Simulations and For Loops\n", + "Using a `for` statement, we can perform a task multiple times. This is known as iteration. Here, we'll simulate drawing different suits from a deck of cards. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "suits = make_array(\"♤\", \"♡\", \"♢\", \"♧\")\n", + "\n", + "draws = make_array()\n", + "\n", + "repetitions = 6\n", + "\n", + "for i in np.arange(repetitions):\n", + " draws = np.append(draws, np.random.choice(suits))\n", + "\n", + "draws" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The unrolled version of this `for` loop can be found below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "draws = make_array()\n", + "\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "\n", + "draws" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the example above, the `for` loop appends a random draw to the `draws` array for every number in `np.arange(repetitions)`. 
\n", + "\n", + "Here's a nice way to think of what we did above. We had a deck of 4 cards of different suits, we randomly drew one card, saw the suit, kept track of it in `draws`, and put the card back into the deck. We repeated this for a total of 6 times without having to repeat code, thanks to the `for` loop. We simulated this experiment using a `for` loop. \n", + "\n", + "Another use of iteration is to loop through a set of values. For instance, we can print out all of the colors of the rainbow.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "rainbow = make_array(\"red\", \"orange\", \"yellow\", \"green\", \"blue\", \"indigo\", \"violet\")\n", + "\n", + "for color in rainbow:\n", + " print(color)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the indented part of the `for` loop, known as the body, is executed once for each item in `rainbow`. Note that the name `color` is arbitrary; we could easily have named it something else. The important thing is we stay consistent throughout the for loop. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for another_name in rainbow:\n", + " print(another_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In general, however, we would like the variable name to be somewhat informative. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.1**
Clay is playing darts. His dartboard contains ten equal-sized zones with point values from 1 to 10. Write code that simulates his total score after 1000 dart tosses. Make sure to use a `for` loop.\n", + "\n", + "*Hint:* There are three steps to this problem (and most simulations): \n", + "1. Deciding the possible values you can take in the experiment (point values in this case)\n", + "2. Running through the experiment a certain number of times (running through 1000 dart tosses, and randomly getting a value per toss in this case)\n", + "3. Keeping track of the information from each run of the experiment (the total score in this case)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "possible_point_values = ...\n", + "tosses = 1000\n", + "total_score = ...\n", + "\n", + "total_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.2**
In the following cell, we've loaded the text of _Pride and Prejudice_ by Jane Austen, split it into individual words, and stored these words in an array. Using a `for` loop, assign `longer_than_five` to the number of words in the novel that are more than 5 letters long.\n", + "\n", + "*Hint*: You can find the number of letters in a word with the `len` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "austen_string = open('Austen_PrideAndPrejudice.txt', encoding='utf-8').read()\n", + "p_and_p_words = np.array(austen_string.split())\n", + "\n", + "longer_than_five = ...\n", + "\n", + "# a for loop would be useful here\n", + "\n", + "\n", + "longer_than_five" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.3**
Using simulation with 10,000 trials, assign `chance_of_all_different` to an estimate of the chance that if you pick three words from Pride and Prejudice uniformly at random (with replacement), they all have different lengths. \n", + "\n", + "*Hint*: Remember that `!=` only checks for non-equality between two items, not three. However, you can use `!=` more than once in the same line. \n", + "\n", + "For example, `2 != 3 != 4` first checks for non-equality between `2` and `3`, then `3` and `4`, but NOT `2` and `4`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "trials = 10000\n", + "different = ...\n", + "\n", + "for ... in ...:\n", + " ...\n", + "\n", + "chance_of_all_different = ...\n", + "\n", + "chance_of_all_different" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Finding Probabilities\n", + "After a long day of class, Clay decides to go to Crossroads for dinner. Today's menu has Clay's four favorite foods: enchiladas, hamburgers, pizza, and spaghetti. However, each dish has a 30% chance of running out before Clay can get to Crossroads." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.1**
What is the probability that Clay will be able to eat pizza at Crossroads?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pizza_prob = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.2**
What is the probability that Clay will be able to eat all four of these foods at Crossroads?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "all_prob = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.3**
What is the probability that Crossroads will have run out of something before Clay can get there?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "something_is_out = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make up for their unpredictable food supply, Crossroads decides to hold a contest for some free Cal Dining swag. There is a bag with two red marbles, two green marbles, and two blue marbles. Clay has to draw three marbles separately. In order to win, all three of these marbles must be of different colors." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.4**
What is the probability of Clay winning the contest?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "winning_prob = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.o import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab01.ipynb', glob.glob('tests/q*.py')))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/2/lab01/comparisons.png b/materials/x18/lab/2/lab01/comparisons.png new file mode 100644 index 0000000000000000000000000000000000000000..4ea6dbaed8454c6ff819a7d4ab5105734479268c GIT binary patch literal 55882 zcmeFYWl&tf_AffP1Py@%ClK6%`#^vo!QCOaGq?`}A;E$r_z*0(LxMYj;O;&UT!&zT zb0_DV|GB5?Q=oEZ>qM;Uc0;3THR~)>eas;sje!Ik4u3I008h66=XF501P?+ z0HpyN6Zu33q{smP;IY`t$fzsI$k3^~gKX@btN{Rp$Rs^1eXV}7Oe0N2yAW&zoYu&R z(y1|2Yz0!M_b=krEtIk8y{$AlYqRO2NKHsJJUN3o1P`n=s%FD6NUcpc=G6TX(8s3X zt%%lB@b=0>rijOuU5qo8 zA3vj{#K^{LdtilEpvnGj`;{GW?r{SlT|(`y~C2$cV+Y05^cmwFy_A75wPRDoOLgfd`?v_=1_MfsULn5R3| zqD=wMjjuaM{CNuDZ`%m@jV;v>{Vq(4^fUM`t9u5UYjtHVArI_QcMV(-{TB%vu7wX- 
zo0qW9=?t_~T~ur;EckI+8)ZdC7l{fO1+M~iztB*lRSS0Oy$;N8N0lTV=pwbNEke|_kc;zGT=iWE_dtqjBxOV#8Mf8)-=V4O zI}SeiB~N&*u<3?7 za-3yJCqqF|c+$z?ZXN1pz9LP`><^IN#3A{i1FlvSulSr!(`|q{-%#`_Gw_vb1RHmm z0-Y+=YFv`i$-7^exAkw_EwFB@ZCx`8O1VXD!MV0@y(3K~pm?Uu=94Cx)Czqa?J*?^+y zj9(XC4(d}Ru|#Zkj{ejonc@jbzh^8T;% zyM#jqZ}E15sbsLeDLnwimT={~x4<4_%L7&H`mXL5IWOnsHhTt~DRm(dw( zxm{&bhEi3PtIyy~ELPSb*rlfje?hPq}V==C?6TNVyHUZy*=MG7g@mdoH;?M0}|BcnGZJX3#h@BW)KME zRLIDR8E_K-$X>az`5v;b-bx+(o!!sfzHK0kod#VBgTgg`2hFm3N>Hi522q*;jo|$cCb~EIiVYM5L8hfpy6keQ(+Z$Vz`nUqxq~F z@1k={5&B|X6QMtU?)g4shk+@A(IQTmAv1#Z@jKb)ZVI@c!>s5nWMhY9w-~GyM81*D zM^DkUGla*l_le};H+~S8YKoujV}2rIUqs;W-Y_RuU5%e8MwVnO(vC@#lvDQOIHMh- zVa$>A{`mS1#u>#zNSjmuPjIoh^3W5H4sjkqve0}B^3*5kF-aDr`U1M)0-cgzjQx(L z>NsIy+f|Mwl#y9iapLVYt>5#P7J-~?XqCNczx0l=dJBvsX2h-C}?0sVW@_mOJg&V*qrk|gGsYK=+ zvre)qu^O?eBq}AUzM<9m=%8|*&!qmn=(H$lN_3BP&t#8MKtO;`fV;NZfxfoUA;}?N zdZa``{YGK3V19hA28Un78OK@d@Yx~RA%2Ex{a36ptzEw}#xsg*oNMqkmRH!df>*4l z893HI$-mXV`C17-DlA_vyXQrWbPR6H9YqR%lK?AJ7wV)^vE?+(HY@`b;t!~OW~ge& zW7uf;vDVumW75AsWuk6^V0_E*sUv6{m~$5q!ZSqfgc-+^Rh{G(HC`+;RXww|W3`*O zdoW`^y*oZV<29`^!8ukwAyKy2*`2DZOu!f3nAjNSmvLpEI$qUb{?7bk*fhQ)?S(0* z!8ABis!|&iw<6H0cFl1|bmw%pd5wRKz;i_t3$4bS#1nk>g?ERhf^W%I805Qq4DJF^ zehu>ua|`3ctAA$7e;~jsh|Bl+SvilW`IPezKf%ipUQ_$P?uNnl=D%#|c@6B{r#3{C zQb_v-%yqu0e)CtxSLssOEJ#-|Qn?u_8-6?-HFS}>%vT5vfZC>3@`HHb)i0}`S@KMN zo%vPA_;X@X)2?yhBziM|-LcfF6hjwZceL!hOl0Q6Ozn)1^N!%Xpq6uqbH&1AX9?%; z&a;cAbux!b2k*Ugyi*P|0(=5K-TB|t5x*xkW$0!okzaDn7C{%?5StPef7VY$F7VCH zs;e(0mqRW;))2TiR8$%35-SpGL=6|V7ke#CDk31#>RsotC88%{=9Sa@zG=wK-qUZ> z0nX;Syy`z0vnJT%9Aflw@u?4?PjNs-KuY)rd1$X_9A&0xW_9LAiBW&zmUk^%z5)T@ zGjGJm>B{z9eO#zo-Kc7oZEm^zBRLTDUEl0b%6{{FeOKIO%2s`UWfyEJYv1hf;bn(- zFl`m*s(rTeTiw-)j+9riuO_lXU%}^B4h#+~JM~@lUF}^RBfJ$X`WE7Ea%^(aa%yuV z%w5d+e#Cwt`N7bY5)NB^=H~l_LRIa(e0Nzy!l#b)y?8 zoyj?4T_R9;zNC`!AS}u)3w!Q)l7$7mou=oQ-BjK9)ZjS^!rcGs0aRW}~Tjg7=t60Z+C(Y<> z@7y~l@nF#>(Y9%AY3#hs@9%%$n%75g8nXK*k!IHN?1R0PFD?%5zifPwXxvz;S(34E zcB{MGc(Trlp|$Yh;`3w0UTt`BX~Bg8L;j*wgwxkD9yKhQZ`=dU8|{6x8#kMJn~>CG 
zerHICZb-xGFVE)Pj|GyPc$~M+n7~k>T1#ueuBJWVDCYyQ<0w@qT6A^ z%RF(S+yN_DuVY_tz6NbGkAm0)*)ol8cT%d%9F(S%@)(84E;;5Jh|E|F=bHMg$Dyyy zl&4{=MDLRMK;H z19m07HWLBA=|2?(#=vawp=#KS^YADx~)^gwFO6=S2_VR~%!-8z8@L0`~i9FmXufyiHZP(_+fuuS06FU*2EJ zeis$b+c$XFBXwJOxFBFYySd!iOu0|BCkQt`VqJW6F@OL*NE{;eZ|D)V2Z$vzpN^A` zrqJln{Bwo7v+GPua~XNdHzh@bQnm-+Hse zK4y(&X^RW^8J-SZ8F=W~&2Db5XZf_nx9se5T`BEXdT~3O(p-wI!aqu^`-_?>nw__s z-pV};U({ECjQlMF;O&FpSw!1e=zTaft%PLeRpRZgJ*;LlZ#2owshuAjbL(F(S>JB< zBgs8U9tDpL`U$0D{}3>y1$b*5wjKVvq&ZEKXd0{;*#Y@ca+9vPgi;jB{gdYl7%#Y0 zy*zs&hb?v$2L<5eMm~Y^$+|(f)M9)!=EHGL`$G&dAdd;qrf_p}GlM`t=5hO1P*>-) zr!&-_M~Uq)>0vn7k_}-<_O+uZ4|s^#MoTOpmkv0t3I-kk02$+-14U8u#W8YKk!i23 z@2Rh%ENTgI<}$YeSy*%VI=dp@1^~o;MUl78)}H2czRpfA9-_Vy^#6E66nX#WF*iNk zKVI>4l%UsFQKypuxm(i-aPf1!qzB^C(b0*!TiJ+e%F6w#IP#way`870t0*_OkB<+R z4<8rE-Ikk2L_~!9B`-HGFDLR1P7gm9Pjg>R7Y~MiR`Ngf$Xa_?y4$;Y+Jju^{?u!3 z0rK*cpr`-S(7%8GXqXz2yFP-N>Tif1Zk}+xuEO>C4(X zTf2B5+W_+L@$icOqriWW{&$oAQPkjn7v<-F`JW~KgYvJE;@p2E_z#Kxsjh!KMd}NP zE6)9I-2-tW*gUe4-9cq9tEP>-q9Tt!za`|s@~;3T>(FDYzu$5 zglwtZJ`V<eT5d3w z_-|AxIu5`UJPRU+@!uu?@qUETYMH~dMTI}D2L2nnwJ8E2zgV>7jhk`217BN#}Z#QqiPAsU}7Gf6FV?_(Ztn#_{mMrwzU| z;ZWrFG3=|aAs4p*vko3=U{(}L;}fcP`^LCjsEP`P*|Mnde&IelT>@~&^<>c5dSr)) zwnhe1$}V(dc4v?5+)M7QV5K6q?^$Per{Oo|nvX*jgu9I^DerAwen1MEXb>{F(0)$R z+UycqcX+MJJR)MY{pEJ#wnBTnlDk!zx_K#^;OR8mcU$=%#4PNnjR8XYA0;GN%QaD1 zfsEGtj^$)tB}`SsrOAXL>xMwPq7@Mu;dH3K@3>R#YAz9At+wouFTwuls5i?cahw;h zkgfPtdYyxQYn4yC^~OzNJWe~|8f_Y{dqx+@2o*XN>QAa*@rqHsp>aM(AJNCe7fiXe zaxt_;><+&<_=lKs$R$|=F7Nyqm}@j_wmUz|YEFaQB&F+PmLaYO8xh%Iz zARoW4o5C6`<}ZPa4nN?br)-1ozxN9*GqrGkME&_JSiIyQl5h{-$hl?itq%X&h3`!j zz*l{9s?{k+4{tMjI2NB67catXOGh^aLg8tDEO!7ZwQ^KT0o>Ku{F!kq>i_oQNo~2# z+yaIhZINf0@2%sqkX>>tx~5j`M>3AR{JjuF3w^vr$nt!`*-m-@$%>=IB%38uIqzd{ z_8Ur929t}*=e9&jbRfijQ^3SxwXPe-zMxCRwXx<1T09Q)_-6GoB2{Q3-Gz81MJ2@I zk(7(c@2Z0GJ3-}Qw^9NJ-x=;}7=F4Zos+$z#NDsf60KK9q|1%%aRH8-d2AKrmF&qeJ#>0S%-}6>sShr*6RIy*;la|X{G7-j+ZQ#yK(L2Ji?WOSty77H znnV%%JBF{mHD1FODn{fb<+hAhCn1vp*U;09Vo9*J(Zg+|pwmimR?8=W?+xx@hTaW^ 
z)QiS>p4$ss8S>5TmlOQ*2|?D--2CY_S!eFz3TjH&WE1xV6zr=l^1ZEMFR*n%@NAmk zjB2Iai`Hc<7P93V>SLP)DXwVWZ1>Vo35Xe5x1mixYh|0iFjA6NF;jrt4+`EHvzqvY z6YPHZSgF1J;F1p=e+MaqOvG}xk3BW2-(Mwuz|jFJJr!t_rmkp|_9|%-iuMFf*Q~D- zp7&p%F}1d9-gz%i-JP5!Ar1+^zlY7L>@%Tm%!0qX6nV-rsUSY-!wx4HrdiG zZB1Q2EqL1HD(!XqQlzYU9p`UWDJn7ExwEMZ-72JHtZvBcaTVk&R-@AvtY_Kc}gY9-1-fiUiZ$d&d@B;i6@WF4o#4e@dk8X<; zn*~b}ytU1suw>_(lf?T{ad4JT?1I;k)k*GLZBMy4WVxx`4nb^2;}8@-mHpb$tEr&1 zzdb{*2W-qvt1M}C{rdh=>rnFgBb(k$t;^ox^zT*9*@0f6dmST0;C4W&mA9X_4MXkG znCXLG|5aVVG|eIGPVMe?eHJRP?QDcUUf}=sSGnkoY+H%RG3B(+ZERXbJ{>pr4%HXj zmQrj~v7ylIUT4D0<}{`-Fg-s{^o>|fR?Rtg`Z-8z>vKJaPc`eH$Y40^=u*0wU^Csl zhxJagIC-I_@kNWW-SzVu_pYq-`ILYm8Zbnc{CI$1wkXsQ^4qk~_Y@Gfi*`5paGn%e z27dVZ-P12zNyaawzUBHU2!Yf7EjiuY2u-h%^ZvFYH?ZnF(YS&rAct(@vGq_SAO5j zUFY!K4>ybGFxus6-8faSfb?N<2OGW@tY&;q9p*i|J)d)fDQ>@5ki!M4(KCB(N3i0u zW3_CsX^9WH&g~$euB?wuD=#Cvf8U(mnAXM)ck+`(nz8ny>>L_jhq47sw2HRnpqrm> zR8mjd4&}z(gyHFqn5t9pj!tU9KSl}*_N=L9%H&dSlit;@gc7gOwMK!#j3KhYoQ+LBqg-<88CDbf4#jljK zlf@p`J^Fz`1_iqIrUG3|~Z*#{5+cQ`iyQVrn!2ch4|; zmEf8X1lEch_3hH7@icifX?qYY@6UR)0(m@ycH-S`54CPv6E0fs>?4?T@+kQ}zqLBA zG}XTcq#wWSfIO)Owd$tq_6|cns$V#ilYexX&gzYw7OOk4dRZOgGI6U|?{})^m>JL> zxtW6jQz*PO#87*WgC1py98S6jrq-A1byYXTRf$jb0!+scyEWcr_n|b9VT1l&zXMSU zaldz>jiN)Un~16^rj7IvcAYV=WKF+TRQXNP5bZ}#WB?}(Gs;vYDNdbjXQjyf0aunH zJIUh78_B7N@RWVoX*Qb+9G%9RcAI6`c)+^PS!oi*#o5ipbgn94YTe5zNFL6d%1aal zP=W+I7x`?w)wuVvwi*X{Y1oYA>U)GyZ}f=}r|H?Cn z>zLQW5kY7~K&hDZ;5wlrw&M@21V=T#gv9IqctHbCn(QH<=EutP>eU$H0kb8r_j=CQ zI{dyD;;`XHXP39U+y^$mL@fECI8;hGU;K^YoY(TTftR75KvOd8euW1pVG+Gv8=)cm znKL~Gh`q+)5yLMPCO0dBO)}iddnISvRh%UA-*i|gp2Ufz1VN8ZRCFJ zho?{t^03;oQ-+ljF4GwBb2q zA$Rm*?$Nst(vqU(>6HEOc9$ub3CiHx=b{5aI<>J)o=#s|_a$)`XG{S%?rRbSb%lpp zyz9KnNz?$f$i*AQPF|i%CbdC43XVIdXBlRU7H^M4MK^pofJn`)S4YyJ=wSLt`^jGhflgVejeL&Chfyy*23KP3h08q?%JT z?p?5a`OiWA^`BTeFl~z{Z{y;q;(jizgw_xmA;Ly3(MAIw!hjhKrQMSepPdpVzM6;c zh_f&T@O_QWN@aeWWMwOJ&=ibtA;?IXZC}8lgpJvK`gm(?57y~W2iu`wQCM z*XE^6UZ3fmem@%;s&l1_xK{&B8J6?K?O4hKEy?CjmilgDga|oT9IeF!n21@(LKHTW 
zu5?KbCJVdbAU))5x>O(TGkdRre#)=2J7c4yVk+BYd`w`Pw)ZW1l_RD_l}?ECoMiXn zQNAwaA!BImR02hf*jAy06SzJa%ee90ssYc{5W zZ3QD^bex;ae(p<5-L%7ATcJ%!XnUA(Me#0g_p}q^Tn^~HlzGb9lTA5KcyHx?M*Bn2 zcV!>B_Ae#*=|^phQ}Ee(+P#tSK^c4AQ0$hsn8e3qH4N1CF3b|ZBUKaaWxw8SE;`JK zpoIfS)jY)Y$vli;hCQwZK?|>GGrlXSOZ$*i7R%sRqdiK=eSR;@uk>F|Q!pbImnu^&&`BRv<#&*9Xa z^_R%|&%N3hPh250Xq$vW9|9yC{klgJnhz4B04FaN_aI<%8sU9~yzr8PA# zMCcbiL6_9@9T+WI)YEqyY;xa}6nmG|@9-_lx0=M7383wf|aysa6eTJLU zGx26fh_~2zS50Tt{CQ{@uMGA#dmJ7CCMTgI<5MP_?r($l!CB}_wxoHnS6=()3@MrwevJWswHC4ZcIBr64rMU@vdsnq{5-z2{Z-YE0m08l zk7lPqnnm#3RYLdG)fW0-sSjkXv%>D%(`&*<(v3J$v3zLe#_rzwbYsh~s#Xnyzqw`P$!B|apiQ9UPTaG{v@qxkfK9B3X6Xc|SvnOKM z@O;Y0YPF?hA9|+p@)ZWCo?!;>1|ANwEqD;z20UNejkr(W8yW1XKc{`4g3Gyj|3w&> zD@cy;*N{{@;?>iSAG-q+V$?2WAv`DB&l^U(hJ5-{;`zb*oE{$>&;-w?Wxauu`1zud z;vMw~$8bHn?E-N!-JV+R+Jf~_!lkwxEY_KzB^v=1ahg^W z3n9Lk3$ZEtKL3?Wb}GhWoH~p_GIRTAM8MHKO?aEyjE{IggMH-A&TMVLg4bb@t_nqZ zKpU(mx%4C|DLQdG%_b`{HPu{;dO7*Zu1`oWT{piHo&EKP3J(oq?ktf9Vue|z`Xi7n;TQIoOA8bY7!m#I0;kIL(q8+0Epkz2TmL9Tzq9X5yHKXQ_5=s&f& zjb+-&8TzqsH{NQtQ>&-Dzy5%~tZ+@ZA6Wgg=8-V%O_;`{i>OHE*-`ISGdx!XWe>|W6wlh=De%VaM|-I z$mg0~xN-8+K#O=P3lm$7)TQl^nYVj!!|ccF43=XL>N}<-FrMCH`j!~aQ%9rW46?Qa4H)di5IAqtT$UToKH0_NhP7$K2hqpg5<_RN3k9 zUVywFMvp6J6x{qpf`1GJbWdyz!aOC_MZ5bHNcZf@rW;_TVO)7!`hyBxlR+I1rSG^? 
zm5bKydZv--$&`C$l)OCQHs;%?U6 zT(;Fx1KK;KlLOi-ItoCiJCe8olBwnL$@ABSXSCeEK9-L?Wj!YMr9WH|bXXXUDkOxI z7xoUjpWUsktoWN%?qd%h9F`LAt(eizcD6c?jYG!XRmkMk@DL(HidZS%;X`#;6ftRR zP);*Ei`GHcKC=i5dewN!u&vSv0v!e09@lqfAmqgxJZx4myTHaiz-LPX-}Th+Q#bYy z!VUaRKdNbct`FL2?>U!u6|7BZl7(%w6;KeiL%+#uBfFbTJFT;JHaQVIPT_I9VG#;bS3$HVpvTv zX&^|Yp_CRYJ*}HX%3!gPuX|OcpmyxGv3;uZM|wazZnV|T;s^pZ#ZP7BK!NF4c?k+; zhflw6OOPHm92(FqgbCF?l=GHmm4B~Mn%tVZUals+#^rh+9!o25*r}tszqo8vy9rvX z*&J-Qzv)z`jqcR3SgK)mr^bDw^lJ8bA`Nn8&X&eaWriWrC}eVMK5OQQv%Fm&@yHAF z_w1K7v4H9{r8G9buexyaT^GlP1ZlnX$qA|cadKa<)$=u_vHIn3Zm6S@eF1A7Vsy#% zOPXKK-c%Wt_tGjn(pk*W9S&9Wx3b<58l_mxH<_iJDpM^Osqjm_XyZ`=9pE@`%*Ba; ziD_aRMe|3t+T2|z0T5I5j@akpfs4fJSQPUaBilS z7pC_hG!qzCy7MD!TmB%bC%|AC#xq&y)#4|$YDqlPZt*t6?~vD~?z$fPd^ z?JFtoqhe%%nTMURj$9t@h!_=~#NKuPrqoFEYdkgf)ziShy&fiPKw7#!VB&6U_O3DR z_i>FuWj_TZx0Eqj8DS`TvZndo*PrK+h0TadV%l^Po zOGsQf4<;(1{Aih>fPV9WCF^QId+=XL;vp$6wHt<%L&KXmzz3y^oSjp$5Q zcFei*c>3*~kpI!$wXR-$&iYiXUqS5QwSv}m%V7hGE(0I*JkKbx4pNRI)wT!w(UO`D zW40ks@7lBN^Y9HPiw>C3b)4X+M>+%1tTENxLU1_w`|aPgISa8dAR&~sqmbZZbIxb{ zhO}6fAIObuFT}lG3SIUahZn`(qd)beKG^=rkBY6W9CB z-80MYVO_NKRjninGd-{@K~jBpB(7zjfA4EAyw#KmG1K&u?<{Ox=ylU%H2?B)w3RbH zV2$RCX>)SE|*UDAT7RYcqI3y`-im~kdW)wFf+>! 
z!V(MBHq|sf8D_^m{R=X40P-H&%V*0|FSKlR#f)Z73%(8w-Z6J7VLWw(^ipMMq^q>- ze}GgB%Y4wScQAf%)L!2Jwq6WzfT<3?z(l2r20tJZF4~|Q8M6f2B~hEN-weONRr<%5 zY4^qz2c2t6#q92*UfEF1*g@7{2gMo{$7uvBk`P zR7+)FBEKCs&3NDWHZG`Ip=-+tuYl!csc=fjoNzzo2j?uIb z;%A8D$?(o>Bhs~WWay!5a;ht%D3EOMF{r?tDeXloNbIN0*yX&>B#TAL<81K9ehj-g zlRL24mLCy3SqSyBsTPw_76g#9e=rFGL8pyfqH2ylVOTm_aegms3_h)f512^dEdd(R z$^7g4?TWl(R7g5URcZroRIMB^)7~{y+NiHp%GA`L6ChJs*k!%!$T4As5_iS<%sl9k z>Wla4qf<>;_N89%kvQBXgK1T0dy&~oy0v*n8hVu~3E2~Pw&^ldlxQaDwVkX9!VS&u z+s$QwUOLlU<-7x9gPv=v&jE7}Tq)Zx3Fj5rwUEGyhb!Z!dQxS&l~Ki(eFMVp%e9{^ z6T(pAb%{X3-JSmZGtMDCi_xM3;=i5swYq!i)%Xdmm%X4F^6XADL~SNA68FDkO+1ff zmUpo~zd^blI|rbiF7MSVC)kcq&C2%}jTKM$Y%$4N1_NP?*V2?(DR!|r8R~1c)wV*n zrQX*f?rZJa)op48nu;K;;%={@o^md&&8)WZ3yG0$+G3NRK@IhcW_#^O*I&Wk2wS)r zW`xyvT9|B4nQM*)5*n) z+1nd2%$tX~reEeNMwAPn#Noj2hxS~nfd@D98Ptr2yej0((1Pfr9}Qs32T!5Nry8_I zX0pgs-#F6qt0QN6>4{Y2#wMxeY6H+v*x->+cM3$X**cGAq=|p-ppRbZV5_kJQ!Twh zm5J!>!=f|h;{g8HknX12d)Cv{egt-5cHql~ho#D4?8fg-!!4mo^s4BPlv|~GD248% zlRz0NrAQDA_bQzNA%adY;P?T#tnOXP8V9_g&ni?fxkkSrx!NW`J>MKESllm%;#A~wG3+@eHt3LH0W}!eqNW_o^*T4eg z#2s>wz1ptkww>;!62#ZFAr#V`FFcCJheO3Y;8@fm-L}gR1jK8{bVjT@H+83gUxybR9hM1CiIJWY7v;oEJ z*rlKaz}Yj3iVz{wU&<-PA>YF!A(3X`bN1E025B8q z{jRgNB@>%_K8s%Pun*$(TU+?iG&MrZr3{m zoI&2PK<_E_m+j1VVa12VlJ*M&V-d+ds)EB^lnbX0ko)_4Gfz6^vw*2fveTPn(rfVd z%p&INbZDQ*+-Uf26l0Z7Evr~78JpO$H85956qpN)la*8pudkSnCa-K7;M5gDiz_<+ zq~p(j*@`_aPSF357c;0i2$MvQkV(<&L&@_>$X)i^RL=6K zJ>xN{bH@)@G-%I`(zjQ$tqY26j>g3k*rDxwBm8I@bD{nrz} zX594k{yRlx_O#!AYH8y&rRhMbt{H_+ypxkBbl*PyA~{Zl-#x!`iZZaDL6m!Q!s*o9 zXdlkpsDZxSiPT_-t!`;fG}N|N?>^c+{_7ChK{!Y9ZAVZC%VhD0OfFSVBA*td{432z z4D7_mAz!aEvo3D)zI+kS7($#kKl549>uC>vy_vSt`yr4|gJ*_24waECnB!?G4{H48 zuUlapBnl_{3-0@;sM{r@jN@p@n~5Ql(ny#AE~6BUA&a1(G#~_*L#oXngd12$^8^X| z$z$n$F9tA}oeKW4R*GT}(OQl32b?0sgQJek!DL4e3P9b|3>L33Hq}+dML~~J4-Wi} z?Kba~MvPo&4CV8p^da{H$tif69G-IOPbp0&%rh_k1Dy4wu_Kn0aPN1md}!}3uT8~u zeKw{Q5pw^DNQy2D;Ae+NMfL~EL5z;fGt{qYBJv-jBu97lb2@yhCu>2OGSx|q}q zVy;M?{7)Rt=f}d}+ady*tNHf0Hv>fhK^EXXMla8Qz=HZ 
z*qjs{aYz-Aj$}U7>+cXWK*;~M(Qp4h$nk$hEdcfZI>^y1FGfPteJAxKslda*w#YqC zCpG-97J@;npaJ03OP_tvfl;T7fZXy8r&TKH0@n)D>K*gH(HL%;j9Csm#s$z>>cnIMjuDs+}LedKyCoX5pL?S-RtM+uURY{R>p zUcoPr4!{C8u-!TI8_ZwN=pHT)^Vb>C7_7_iazu=j=qVn0cdazKsGJfuhx&=6B8;XVK=p56$t zSDZb)ajWB`{#socm%47_a58}5uo)%wIGE!?=?BWc3^lt5gTQ%=I&>R7G;W&9r80BY zw8ZSfW$Ih);yd}u8NWQ;CD@3ktkyS3z)GMpL-PBH>2IgQP2UtbsTZ1Fl>%`v7#+Q6 zEA#ILk?_LLt+-UwAt!9{l`(ubjtk5otCl(@%Xi);x;Ib3zk3F+Z^JwMZ1>haoKEF< z<7`J*b3732{EkA-*?;`-yIIfesA`HNXHsKINB;{?0UxguWbWvdx~}{`XS@(iuJ}sA zf{WSj_fVi743XDkyvUMjGunyo?>a|yHz~G_s$zB(4Q-|w!_?!7jl7khm|lMl4|U5o z%h?@`(>>kG0Z6T6`Ut$;uz$>Rrf_tLMa3DX)So1!GDjW6w^uvKV3lUaP-s3qyx)jJ8b~q+jau`bfKa(^}KCt zz$LTCQlQRoQ7%l-o`^5QPvi!<9UPwII8#5unR^79YIr5ry?}n@Cn9>{(Kb@1VI(kA zyg|KC2)g&v?=5#`&nRbVcPVn2SOtp72&3O_XjY2=p}u}+40L0YiWwtb<&vjDh)B!K$d2G(5u zn2|W|lc_H@MY%$#PM-NE*6(`REv{-`4-c5mkKS}y5K3J%fQo{G_WfGhB^wGLU#sOL z#a>ZU3P0M;lrgG#J2UD3kS^sFwCKHUniI~9uZPtOkn}SIR#lqbcRE}HwT13)W49_# z|9q!coUW|$A;3w1^kDEG^YQ>RSfHqzl0Dgex`2eZ`3H;Rwv0b9%N=T?A)g1-6c(tC z4^J(^sekVviQMjYI;B6)+-QCd=h=MChn@93nna&_d{!C^8HqrBD~4HICV zT>y#fu(v(!Ev>z5tWP9CG(2KVf%F#k8zLRtC-gV`fW}F2PTEa~Aq{@(VCogc3h;^g zQr#KKB$g#hpYGw(Bkl9ccF#F~Hu%H7ZDxxtMrKPoihEcZihJPjyE4*}y&u;)=c9MJ zoN^DapIiM?(Q|4A7cGJ_;`4;(SrbYv%oZn`>PT+SqkW7KnB-q0xPP0* zY`s35@Qt%BhTfSEoNs?Y%`v&ndUM}qxJBR!wd{A}h$L=wu2M$sMUy}2n+DjH6gimJ&V~uZ~ zeRKOG?Gw~G6hp0XbTLUpHqd`tr#1f0zG_01?SfMHd}<38(t3jQP}uJuB^m=x#zdEe z=U_fzo{5%x_n8>##VoN8gMXR~ zQDGOb+GB%}FF>*8(ZWrR(o3Gt3jP+L91ttoERb6pgk1`fpB=pM0ufzY4lpKH4lBvq zzY_vLi{(>>w9@yG;Fp@WBvL+rZ!XL^_rd0qKrMmO3#Jvz z1HVHQlGQT|UG4)>4R^D=l^Mc~5Ye`^!;cc$4X{vzEX~0b#kA|ErVB*Z?hwvqeLJca-`h;rxZOFIz#lgBQ+UQ2HL7?R*^dMf;kgqC(=VC39lUzM3|$_mG43 zk53z(#7M0?cQ!^end+Z>A`pv>FE0J=qNiJ5Iz_hv45VH>#Y92ZWLqs#VLAWuO%@DI z$I6MkCEZ0xOqs-pDGJ4;R|Xg_Uo_0FNg%r^KB9xhgHY@Tgphr5z~zIz^>#}oyDta^ zzaH|0*DAw7wY776={n5~2}f=Fawf~-7rv+%w4Lp8Y$xyr!k#E(a1yL7jnXHKYMFVs zmnDFNKa!VOWY#b97iK=Avaa4v4VXHNlCwEZ+L<;xP^sf?DJeW^-!~MfFpIGGvg``iG!f0oui(2;}7 
zuP?elct{BPRZ+9nLnZ4&2v=*1V$JnS@y7@2^iMnoXRp)lhFPX!lXS|1JWM9*k_=5( z=ExD;U{gIO;p?XPuQeBt+Bx+{yDmpX%p17$ML( zf?(R*h`Xr`I_GV@Rm_(Btt{cxG_@N~d}r=m=JEE=7fo+*s1`4p{No@Z5yZtG%v{a$ zaE#hNwf&3{up5F;{f=TP_@)9r|npsPo zlo~go*N^=<`Mdq|gR@r-R*%@WsF(8mCbt(bxm=w-jj@bh1e?B6sbV0!wRl9~E%d7E zz72|+3EuY{jo3C(!Oz&OS$T~wwQyk#E6F}vn6;+(Ggj~B8wj%1n7a~IS6zq zc17HBu4~>@Ej@k?n1^=)_dKon__(ClVzd$g&HepK%oR)M(qv#gGj?q?^8ePG%xjwcageTq$j*3xW*&XKA$9NGSBGjLfnNAl&q^Qm)#u-AiR-4GDtKl` zjFJEGtpAI>w+@P{>DEVamm~xTPC{@?aAy({NN{&c2s*gC#t0tVA-EG<2Z%v%cV-Aa zxDPfnzs)P(_n!0R+`9kVx^-`zs`+Dz9(wm)dv*8P&$FJ@l$y+55%MRQpE6PL9k!7) z)1~dTlGWc#q-HHHLa!`dLzuz!O{QLm76S%jlj!e$k)L`xmZe3uu0gXT&?a`QftHHd zYZapLdUCQc&mJC?joHr6DyXiZUIMLnK(Rz>Z9i~<{&PQFtsjQ zl&=MX1**)O8%>&hgV-mX!*mJoD^3oNB2efJ4t^{@s8C)j-}$JQ#EHtji{oLJ!aJ1@ zkiARbEt#B*0B*ZMG%)tG*KiJx)UPa9Miq$}Ur3bK-LDO~qG|>n*u7*gBcb zI_hZRlL$8@WX)ong=5xcTB5P}ZB&z)s7Oplp=I1UY(y=L_1Z_(nO8BNWzId%yG+NJ zEz#-MI5iuA~%>#W_qM74_7vv=uD@^h-D5>+BG_gYSy7vY4FsM zJp8ngy!QHg0|TY#JL?5S@E%BoVY_oB6!l{&NMnfIOgo#6%>?&~;XzmDU6i?b#iX=w zAtF&?ooDQc)K}_o9&HSPpqL*B?`K9=-Ma#>+G=Knv<*=)vuzhKzLO%f<$~&p7&XCt z-(UU#fBMWeqfFYX_bqowJhS6$8AW*2%R*~+UGqA_nbkG1@F$1HVrQ6e$zh(I&L-Ar zRT<9I2xWu4?H_2{3kMae#p3%Nd*XEN-NE8rsOecNQa!YzUa{$kyN`10Q59IvCJnA6 zwLdkKfLXi5fsHr0P6@0-O~KaLYsUzWnALcjs(0P>I;ewc5{(WHxeCb!<^`X`sUdfBiamGpBfFfZ?h?`6Q@wxB)>Be~M`dfz>H{sh!~8 zW+?{`xQav#j%_Mj+Ko(da9;vs%SWG9sOF?x-%BTF7g@`pCUHpMcV$^7(X>Dr`XG2A zd{EJBEVIplTP4;#(aWVyhB+Aq)8pUsBIBras_-%F=tR^(KRZp)d0_~QLblVV!5{&m z29vk8fc?_3&LygMv?}~F&U+r3J49IRVyz_+)lsoHW7jTe&2 zYKSz%T}TQ49Z<(C5j zYU4X!UHrVPLyIkxZ%oXKNrsh;W}Wm2d7K&z9tr4dE5tD5o0&3{Sr6F6tI7&~n8~xc z-CMDHp6kB%a3EIRJN>Rnu5~SwZr&4Bp*lJsSquP%*qyi4qf5sJpK+l1JlG!rbDsh$ zw?;|or^9S_!igh)lVq0nDW!hV0~ms(brPPMIs}gBU{y9ZgX)caw3;r?EzC%@4|0fZ zKLkQeuHaznsjZfi+~j4cG5vs%AI-!FyDZ~?pu_(CM}`YqJ}_X@XLxAP{{-M;IAhZ= zUJ-2^y1E@Zp`T4(Geop-+^4;V9z5)N>_-(NwPWtOQJacY^sOw_{_zG(lAfAS-qF$$ zc|g&j)`K&|0$U!jtsS7T*48!LmSeBBj)`8mDz6h9OUTYHR30;aFZ(Z~KjaG`y=0vl zTH-9vKZ=FB#A@eAr^-=Y)3WnR!Rv~@$%v2--0A=TMv1uPyhgxGM0911fx+2$T?+tM 
zEB}Jt{`D=v?3#^0%(`0r_m)2Z#Dpe*Pq@{AbN^4s2?!qkDZdXntA>J&oO|`DS?p%R`FW@RzeO6CkEJ~10@jFf4SYi@1CRMn*Z6WU+w>W zvkPES)(F|6tVBRg+z$oRc(o|O+U={J5%!hah2?<<3Hu84q>lLO@m%_UaF^?MJm;^ZqTlA3Fndy@c&=f%|1>Z_ zoe<(PTkJ#v%i6zg?4RF2_pn|F|1bISUyb+vx){U%R+#vQKmFgTFwwhTbJa%AAf8lJ zXD6Is(C$hjL-8^CH}hIO3HJjVMrxxPQ@Lp#xeVG6Epb=pgh)r@D^-QZ0FVz&rec?D zSt}6h8KWBmFL(+|$uESJZzm!XOz|{TZvD+Z)Df@P9lZnBt$kIQ^&7u^*cP;2yej-R z=|V)GC1;49H1b^^GUMvbo{yAB-fY=jxW;V?7}AgL27@@M#TuLG25OJ9c<^xw(qOs}3Z$9NKvbtWB~?f9(b#HN1=BO^=q}4vY*I7is7q z3#IbP*Jw1RI0kVQ8rm!Ux;x6V^XTggBeK#EXAZrQH5oI&qy@QS#)$Bx6x)K)j2?-j ziQNhH$MTkzm_U-pi;x67W>J2IGQu*4RObuamW$2F(xWn|o!Rq(27kA2@)P>;W*L>b z6ArbCbFZLOCwszA->I~edN5S!y{qG4;N*E=;?paF*<-1TacE2tY^;n?<8Cabd5yIL z7p9@y-yS(De^IdC&*vDh@}PbD_rUGdMC0G-Q^bA)on-J zs5b=WpWycswVRoB2>DiOxE^Qd5+s>;s<70BBM5X`@YsAPJ;wIiM|IQD|H=d{kWX#O zm?~W+UrE0*P;x41>bVa9H~?o;gR$~?AY$B4vv|#7PG(tn?KG(dA(P@|(`uR`ps{BZ z@^EW!DX=%o#SLDOxjs{%VX)k$cER7DR(jr^el}_{iY8W07Q|@&g2N0>a8DDL`D;@L z3WP{kMNPb4g`{sbV6(x08KupHdIC`TiOc}ZecmsS)y%4Q3SaAT%OJ?N60-L^s<}a9 z7+U)CJg4O6x!PU^u~5LOpq;59Z8MJ6PaNQJ7>4}G8JRikA9D*K&qwS`rH}8hI1RPZ zK5+ev#(ZX>GL%)hcACDlB=fl@3A%Lp%Nqzb?XftCKx-T1?RZ-b=&?OM^1+e)Xpg*% zZ!!Z-miZ#y$+{Eo&jAY3#O&=3gV^f2e`e}6AyfV*9TYTLqNUkxRWaU~^w}t#NwP@+ zwk^q1JAG9EZ}(Fk(P>X#C1{QATr@?PTvVzj31H%>uh^AlhnI_+HzPo@u25e8M7m z;hw#ICy!C{xYTywSE+5!Wbe~9D?5H(#DwYQ`AWp**!>Ax4(4zkfRq#9UYx1dY?P_% zO-CikS=lO1;k{U|@#(@&t3EZef|O7?GP>4%V2|MZB%mHwAi7cH% z)$6-q35)x5`*{1O*dbi%B>f=HOE=ktSh%g8GD84KmG)()Pxe`TE)S}tTdd;k&?80^yTXaimYM^X=n2I;$qz;N^s~hSRkw6eh zOW8I_;9Phg+v}mr!=aDeof+ny#BeGJ<@gluoc+yi4fsDUGtGIV#a7&!$DfZgC|Jh= z)1HUd;4@vjI(PCmpkt_fKq&(j!jt4P-U5pF7mb+bQpPMu+|qt`-ARY3M%_Y5 zJN3_EVn#TMQJsiZ5!)$*)%!Q49uv79dDZxhHI0MsMKR#~Q^ns;T8j26?8Z2B$hlRb z`cub_FV@2Byt|*MG!c~rwG6Q!%O)MJzAYU$z{F`j$zo_+;&(oC!jP$N>P$}tC;ayY+fDg-81s*?`jm6M+iVpX zwi|o;tNx^$!r}^juN1YUpMXQ%gc-bWs1^qD=aFIh7Abm4qH{Qxo*()81%&B*-`R<*q!tAmpo)S+ z@(rsdA`1c}%VZVP!sgywvww}94Fzk6SM_^{Z-$c%m!Jg?x1wp`h6TxM6#sT>XO?JD zDry*V%qVIYl~esBRbb(PR^gHG8xf@~v)mU*zf!j{ylDK)7jp&XKd$MmoqQH6wF7*7 
zO=o>iz>O&%bsncm&K##+HY1FyhTC+=%RlA!@&}^G6Ow?#?3ET6 zBJXpFD}da1l^*C$q+{`&K}fJ3zk8u5xj(ci`dyHl52K`;$uR8tF{mn$d|V ziSw%U9u%Bp6@1RYVNxh!>wJTDuQGD=+X%2rR*E|KO?DLDti?uph)gPv@jutvN=wC9 ze?U=uvOpj-xnYnu(KRk#{W0Lx5S|JX&7JenhA4wIGEjq1{e6Xnk-@o0RLYE_qO)X4 zw(VNNQ$hAUonK7XL?8XkX%Zs?lWD}omq}5u`Q9!7Icr}^`GbKR6VPXZluF&`m8|2W zcZX`I9DZH?$*az1)$q(Z^VRlJYQ4v@z=66azQ+s)65_zO4JHo6|4Z zWRmTOgdPWYH_V#N%?dO3-5r1nEAOQbvgo|jde$deRgMfg{EsQRZ7w6kVH#uI zcFncl>gHXNyk0sEJq7~kPM{;xzK3%Ffi>^HFgEKUw`Iz?9iWjMqL~S;rZC3?!jT&~ z%($36#$#T~7*4P#34|4|(Y#t}^w+0DE9Db*sez$MzLH#xxC4eH6X`KtWUbA0Q+2hX zVGT@t+8zru_iL+`GeqDWuzc8f1_`mi50c~5qZZh)f$c<7IZSTG&9C`FQ|M^mGz4;= zLR`)I;$B-sDjzr?9U#Xu9Pn~o5can>BgIf7OfjjTmrhR1Bh69BmM27>v*~rBJAyib zPW-HS;R-pHR6vZw7hA5kK~v^jnb8^FT&ub(JCL>N(y(10b)J)CPCue zFXD!2r~)RkEi(!K_V;f6!GUu!rg@f1dbULsTh!o%!or~a)=Px?m_GK#)A^EswY$7D zV#;NK`vYLRmV{K>EzKVgwz2Cl)E{dN!Y%eJ8B8oV*tv1=b>ef#rD}h@X+q;uEj+H4 zV|j5>t7}~Q(X~$(Oy~K3Cj-5`pdi5=m{-ZgEBGNT})W@ zmKl!(QU|?6A&4v)lwA?z@*rPzWo4rYjy)`&42$RuAC!V6f9b}Bsxy#)ZGR!;d>+@`N z{6?~Ar`pshrkA~c!0*3+sfvGc-r`}{yi%baOBcOrfl%7_1m*b#+B$(4#1z~Zz%Q$*x6>;2@vN_0KG}>-dNc1- zS<_82T;p_=-gw8xaCE*CJMytz4S<`x#@Vhws-l|Aw)FS~$yKp^l5J-iUu8+Wx(!*7 zFYN<8&4k91XaTkk)?#MGe0IrU76UnpoNY3!@pdJLzstrBU{C8uz|w_Yt^oKhpEk=N zRZ4__@5^G(yrcW~I*1oid`~OJq*=0JBuYKZFV$m*W!ZHXd7L8PVgZWP)eO@LbyMoO z&5K8o4Ka&X`tZZb+Y)bu_mu$YZEjAkbV^DQxfRNE=psE4KZaW51?&eDXHHzU$shzy zT6rB=Gv9d1Y&OZ!#y^-*K#H1Gx`#pY{lg4FLU3@Nty{t&2kYxVQRON8pYBpXY zQDQ-8?NY|R&A@m_+TNa=@t^FTF|X&+JZ0H6dgel~{m{fEYTxTSrDqIH_vxU{2M9^R z?6HK_kR&i?nVDlh0*ONPF{u!;MjymX-B5gCxI6X+^vOu zGq|XlO}tWru`sZytWQ=rWZ6rCxhCyKo=dmZkKqarX(n=6m&n&s__}QLJU+%uwMaVv zsKMobQG;=p&2^f|y1ZKJI*k|aNpSIrkrvwKAb$5qpaPaJm>UwdDsC{D&%#BkQdqZxIrLQekl-2{QL8bpZ3`NPI~Rz+lAHrdc3ge4UMZV z-yLg~wS%Mr1|$fgH7XT80J&r zXtY`8tc)2ApI6fZ;X4o(Yc>SGtP8Q(9zRWslrC(uAu85=A#htLf-0rf%NJ_d|CrP% z`vuHl1>Uw(Sh)0LFA7{gDOdk8qt)r>7qZ6|U_(c|HUlep^~kCRM*BYUGB@Yuv9Nt} zE$-g}N|Gd((mn)=(Uoo4bKM|PzZG0(LV6vF65vGd2nNVP7<2c!-O|D8XqlGW0G2dW zsu0Q6C(NIMg?r4*1bedK$)5}0eLji_kPNx7iRXm4%ofMU=^C&sveFO8x7(rL;D0H> 
z(m&t!vYhSJhG9P+0W~Q9NS&{SiS4 zA2y|j@3IuH?lt6?kcN>TGB1YUJ93|ashK0U>dn`fCnD5!rS{}`HOOtM#YSZb^!&Zpwms&M_sK70YmS-=m8`Bc(4;E{{c*Q&BHLR%zdvqz| zZ-|gt2SX`5jprI(^!1Cs!}8ReyIU)i3&!qN;6g82`(Di0??^d6)|1|UjaS!zRp^pY zWWj$EdOMcy^5HE)(C!~0yOA*2J(9)v4mH7x=y5!!qJ11@7Gp{)03IO%ENNKOe3S%t zLu^b@dyNibHkMe#KT(KRe)MS?7mL!Q6NkC`ngnYj+o@k1fR&l1gxk8j#PjxW($!_9Q)rAE zq0Fo|kWd)%m}uK?tPW+g1V(!O#W}w4yFiUSwb~9vv>h$)P&XC>VSXaCI^ODf2UQOS z^eJo>CrSomvj5tCoN;~}DW%(7$BpfLA1Ms|C$g|bl2)W~{CPu50YJ353!64E&j z#KA%@DmrLXeBGh~-D6Qlp|lqWa&(bw2SmRQiviWZSSi|VQX!;}Yq0qz$tQFaG`=x!(wzl5`g3(4}kZYkXN{Wrsg7j+GbLP4;?zPL`!{4^wz)pWPHB^ z<+UMtl4^hGhhNz!buLVgTN4prWuY~+4X^HmP73oi@P_0`Z%aU*&UF$B6MWu zve@XIRp~G9&J18Vrw6Hk)%-2+t)cvI-l%$zS-1KOYS~4yVTP|*CVt1H4IdU0@E~KS?1hRbP{4* zyWXisa5Sldg%o;Z>z>Du29Ba%OU>R5I5M(|nzCp&;?jl)sQ}A%Su#R_07t^p}7EzL?wTD4UFuUJRWBstZ)=S(k?9TO>JDN^tmI(K~bvf zz9C+@agYk}gDh+Ft!GuZ1>Ei=+!)O-U8<_N&^Ip~gcy2ahKRynZ*Lzn{oe1aFLD%4{dK<26KG?AQF`i}+2Aa*76e-pfi-Y3uY=whL7ck=iZ# z_S9J;1ynMTx*MFW^_g%b`{U_H7poJWeJ3L$?;JE`g{^U-v~trDt!mXf5+PyhJD18w zR?=0ZyKG^FZ$YpZ(jIx7>hDmu#D|7;UuZoXTOGM*CWn`J6pEjGM_CDFZ|H+t1ZKQujHiuv*L9Js04p<=d)s}p z!xu-}0wDt~&`$S0Of|b+-YEa0^B7E;a{XxYLkfj+M z%d9s)eQix7=SJGe*r2jjU|^bn zitu?8`tWE~JYcpw@GX~u__M5L7gS@={cXXr3H7MsCZ$Ceb{6~ZVxyD|D)fR9x1eaF&i_`pyIB;y7`&0)U}$hTJ0tC5;oh82LBx{lf%Cpt)e(F!t7Od+*J>mS1^27r*=4 zX2qCm`IQ^*vp-~b|8enERe<_yZq3iQ@uxxl+XxC2fM$3zRsQewRj$|is(BMl)!(bL zMz7Ub$Hv5(f1CTi^bTsK02t3*K2G-EtFz$O>MY+%T+ZLi>E^HHbY95`6u%ea9bb#_ zq^Cj?{%y#Ad{f`OmeaK=^%DM`==HsB&H_JD{Kr%LH~Y!qbCs zV*w%j|5m5}w@>`>?f(PS>CxHkN8}$qku0BivE3ndd?JZ+>t^8thu5?;?@D@p%8vK4 zS$uI;SNKtQh=`HDrE;gn1}CUM#5TWcky({PHEt%xmX?#78(-lC-oTh@#Pj#yZ~Dis z=#7Q2)`fN>$0qFdu*+T9#>?Hyw!I_&EHBuC-)R7h74sJJ?>-yUn0U;WZ(_dO2VwnB zA1J{C40sgTP#gwf?EmmNj=;c>(4PM^grQ0L2mx?~E?7Txud>`&lx}?&ztBUmIRU=B*1Q3mDXk3 zz4=%Xc<<94!q*ZZ;K?zui<#uJgGfqglY7(bZ+**hkidZ2Z{A74$akkDUZ=`a8Go5( z5FA6TvRMInl<#-)9JPh7hYB0CV{qttLVnKe(FL=1c z-Vd5?3Q2zqL%`CaHcSV}3s=-@JRY3CY554+zfryX#Neg~L+fyv;x>if)2|f3Ki^d} 
zH^+)SWm~bN&NToj_s_pyAl7=zGx+1fQ+>qq)@puQC%xpm1H4#2L1VEV0Uab3tsB%b zM1S3Q8VD@C*4ENT=`R#D1R6YkcK|D(D3+Ss7#$RHASEmJHs*OE_`VSs!;RMP(s&Rv zp}tH2MOp6z#A&q7z}x;YiQ&+A-l?B32Ti;|@P{{H{?25V-#{Y5lPXwpAApA~(^7lh zDX}uaA)4sIQNsDCbnadZiO@NL(36^4lh^O_Y}u}zh2%Mw03^`*t|#i=srbaEq$qbx?93{{Y?MCXl%Kk?k6p9Ua?SjS3ggES&Gofwe{@sgoPgLc9 zW=+M>7C=1Xc+3_HgfD$fr|+PxNmRc^*1gQOdGf{JIyPU@Cht3Uwu;HbQk(0A-&}q`%%ocZx-Z zyGAf?a*fbWOAZHW$&W!rt5^LjabDphInI5Ej0>h0!5WcXwZT>qFPKP?eQ{^D;uExC z>ys(X&9#vfvAaLZvOW%cP@u15mkpLdO>Vo^j|G;O4WC+1{Z8NAhqT3|enRT1@W6ws zJg8Gt-!bpWpvhiHIGo*`=`2qD6HN>fkPJV?+eWYA1cq3s@^!A~a>u1{& zo^L*9it=A`c{$1t?I~i2Osp3PprtL$A4Vd3Z}j2uQ1;<*R_avI549Ju8H~>Y=)D73yYyIezLJl^V)|X6q?dyV`76Z0_(d2RQ$7&s%GuM-8U#aL!L^LGt-*xBGcVA3EDDUND>p z$;={%mW&B&^f7!>^)U>W)qYu!2QZuqlVb$vH;$hyKG4NshG}8P?t&p44!yTI&unJR zUpD9ydzgMuQgUeB|4vKlpTbtvO4snYub5Eh z*u&RawSK;HG~2x$n;H8m=#=Rv6kGwW*z&E)Oavn*JQQ`0(S?7T&7#nw7~B(FU8zAyeY=zOgh0;tBRXnUi)93q|;S!j^Edpft!EU)xmzpDAs~eym~xs;lv;@G{A+mlWyWw<=Lt z5u-YFN{)QaR>e703`GT;Cx5$ykK_*29`6sT*;`sm*VtxOZgTk4)bPMi$y@r(L~A&Q zFCtn@wTizZ$n;|Dxeaz6I%LVJ@ds_g8qjA2h8}vduy?)GR{ApSm%?9vjn!|k%Z)c@ z?o62E1q?|ScAuX0IU96oRxquru{3MDpm7p$Pm8ug(k`98)6UMNml*C`eix8#7|T{| z|53bDBAK`&`FM#bYuQh@M$Fq(!sZy8SZKFJMy8gc`_yhEgY;q#bU3#zT~QL|ikTD1dQaMOzs{+C$|1GcWYeSUVdI+@jaQsy^{)W^ znrBw&0SWWkT4ZMyro0n zZCo#*!)|Hm<9--}e==VhB|%?YJAd5itI|<9C!Q1rUK0Nj2jmqS7I&h>WHFz`Kuv{J z`k)eUi`2H+(4c^OPE=*5JY#jEu!~7EF!k;Xl(?}-^GIQ6yJuWloG+OMmH(m$pLi`mKlygYLmJD+D7+P=8AT6++e)^w^sJ?Csn62hTDe6Sg^( z)_dB24zxt~WZ2%rjs;ZJN$dfbBhH%14Tiz4t)ifrzEB%A+)$feq@SP$H=Gl3F=2=J zm`-Nqm_IPPFf7cqi0RS8Sly8H$oDygf*3OfZrOYj!+2L!C)#^cO@5OX2)_c{Sn6}^ zo?MdIW;6SBl*wM`@PNvzVF3Z=O*Nm*XU#tb$njn?tZy4*$5I_j*X+rj%-CK|UJaG; zp538&CiU4Vb2o5FMbcZ1x50cpxYMimyqy`qhD5KPFqZ_E89MC+Xbd^}YGo_fPa8~8 z%@>5uRch=-$Du{HKUJz&u-hqU=LpZrEr579}+x=9c_c6qMe^PG zO9IPxa56;?T4oaQR=8)HCWo%}WyB9eglGJhb%8tcZfgkNzP&oJbB{aXVL#UCgCkGh zU(u6!OE_$oZQ0N$0pgEHH3BhQsrtl=RrE}$^{F=#y|}3fo}KFmdXiM`g4OmXgAL|+ z!8kNd2s$T$ctfkJosZcZjt=^!bS?65Cej9v<-gn^nD<8Tdux~dC>SPYtB#mFK|q^o 
zBqqE8Y|-I)=dLghh5;2j24`8Tlvq&9NyGd0s&MvB;fGm7z@c(Eyo4D=7_aIB9eFQI z$?7zJ!bcw+inb6>&stCYRG*r09_FoiW4ixj<+yi@9FJTbg{U0WLEZzTS?-C1Y}HgT zNPHweDT8YP9=eZtYmLrOV=w(JF7vWSJt_XTFRMiKlMcLrrhY0Ns0EFHQ=QDg@$rqi zE%%t^l# zH%zd2)99S*@a1Ms#)jKVv+$}8g23mYUG2-00`4tUmNrwrl3<*Khjc_=@Wy8D($WN9 z9rpybL-xEbHsWJEuKENe0%ix}Gt2h-TzHItY8gCVC2>9y($7eJXQVwh>h^2gl5JD> zf_YY4coh}A6cC-fJA8#zUz#GVVHuKZ@4k9}tKB&wC?);2;FvE8N@q5C`}y?s8k8H{ zLEI1}Zduh*B{K=hw46mH1=;ZKRhzguOjqV|O6{+tO%tif7ywCkc!U|*82Nd0u3XIn zpBG1uau+`;?X|0yb~iGtLa$3_gtv7pZ(#Gi)9e+wCJ&2`faG7o2(hL~T0MA{ySx4TkZx)*13Q+l9kN+u zNO4DoIiVm$RFw`Z_MsFHh&cqLyy`c7C#oHZPdhAWC-xo(ZU7wpH*j}xl3R)0Ho$z| z>V%=9CedQ~`0=x>R#pvLvN@oysuK=Tq*E=w{~$wu1!=VO(N#s+{wpoc4I4VF zb#{yr+N~P5n2x@|M2ou06C29(E*Qf&3K6E3 zYEK@p!+F&^U9`VDYlobA*L{ro1%qcI@sDBAE?+d$Ul*c+q0P9=EJm+A+Yf!Cns@9@ zuGYS{Y)_*{{;W~7E8(Dl>eA=YzT<8JaT z>HAEjD>AL-#atdmq9-3HLD0h?1Xm)7zq5ag%47R4si2)NEVd)r3xfTXpT_pTi zYQ!Vvg^+y{{_GH-z)6ZV!0()e6&u~<>+m@4%mN`y^3_6+RaVA2>(pq!U-DP~;o&N3 zCYoaRM_!)aq@6At6GrvZ@Jih47YXD|s6jO8ASVX=&YXbtFmQAlzwgyI=c$bza|7g~ z4#4qxTG~A_s7c3LOxz|UU9n*zl(h~S6lTfX>&inX>#SCK(YTc?mRu=&os_XXL&0rE-){1oRpBDoypZ!+OTz+8`EbE&J{|F}e za!3ax=*UQo*8z11K8CxQU zYw3uGTz+ghvl(u}{In8;jqRWS_Y0;ZK;S8f{D3SoeydEJ+cu;T|7o){>h`yzE?~jQ zK~(u7C^NBG7SgO5Lvr#xhP!ZaR5YWRR&HK))`+kmN`(S{b#pw9`Xopi+2Ans9G84x z%0r_o8>x^A(a-N$1pJK(@xt)eunz+IedTB%zk5tC~`Rge2ADFn>*Jlgqh@MQ9yb2Oh6} zp60U0yGh@o+f-k5?1DUOGGdr}YLdZHv#9IfAQ_c*p7Yj|m^FGjl>9n4RB!HUx$SsS z5%iRwSVm^zY|-MIhT$;GFFslL)1p6+$~Spmy&jxf=y0f(+miGwynzm`%xON zDAXR3fAe5e6uJEPls4}}(QciiuZUlr2PbqFOi3g2qhQ8B!r%nrDFaA2P{Xo(uRf(F zBb$lQSL2}}T3B73m0>?@PMh|&K1NhTcvqsW5E(#CD#ZaN>27(U;||9(0W(}cgl~{l zdV9`*9Jw~ z&nj8|F8A<2Caz52aCJ~}W#>6EIVg-!UWN2UaJ;5H&=_SOR-!{P&r!%i&`lc_*zh6ebI`El^QDG&Nmg{X+_c zOjrkEjTbzYC3*^|YgCxkYhk;xlW%t^@u-#cMjWsw@+=zWsf4ZuvK$Q8+N=#fKgC7T zY8m`Wo5rz@1CA5!^ttiO2tzcf1Bk+P~Si`C~*{Dv*t#F@4uRpHZx5KgBy22u)eW1bWWEEv~QprMDYxr{Fq=81Lr!jB8gfbniU)517R!L?j_W4O=CtrWENsol=FDZRXGlcB<>{jSy9 zP5N|a^T!8w79?`K#&_+hwoQiM>*-LE`YaMh=b1znA=}v)p&X-XovffJFRE>8rR<3+ 
zD0y8aFs(-CM>ItDy>UbESk!7cUdIe<8zS6)|ZLXx;}A+ z#=RR};Aut2Wgy~_Vfm!&j|PaXAA!xePf?=4uEg~yG$!0i zd@twfmxZwH74Iek`?T}Yup_T;9nY|xNYhppt)SCe2x7)9Z53R9XNGH}b)v+=um05t zo6NpzNY)1wo|hS_n+Yan&xZ4~gzHO>yw?>6N55xEaJ8Boer}W6HO!RISx+|+ z=jK+(bggCSer<0qF6G#Wva&_(?#*&?pXbt5TQ{ms`d5^n8XS7pjL)PUW)CIL4fg3J z#Y9vPpPEUDPfoO@9xl6_)B6j)u&$qCWsoe*8UrhzW&(vtGTZxVgfd+vF9w~E3&Sj9 zCid{n%Tu4B@HtDELG90)y+vFGH%sYDpMTQ9ARulotsG_V?)Vt=_OkJ zT>okQKRH)W3!s!OG)h3$i-qK+7uigz?ww_Swg5FW^?&>gx1SRQ5a9L}>H1OW{er-VR}HS{+m{1pfGT1xOJw$bz;(gbU&;Nu<7Z9XP6DRFCkf0#=$nDN=TLm#|6^3aAT~hP-2bgs zHT5qkB(Prn5VI*ql@#H$j9#=-qGAB7lbsxMUUN!j4Ts{r4ujA%-7fP}CC3 z|8Ru-|NirTet3-)gw};+yUAk9t+o=QGXkSJBQ^8EGZHL*8?a#&X=rkk%dYxA|GCQ2eP)eP+Fh{o|kB zJnV^U=dlr`cic;de=?G+`}3%sPXk5%79Oqfbv!Q$c_gLdq2J+uF5G?;m>60xfIO9j z|Io;DUk=`NKnQ68AlS3#i{IjqrPIh#GBlhUeM&uZ;{0IhYr_6CkP0QB3dH_By`otf zKZi_}$Y!>DdCkP0`ZOE-Y?Rysn@G4WHePHW8w6}IbyT(OI+W`MKm&|4@r-AE(Krt1 z>BDhw9ak2u5=IScCgb5Bxl_(i-1%+<*z~}VK_w-gjLY~ipXkt)!g0h`x9P+aj>S87 z^zOC6V6$uFByWpWCWT`c5fJFMVhPVSyT8p6_m+5cbH}%-v+^Bw6Hr_7B{1n;<$w&a zW8aC$=D+ij;If;M_SmTpJKiBSk?=fjl(_o!9RFa|u+6d=&)uEJ&eqA~osN)axJ;w} zDoJ+LdeKT_9sS4bI>ILH7nnT)P8c;2xXhxrf|cobCI3jrvO!|)Dh}|c=>BtT zBfV0ykyv)BDjQajSGo8hOSc24EjhIpYM^r1>N@o}(O%-#?7g6Af`AS)J)MpYGf61X zlt*iHTGhbQadJ8Jwnxk?I8wfTI&n$ulq52q!}WQ)qq1I?Wl%rcRw(k``(Xf@HZ96s zF!aq!HxFD!ule4Ay(s`z8Rw8XTxK-}U-@N5(nf!JsX&MoERlcxnQ!GL`SJEg+>1of zp5=fOUhVI_H+dTiQZG}iDEp73;T`4XC6h5*>qDZlb2-4)C*rNidFy(i1f}6{nR!2B zHA03oc2O`*%*W|*_$!%|=_NsnPC}AE6_CM|lJW8pw<})hT70|nxA=)u4~-4=R?E`F z-0~o&Gp3cRlzoZo<@<#{Gk{PR)OUNI9#$6jo$e~N4`K?=N-dPhjdq1jj=7be9J51C zOcbQ;$T=%*QgNTN>KT)3X}qKZ+)sm91+#}1>Ax-QE53hxS%^Prcj2)>@hVr5TVZfK z5H?H3WzbWR?X^p6kr-_f|7ciemxp~@&*D9ebh9n4DfHGT+cXGoNFI@IdsK6x>* zUNdppJ6aF`DU(@vtQQ0u-C(Lr;|e(L9@6ofTvm#oLo&Jh>=`St&Srnn(JB)(L&}d? 
zIqvH`aKl?D^FXZjX|bD(@`~Q8-c?M{?}YK&Ne^}D?X=UTep zpZm~<@1(RP*8T2*%&o|{0oG&BqpSJ&) z9#-}G(as*(e{<^VGjbUK4l=s^ObKJuWBVy(|I9Apvu+f>LD`SuKIhj9Ke4y8n+{cA zHR)^874Wb&3ybMH{<_Tzj)`-XQ<0OxL4i7wP;wD)vZGn`bFDCBy~D?J6>i38eI_Z; z6qu;q{s(~xfKY8#%9M{z#BAv?y+zO{=kZ|+BSzq~wDij($dC_2RA*cxxeoMB^87RT z;Xc2D^(P<#13oXk7N0j+$eoO@u#$tdzv(|CmLHey0H54C{1nOW@e4QO*mok?f%^Xe+VW$s=Ar^EXgZ zj}fBnX+tWh!=8@KH{#s>yrdawgfWikxdxQbqIa1Uu4c4G&0aG;v|F-jdCB&l-ELJP zMK}B}YaU|=ILn{C>GYx8M=xZL>`?4YW{Q$)uJ`m!`wY>HCXp=& zyQ^nU<0nPa?IvKyhFm07wi#%#&*zq1)Wv=touBaClZ}B#A~4T#?ru_?V9Z}ii?!_B zEz7&=g5C`7U+Ez3%}T|B>JF3h!AfXPCcb#c>Y!6v6r)5`S`;L=AtI}vSDP8F){?_D zSz=#c*%E>nKZ8c(AfPy*A6So;8A5W-vQA%jubx)eC1)}ef0Muhx!ih#rA+0X=Jcp& z_|6lT2A$;elkFcrTZEbhxxOD&aMuf}sCwcD?6~5wR1m8)d*mMcsHRWcSmev-=q|4o zI6cB|jfPaK*EYQECUAA6_757{K1q|wH(Ip#uu8%fjc7fswo=hw+M~U#&t#NE#$}3h zTcF(~XHJ$L;U|z9iJ5WWT4u}wCRUFYGMWumD{(J}F5N6BIzyCquovkHHo0+BUjryG zc+|HUNM!GE%nu*!NY*X4<-hzj#3u`A9|o`#MCQ=(nJ7mk=>KW&J)@f1y0&4efQX0) z2uM*B1O%j)P@*WPNS77}MNoPtl+aZK3r(8zDoB?Ofdmlgy(RSCLqNLpcgLgW-1l?* z{(L{5@%Y1$F&2C8wbxpEtvRo2&TB%cSw$bpu{t8*i%kP-{a$9SKyGI+&F8VQj;6;) z`SuKNAXH7OlPh$M+F6I|Q*RB4wG2cQ8qRF19I%5b_dO16XHEFaH*QbvJTp!~<38q2 zs8gf5^(Q256OkwQ5;jW{bv4mF6mHtN%}vb<1gYDSXWSt07-%Amn`dVDoE7}i)IZG` zq3S~7=T(k|J4|>Z)~Y9q$)lBtOvjyzS36Ok?!#~0ZVD?bn?*LjvUi4Im@EAY0bAQn z#$FH2=PNb?7{%z1~RPHfQ^T$Az& zNs1L}*_=_=)=r7mL+Vsa&|W@mfGjH;7C!*uP^vFg!?uI@;^^Y2q*bk5Z#`i%Dt^)y zyV^eAi&gUdEJ{W^3LsmeTRvv0i&W!DL$xQ{LixkYl^)$Fy=U_g@+HCwWb6VZ=rlP# zg!ZJB{b5_1wrihk=;HHroa=ELhiTc#@z4?6DtFsgTh1n5qQQU4g@nBu=LPDbBy2G= z0t5U}1`${2Dw<=f{VU;{x?X-}k-HI1!F_%c)9A$_Mn^2tmbvUyxMHM2=_K^?TDKh) zlejWwM+F6471RZ+jpC~sDrofnihL46V0)q~7{ZpoH_{}}%a@$V$}Kp2t2mb)8(Z;} zn&qe!Znf_Za1jFa-IrgMjcy7DQl*Z zpos=fyiD>+oT2T9@!b7&`)kZ;*M%*{c6z*zSfXUjL+_*1Rgq0NZRnbD)Fr653N;)s zSU`b!@&x_|Bz+hBT&%N5KPGLEe%vh0d1~@l*yS{!O-}5o%WM9&BEeHH%tJ+jmb=dZ zp1O>&;abJ+FP^%Z%E%ImWVjF8DZk^yFq$%r;_A~T^g;8~O=yP|%(5-j9dfzszOc7w zp?aByVAQe34Lv<>$FLhmYwy>vHZ9L+OGtLD zXk8g~9>&IqwCu!5AzErv^VOXg8lF+z%V%#NRFr~KJKJ`3jkU$ht(DCQim_i}N4!o4 
z!$GQ>+%uI_U-r)TNq3#G+@%^|6|XG=48VJ~)FdH^U5g_`n`2!Svo$~!n@we7Iy~jo zn-#d5?d2T1>wCoqJT2p-Qg{_{n8dbnP_yQb-s(CUu<9RrGeRRX@Y0#K`7kI|=1RK< zU_-Z(ew9buvo)udO)eu#PG!BRo=fY!EgR;w6(g`b{FZn+)x?W=w4<0bfXQ}Lux2Mt>Sj(c6Otww`0mq5cyK3$*Bwr?P^P7}^NH$vNv)DyJMfR+ z5m(k7vd<&wBBHS-=O`!vZ0Qb>B{>*EVEKVGtj5&f8`bR_C^ZjfCcEf0assGFp(`H~ zy|eT3xfDra>#mD|)$5<6I8`MG7S8$X@5YJ{Sk5CRoAS2Lbisqm>jRpb&hy(WzchU) zmXXl4d9T)uX-s6asr$-Bkl_d6rNvtj!nPX+i!f{B&EzH1Z-=XoM=yAqBkt|P>}EUG ztIn{`&db%@>fLwsIwc3cu5t&89MDTVqYm?_^3E-&8to_6aSt^Bl+NQl}kQ~5;mu1Li^(o-qhXVHL z2&u}@oQNA4XIA)?2G2yLdM0<7?J~bHyH1CdWx^R;qN_w&xxgwb#@UCe#Z|rezSeSjF#GONeZ_sW5=tzxSZN z?GD3)N>9I|uma{^YBqcVsczLF8lOYoW0-pdUqAa%8J*a3Ic%4@=xbATURD5V>&Nsg zJSNkQO$GD)NQ{SIcxsD9R^S=|px08Pm~tNovuWO0n?UkRmX7wyfa*!K_^vgZ5&4(j zn(EDtgRGNF?&kD2EACp)(t_^##!I5xXZFu*7Y9WZYxjsPcVa<2rL)GBo~~Wr_u)Q| zd{!5fYrYLfWUb|fmf4%f%3PvYXour$AX%x>9Tqbsb!#*h!$esjHJ=@~DguQV3*8JX zp1quKkg>VIz|7$Xen*eDx)^W5zrA>>bDy+vtZeXL20JzM20bz-K&B<+oGu`N@jcSW zN8~174Il=0rNn9qj3IifM0yXu2WMAX6=p`;1brmAVVLuAmXi`ZGWE65H0Bl0=*I4% z0eyi8#z@5+v9xFM^1ABa6=-kbv$JIE=`Tup$=+;9dW}i&n$id08kRo*)%HrZJn3j2 zpJ(k3?~tUz|^NH}@+nyC-dHw`sVuPY)wy&p1;@&h9WlGNx5%b@$Eb1UmkW zAY~Hz2VoI%piA_n0d==lhzqb7`$+mO`sIz3^fXCrcax0!9-XDUx%<#iMe%#VRl*j{ zz&4S!-f@&S1HW$m=^*Y!UH_hYV7o@l+IXI|p_x)+fsSFJU;}WHt^|sygEfFSY!Qy40COI665aOpX%4EP>s!DqwaQd+1Xy|`Nd~r+qaEAtxxQ7KnIb8%JtY$E`ZK<@K>GG z&E%Hb3iW0=sU5p;>Gb96#CbaS{;@HoyzfDQ&P3i}*84%fwXQXR#Vvc(qj!p3+o7N& zik3`vYIx$Hn}|uYWE! 
z;Y4SL@q4orQWbkhH@(|-d2*LygqI9$;~8Yi=c-ih&MrahXeT0vUx_E0o@cecb3Fe( zhNrT_^0s>$L`J&C(?M&g;&A94!Zf6o5-d@NU41eBJ({9xHq*vl(usvzr~ZfqRdb_l z{UH`&ba@G~F1;5eRt?7Gv4PW18HO5(7ZOl3}VmYtm5x=Y#8qw_|bJVD%FQ%h`j z_MKys&+3P)*88)t_ApsTo~};8tBoEU({WE+%JvZ8Za z&j72Lz<-;B9p`5+w`0mT%6yrm2-C>}71fiO8K9GsQOo4!JMG+`@NrewAuOrXyz!OfyL^OH3h!Dd)S zu%5Z3!x;@SRXyAKx`~PR0ftGl>q{G=bW#V)j8Ng(Ng~w-jhp5c)Q%JPF6%Z{jy-1y zCGXH{?>NkV_S{(8a?wmU*pe_iXg1Sgz0(JG2V~Amw0KL!hXY}A!>Wioz?_CUu5zc1 zBck-g`_zr!C4E-HoC<`W69OTN4ttU~qp5)+B-{!bI}x08j9A}6xXmOY3XF>1BJ~d+ zv0(&4s|N*_*;nlb(dGT0W2vnyDuR5kbs8Cm1c9wDd?A=S>UDM*iosmv8Q;;|32Lp_ z0e>_zHT9i_hG$^p?;)6wsC~=);bnVKM#Dip@nC@`_1|-0miJh=Y64E7{YRZTxZ)(4 z_#+w50;GNZ^T`W`U-rHc#@jrdw?^5B_;j={Ic)&c0i?z+xT?0&IH{Jd^jY}^9~-lq zj0@XjVWYKAzAK8=?aj{E8l(*C8MqL_9NBkW#DJPszZf7DnE1nhim|u83%5g<&Lk5& zl+lv#%qcisti?2<)emw7PLw<=hEv8v>c16w*YUtGq`a_c0<{`rqc0YLkFu7gWw?&n zO18GAI3jigWI)RTMn?}%H)a%~NXbaqM4;6R(A>PBMctUY3#l1oY~L z&TS;qg4Vm7I2d?QtEp9Iq-Z7D)?$o*G+5_YOgxT&rS|`oUbYI2)#9NVMvKA7-b)#aBhXW zn}N-8u|E_$Pu);*@`@;I(F@hE7(ig=mtaJmK<3X8pS6^l=x<%aG6~wUo&KmFg^{sx znBEE#KmP{INqO%`pkg>#=@F;J1K(WgWqT?09gwXfS`Hqp-l*v&^;+=(ySa?TTN`Pd zvCJdXFs>O}o~rY?f8R??t;9$!p;AmFd}`{&G7rKw1@6)jJ=&Ccq&zSb8NgfqImM{& zfh7?EJ#ftCUUJZ}v<>$q7w|m^v{{JEKNfJG3)# zBWCnmZ@awkJy7%K_q9l0%T-62k&2Z-_-cO3lL`RR$o4pipQa@P64(2T-*vU%UL=1N z_V$d)fOHwJxYy=(<_wwB+2{KXgX2lP3cQe`#LSY!VH*e$t_d9ng`uxm9=gWJh{s;%L>V15>8sL(^9t;KCr(ghJ3Dm1dv?eC9Uou@`xDg@ z%uT89tu(tCWKFPin_o)vk2QBD87T&TlOHxC?JSWR`l`lqYdCu1s+SjGi=q4z4VS&acj-a+@h}w)A0@|7-iLkT)^S9Es*82BL(IqtshIGt#Cpfj8$*8N-ra20 ziddT3kfDs1n}MN&85CvpQD8@53FHp(agR(9)|F~*RR~2qIL#;TvJiVV^G=1^UM#4 zvC$2ZSk?tnm-7wlBqw~110IS+=Lw5qbD71wHyJH<{x zWl9bgV#Xy-|0dHDov5VeZNAwoZA33eb{3Bt!i+G>Gb(779LDpxXGRa5*Bl2|GUodi zNOcZXSO#0IV~x>v=6W1`^SV+-Z0?Kn8x!I)Ud^K{KsfGCYP0Ud9+R`8n!MBwc59%N zGDx!r$~eGO)a0yiW?4D8+pQagbL$2k{=gofztSsH(O8PM!J+2ZfL%S50MmeWE@=c&)m2xFkzi4 zUB62h{x@^pas_CObuQrM{=;(tZXB;RKHMtF{>Ro>KvF=+xl8m0UU>3ft0`p|2yqYt zNs|3hs2iB%U)=aYWYV9<+Yg*hVplaEodHCzaL$|D_+uXcX4}GnKxnv2_m5REfdLb_ 
z$DT=Q_E$j7zlIh+pka@J!53or{~*_aP5`g*HuT%S=jHz{#*e`N3?up9JYb1EQWYxN z$sZK;5E=RfBLUI)(qsQx`M}rkfg*#OKrdJR&?h~@Jc6aIdM%HCb;wAbY^Pk!5bIGD z5|wM*b>8T;;!lEyMPGHH_3tk*OpGvh3%T*u&|@}5_F(g|abe8OQ?dq^=FcW8;fdwM zWkD>7IZ}B>sH3)e(;1J^FA(ApnzFRAo;upjorRjUx4Hk3uJN zI34cH!QD?#o>v(C(Ko}A2r{r=-Zt^~E@qRPh4z$h-i1ASHcDSiMc6{`bgcz1F@>Mk z1aZQ>jpI0-8f`_|g_M_PW2CcR)Qr8-FHNGeLqZYY_T8haCeh9XSJGkc)}AcB(*FWz zSdSEdpF=rRSUuGH7wj<@3YD9f29n!&!C`W|7dXylp;#0TfjdnFQ|Ln6j7A6Pz11<>oS!3t20oIPydULF-GoAbf> z-jB>D4}Fi9Y@=`#MqJLXI|s@S3+=RIScGRsA92%IND^66gI!td=!&>)bFV276wiB^ zwEh-t5vc{wQ&MM}<;r~*bDv3%18tsdR(_sus0U9*vq|IHhn|rtmH8OdE_9nALYDsG zpQ+J*^Uu=mVY!=H$_Z714#GZF(K@$uSczX{DJ37X_iU01tuRdk+{C+5P6zMWT@+oM zw|sJsT(e2n$SLaU<5!I(axK>RTSp{fUzQAd{bxRhVawggqVnlu zbbbliADOQ7i>7@&D~UlEZ}WF!%qxPAy%mjD*Z;~;d~6B(?DCp*V(ZPbUWnHS)t;XN z-Hz9@_*tMZr7(pVS~N{Tj?FXFaJSrPUH@~8c1TccSlK42wIFJ6@}g76TRx`AP|Bym z-gH2qR=!FKDIn)6wf)r;?vAxL{eC7iIh}du1O0vyP|4-9Zp`{q>?Y2;uWyhjDSnnj zI@Bg9EzmV767gQUv8gD8fAVib*h*nd`uo(2*f^QBl^{x9!UtIw0E6-8amL$kjDdW4zwmaI{sctVvzp8|Zw z7N7AJ{xjJ?m$Jn%yWDSkh+wkc#?PMe(rZJO$n<{47cWE3CYg4$-isPjYy1*J%ASme zF`Kbp;!2&0aqg)_?=src+JK|k#mOBA3I6L#{zm>}4f5QS^MH+$%S!Ze#OE^pFB`WR zM%_OGNkVdJSl14%bq8u38p5*)bYS^n%NOf!seRn&Yh#{r>y<{TF*go@)h2c$Y?r{S zOSE4LehSX+qGeb~TMe;d_?+R9<7#d8P|u>empZTK6}CooP$ox8Eme}&O@b&qoR9rw z#KZuzpG&!4Y1UTbuFHg+LgPH#Vdo{&2p4AW(y7_J4_`84!tjPue({Fa0U6wm*6&>6 zwF&$tq|7rVdlK>}xn%Uf{u`rIgNIp#KCJ4cmd2b-(Hi=nTvKW9@#JwP5xckoY@!!v z@u~-q`$kB}%(`fnPvso(OtkzT22JV^9u=*s85;S8=}>%hXq)?4p8|2tV78Q=hs-}z znRF#;*?%X_2U;=U`b`_~S~yAjV8Ir_F!t^ZB$G}}y>GW2=F}W_;Adof|q(_nR|AN>io(px;-J9Wb+Ew?q$k`F{1HTcA#QI?$#GYMPSFt@E<|xgtQA@n5 zkNL?lOek6Q;wn|TufUp=*N;G*s9`%8^xE-Gt z_;!%|8@U9RagbrGoE2h$9*Vu}34&8DE;gNXiishG1>Q`6xTfBWWVVB4ev&xPYf5f1 z%3ce*L$qA2P+B0RRwky*gAS={h&>T|HJlR}XwsfiNOm7k5(Lg5GIlb$9GAYmxs8x9 zmMsK05j7}t+j_?_p9KW1Eso}a26kf>xc3G1JsGZe*Pq!B5vJ&rgw>zFGTqN{xO0G- zTuxV&MQ;dnFKt_{Uk=M+Y33qWuA2e^w*NAQrY?M6dZL`a^K1jPvRyQoD<)gSar;|v zq_9n9Qthc#^KSlOey!)Tlu{$_Ja15Y=`r8>xuFRi^Um>76?u=3(~DDFM2t7XJTO5J 
zt&XJ6#ni-HhiwmmNOC@!h|1?v7{6-(h;tTyL>X#-8Q@G^);UcDV#1_d-xNZ_65kOJ zTw}(Ub1G(%S$-$%Xdx6^TI%PryrU&I7*~-HF(DC3?**iv-K@9&ZMviw`Ue%bhwprb z_m_XA;@KO@0DU@n=yow!apLT|`_TZ`s7z$L$29#b&+oG6?>s-V$F$e0|JoN$2MdcA)WY9_h8NCR&n;;W{t7@Lq38&c*8)MZz@|hj}PQ40R8wi z9JQ~?BrY_$@ohX^-jm1W%Yn^t>nV@-UOC%x&f4R<5idmI0a4q!Ec;tPp{)IvPU*I{ zzP#B9Aw>_>4mV}0X6sUj<@C*j3jZ7$)m zGA)NEz^1I#yHUes029_|2}c`BWpf2TWT0Oo_8(@GvNMC;JT~obmMhdkBOK!jVWvsiTDP8@ zR(5k=ferABT}3M_zS7;;YX^K#Ahl8%*sYdfDe|MNeGPQJ-;?iEugO?rWS-a9F;7}P z@&`Lp0I*1`6@v}$bZQ?X+&$FyT;kCi?Nk;3dJ^o9FnBe3_L z&1`M&N1?f{-s?SMjxR6yM9!g6ea$=Vvv#n@Qg4Fw%gS&fRtvykz?UFZ>J04q;0!-f z_QlYo^aj)?`6mc$)(TBkfOihM+xG5@7;oB(?#=gal{wOwioJ#O{G8CxHkB#U!GeIe z^5vlQ{<6LN!Bu8NvyJ;qoiM@^fW7B!Z9CdOek}&Nc8w%%g{k)g793S=GxSFJgnn)1 zS$%UBxvYX0H-4|>rpS;X$mYjj>#xPDE!ZSHw6M`be>%<|bxygavlZea;aG+`Z&r5t z-4izXSvnrM{obc`Q4)aT3y=17KrdggB2lFetcgrj=l+m2wY>TQYDw4j0UV%#EWpgf zx~+g>%$5h(sI6uJg@aW}S$ODzG(C_`q(WW$%dcRUC!O$FX-4DtZ<{1{o)~Bd-H|_V zE3s?#?SGRqY1zIxIKuXkN(K7DugK#EIEfuzEp8Y^P#5Op}}1O*0!l>!^IFS9vi;!rM%(M{_4fA=>6~ zjfX=pK%AF5=GDGSq6(9(W&c4US9zyqGMw)^RIyF#wr`+)!vNw^<0#i}#EZaRn`a8g zEi?juI}m>?%{Ni3Mhw|MHqY7VKOqgZ8mxTMBwNT3LIt3>bQFLI^7S?1qe!o~y*y7A zm-!Db8M#I#4nUfMkYt7&aJ@V_h&;E5pw2T_8mm4R%Fr1x?ms#D2E?RpTg2LH#D33| z5q&;!&U?LNQKTJXB`8F&JiZ2SHjrYAmm#kq{O}l8G>%$2#ai{W`ngB*dR4KS03Q=g zQbx+JM4k!ib4gg`r-euA!lFBIkj{jv-XwRUg8BLOCkwdr08loZD9n;( zbLj~zc*Vv@PY*Z9U6F2YP*_=FtawF0XuL6+KMGzrs+C^rUK}~UvGgLOs_psZFXURX zfBW)|q&DZ!Ida}<0;-3Bqg5#lHxAH~duofWwk1Na9K@sKE2oeA`&Qz|QJ+}HompnP zo3x%@1sts)0!+|3$=*w}087Nz^MWXBqg}(7K*&cb`j42IU)?@~g%)8;|tr(c51L)QkotJIV;)}LKyp$arlo#1%W)a2=F_=G;d;*a?I&! 
z3g}Q@G>DT+%37xg{3A>?l|TaVA=pfKd;bq2(eU_k)@d-`AX*a6`BXJ`KVzfyGo zL9^Zdrg7`{X^Ow-@9Y!E3jmLdcA#$|{%sxn*X6cA(M2<6nW9U7!ea^iquBxc?M4FW z=6_xEyB}a?piJSgg6gftpK;!QKl9hqQ*PjqR|<1~=|Ae?jrAJhFG1Bg%P z6XauO{_qh9P59Fd099*XWkoE1Iscyv=ASz--2f(E-Rt@vGmrl7;(rh6|Kej}>K6H8AMC5nycs*MJx<-F%7G558j>ld}770h3=0ME&#eXuC{3Tu)JIySou zoEiTKOrl=8p~sXzF~wa!<6*XSNv3owBU0w>XdC5VwtsBM_}C{7GN0UTmE@iAFAFgH zwH0fd5J=lafbW`r97c^#3aZ=jKbF&*I`sV@?1f57{*bI8BnD_#WimJbK@J{`)-aLj zRH@Gntr0veN*WaJA2H|9=9JLK#8`n#w5F{}fpAC}=bj|jZ{B)(^tyUyG7O_Dt4)AY zu2kKB+ep5WwY<0REURG}#o@Jgoi1c{53et?tt1l!ywXE)YSx)DIaKKg?q(;bj>!Jm7$ z_ZmrJ*VOjE%JU3cQHSD0e*>Zyq{`sWmc1ct7Rp0m2;I*rL+=S6;BgZ@05^GFT!@J? z&&*c6`F$BZ`pLP}CKN_lp`+t@p*WW8st+&V@A;{wWEWgDpx%Fh&&FV|C??2YC~#M< zvcs`}a1WPe;;s`c=M)T(MI~eN*3X%7R~*VMZBA5$v=y8VR9acV7IG&oMmnetM(F3y zA#|{7?J-#@RLL%lnz?P0zMpB$rBR7S--GS3Z)+xY%6qm4STXBz5(Sh_przI2Ti(_7 zY|uDc)pi;F#LxQ>Jw3^l1m>LKHZ4uSW|Fp=%#N38+|(_GBpxAD^du7pBtdIua&nF$ za;#5Qx2mSn>{p7X1dnf}CX=wsU&5obX#6=Yns8(DqazW&*5kMoA*1#>>HN_$HDo%K z6?Klq*Rzw~DX1)wS8`~@ObIsU>dl{2Zi^aGoYfiC9rjLJfrOBau820lv15An(A#k@ zyMd{5B?TOruRC9GKIS(8-RK{raUulVo;d~(S!J2DQh5f9JK+PJsPY2cOx*hD6G(q1huknUbojyQG2~djpygG17VJj?KcGOFfjK%hUO_ z!H=IN|BbOiC?ybddP(p2x=(FKufua#nNB&Hn)BtW#jF}L#b@XGL-cNImNY|I1p7sS zO1+v5_*jTmyb-lIeAS-umcz`@9Tt#DM3>7_LFxwQ1s4(=1WwSWwV%UkfYKCQ$qm5z zhL%39Iou3g;JfIF^;Px3q3 z*n(}v$W;{Zo)A36Ogoo)w50@%z2-0IB`hClh!Zt==zi;=QdGqF9i!?OrrYjr{dXF3 zeGBxF-p{T?jRz5v$iCMrx<7jupwHSg+El%|L`f-XjSat->)mc2hcxUDbaA!;!muJ$ zD5Lso(_T8cNU1pVG|3k`273|bW;r;sY`)3a)`q-$`9ZnMYh(kD#X81hw4b%ah2{nMHNgnVWXgun5CDYyU1ow zqbLyG`#Ztm%|S>{oCi?kFl1ME1e%XYHp~?DJf7XSdH6gkO0|6d;;}#|>@e1D+{Y{b zEZ@}rd+{ktXhKP`#$}Njc%dDV>-PCDoZ~OyM%a!kc8OVE1Z#f6FQt)$Ue*09>kozh zh5fRVp+`3T>EaxqY)XsjgZ^hb^{N_Zg5KK@GA4ZCQm6EYTWGU_aX{Q)%^C?Fg+v=vqM(crtkVLXdk)?Do0;?q_g)+PjKcO z@x`L6Y=ApBO(gcMqZPu;)3l=ZJ58dtSx(k0Z9?G9`HMMQ-zdO{(AxCA#XPb7;5pxQ zr)sSjknWB9;YjdB7U6xTia|v#J@0|j@1R{mm(77Z0403PAh8GRkqwVe*VKGSMo{1b zAX!OI3EFJl~R2l1%cKY@g?% zg{@^WAk_nN2X@R{ng?*1!okcU+Ome;6_8%l&ulc!#JA$ZA$X>ohUlmEQZ49d?D|U? 
zb{{nB1tX?v&gl}JDY*KtL4_;o6g#x@2;B()8vyyOniZBa>A=hcTOlqPKxqFSJtjPGC4dSBFwn< zdiH_&tU>g!L`*&UW2y`N?YuAEF&=xkyeC;<7$>{ z7X9O-e*h0YSL%19VP&K$wlcti=u16)^Q2^EN1c&Qik|&|A8neuZ~#=$Q9x9 zY#DJr+1}J(0o3e6_0qHcmkPr3)2^6Z^cUm%8R?Iup79=BHqDK<@p`Ig{P8k# zt>@5V8Q;@EZi-hc(%L#_gJTjk+P3XnR$AVnoo@^|E560>av~MY?17UgnxxN>Y8pNi zjiQMp1rQht5l4bP2%l87wkK~4o$5+_HCvyEy>S++s(vI{Ov`?vs)9!=^W7Q41q3O;_1C;03StVEpg9@Ms`h`k_}>W zeR4$o*JiqA@1eK|7?d6HbDEaxgcXCmXOCc|L1cwH zm+?A(@OJmWuSFY3w}9ro0xms>PIa!~;HwY8sbpQI^wRko#7{kEGIml{0dQgDJs5#E zK}I`tUc3N|RU&^*YA`$$dqq)Qhu+F=;ws-(mlTftOQO`1Ovg}oGwn)VRs17S5kqA6 z3%5p6N-x5OQWAEp^uG=<%1+*93uvooTh5kT_uc~NAS$X~F9y!l=$L`EQr#Ela&Kch zdtIT-@4fz+=K8z!x`Evft*=6stgo(&e<`tJ@{M`Du+t+xQlZn;81s2;NL!h*l%0wc zWUB-LOBv&Hi};M{G;`Ax<-!;9Tw3L}HA6$X_y<^7Ml@4j4w}PEyl?F1ZjX=pWaJ_+ z!XEd%4^3o%gY(w+W#)E~dGiy;5>1xfMp%soo6kN2)ZvcK&N^mX{E6y!zu%Gbr_>V~ zUR%i`c4jE11BP{PHQC`N_ETwxp5EOl(`(eSFS7gq>ryQgEL8}M8n*tqD6rDt{?94E zaQ$_yj`EEQa_={tNQM%KgQ=5>wE+P&*0Lf|m(n-3Yvhk@`hVqh_{Lv?&^4`gRMa&& zC779l;lyDS%}(!Q`odL+%j_@)J0Zh)BM3`|LxiPbDanI@VCl$-BKttqn>Vw6rCGa8 zL@G@a09nTJk$BrnjI-{#i0$etjRf?yDV)DhJw$w@M|fZR6ml`o_x`+`^wp$DD*_=a z1{$_RSVIuqhU^fD#HbBX!Wreenx48PoVR*=>rU*Fk%pRq?p(}s0f4)e!QS&8bDFw}i?oPNM=ScCck);0m2jLnDM!EE)3?4qnZd8| zGdJos`C@T&xan-^`*SQeHvsh{RVL|YWu&s6iZ-kNttrR1{p3gua7AjGoXznZ>k86FAK+d6~SfHD=(emuUykmtFXJX{n{E-n?Cef`sR5<$m zFPSitFntn85FVEwy&w_jJErvD}S<$I-poE1w%0$f42fOh|+~<}9ZN<|sb>`uuvM-_bH6`LINYfPl zqHsT1siSBy=^~j6mx8T4bbn&~FpHX$lZN#{@AOTZ=V;-BM6<}in`tDgSSvE}@wFZV2_F52l#Ab9hQ|2waL1-xJa;%IoD)% zwZ9O7AfsX-uqP`y`7q<6ax#cy(_|}{Paqka+;gMU->p`z^!PRXkBKpqSw!YSW4sE; zsrqNIx5oO9h&M(Xobi8=2E+c}-rTjm1~4Yft6mvYu7>OFM5R~Rn8&g{-NyhIHV;wv-~GaOyOv(Y(q}Oc#2YP(_ODz*d$J0shu7VdmM= zKf^~UnsTzD^LYJZf!Ofm^K~;{pKv{6&4Yl34yY`60UDVoF$Kds%vVk}f; z2HX6&yiqoHFed55t&^wh&Y11jq!@Zh2prAYpE;;%(R?^De68UhK4LksUeJl&ac=_| z2dSyYd@XUOCm0uznaN+|FGlGq38LZnM5-6m3S}g-vPvchb~YQ zp3Yce=ckh`I1v{64P48pgyXq%61a6lRlYP$qhyWX!p`3;5RDWiMBLG8t~anT&I3Kbed^+VHif 
zN->XU#`)XRxs>#@(UejP=*dp$nU8=>B+!d;ztQ;*&8A|;4F0U;3y&i|K|X%(uK`RJXf*e`JXd_zw->qYm=p}48PUQ$_M`} iu794XYFyxVM6^@l#Q#j{#&ZJT-^2SV_X_SBzxaQ|e5LdN literal 0 HcmV?d00001 diff --git a/materials/x18/lab/2/lab01/lab01.ipynb b/materials/x18/lab/2/lab01/lab01.ipynb index 99bdfd1..2f1baea 100644 --- a/materials/x18/lab/2/lab01/lab01.ipynb +++ b/materials/x18/lab/2/lab01/lab01.ipynb @@ -4,9 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Lab 1: Randomization, Iteration, and Probability\n", + "# Lab 1: Randomization\n", "\n", - "Welcome to Lab 1 of Data 8.2x! This week, we will go over conditionals and iteration, and introduce the concepts of randomness and probability. All of this material is covered in [Chapter 9](https://www.inferentialthinking.com/chapters/09/randomness.html) of the textbook. \n", + "Welcome to Lab 1 of Data 8.2x! \n", + "\n", + "We will go over [iteration and simulations](https://www.inferentialthinking.com/chapters/10/sampling-and-empirical-distributions.html), as well as introduce the concept of [randomness](https://www.inferentialthinking.com/chapters/09/randomness.html). \n", "\n", "First, set up the tests and imports by running the cell below." ] @@ -14,7 +16,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -34,15 +38,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In Python, Boolean values can either be `True` or `False`. We get Boolean values when using comparison operators, among which are `<` (less than), `>` (greater than), and `==` (equal to). For a complete list, refer to [Booleans and Comparison](https://www.inferentialthinking.com/chapters/09/randomness.html#Booleans-and-Comparison) at the start of Chapter 9.\n", + "In Python, Boolean values can either be `True` or `False`. We get Boolean values when using comparison operators such as `<` (less than), `>` (greater than), and `==` (equal to). 
A list of common comparison operators can be found below!\n", "\n", - "Run the cell below to see an example of a comparison operator in action. Three is indeed larger than one plus one." + "" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "3 > 1 + 1" @@ -58,7 +64,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "result = 10 / 2 == 5\n", @@ -75,7 +83,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "make_array(1, 5, 7, 8, 3, -1) > 3" @@ -93,7 +103,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "nachos = make_array('cheese', 'salsa', 'both', 'neither')\n", @@ -112,7 +124,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "ten_nachos = make_array('neither', 'cheese', 'both', 'both', 'cheese', 'salsa', 'both', 'neither', 'cheese', 'both')\n", @@ -123,7 +137,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_1.py')" @@ -176,6 +192,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -191,7 +208,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_2.py')" @@ -208,6 +227,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -229,7 +249,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, 
"outputs": [], "source": [ "check('tests/q1_3.py')" @@ -248,6 +270,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -260,7 +283,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_4.py')" @@ -276,113 +301,77 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "number_wow_reactions = ...\n", "number_wow_reactions" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "check('tests/q1_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.6**
Change just the comparison operators from `==` to some other operators so that `should_be_true` is `True`." - ] - }, { "cell_type": "code", "execution_count": null, "metadata": { - "for_assignment_type": "student" + "collapsed": true }, "outputs": [], "source": [ - "should_be_true = number_cheese == number_wow_reactions == np.count_nonzero(ten_nachos == 'neither')\n", - "should_be_true" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "check('tests/q1_6.py')" + "check('tests/q1_5.py')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Question 1.7**
Complete the function `both_or_neither`, which takes in a table of nachos with reactions (just like the one from Question 4) and returns `'Wow!'` if there are more nachos with both cheese and salsa, or `'Meh.'` if there are more nachos with neither. If there are an equal number of each, return `'Okay!'`." + "## 2. Simulations and For Loops\n", + "Using a `for` statement, we can perform a task multiple times. This is known as iteration. Here, we'll simulate drawing different suits from a deck of cards. " ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "for_assignment_type": "student" + "collapsed": true }, "outputs": [], "source": [ - "def both_or_neither(nacho_table):\n", - " reactions = ...\n", - " number_wow_reactions = ...\n", - " number_meh_reactions = ...\n", - " if ...:\n", - " return 'Wow!'\n", - " # next condition should return 'Meh.'\n", - " ...\n", - " # next condition should return 'Okay!'\n", - " ...\n", + "suits = make_array(\"♤\", \"♡\", \"♢\", \"♧\")\n", "\n", - "many_nachos = Table().with_column('Nachos', np.random.choice(nachos, 250))\n", - "many_nachos = many_nachos.with_column('Reactions', many_nachos.apply(nacho_reaction, 'Nachos'))\n", - "result = both_or_neither(many_nachos)\n", - "result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "check('tests/q1_7.py')" + "draws = make_array()\n", + "\n", + "repetitions = 6\n", + "\n", + "for i in np.arange(repetitions):\n", + " draws = np.append(draws, np.random.choice(suits))\n", + "\n", + "draws" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Iteration and Sampling\n", - "Using a `for` statement, we can perform a task multiple times. This is known as iteration. Here, we'll simulate drawing different suits from a deck of cards. " + "The unrolled version of this `for` loop can be found below." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ - "suits = make_array(\"♤\", \"♡\", \"♢\", \"♧\")\n", - "\n", "draws = make_array()\n", "\n", - "repetitions = 6\n", - "\n", - "for i in np.arange(repetitions):\n", - " draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", + "draws = np.append(draws, np.random.choice(suits))\n", "\n", "draws" ] @@ -393,7 +382,7 @@ "source": [ "In the example above, the `for` loop appends a random draw to the `draws` array for every number in `np.arange(repetitions)`. \n", "\n", - "A nice way to think about what we did above, was we had a deck of 4 cards of different suits. We randomly drew one card, saw the suit, kept track of it in `draws`, and put the card back into the deck. We repeated this for a total of 6 times without having to repeat code, thanks to the for loop. We simulated thie experiment using a for loop. \n", + "Here's a nice way to think of what we did above. We had a deck of 4 cards of different suits, we randomly drew one card, saw the suit, kept track of it in `draws`, and put the card back into the deck. We repeated this for a total of 6 times without having to repeat code, thanks to the `for` loop. We simulated this experiment using a `for` loop. \n", "\n", "Another use of iteration is to loop through a set of values. 
For instance, we can print out all of the colors of the rainbow.\n" ] @@ -401,7 +390,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "rainbow = make_array(\"red\", \"orange\", \"yellow\", \"green\", \"blue\", \"indigo\", \"violet\")\n", @@ -420,7 +411,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "for another_name in rainbow:\n", @@ -450,6 +443,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -464,7 +458,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_1.py')" @@ -474,33 +470,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Question 2.2**
What is the average point value of a dart thrown by Clay?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "average_score = ...\n", - "average_score" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "check('tests/q2_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.3**
In the following cell, we've loaded the text of _Pride and Prejudice_ by Jane Austen, split it into individual words, and stored these words in an array. Using a `for` loop, assign `longer_than_five` to the number of words in the novel that are more than 5 letters long.\n", + "**Question 2.2**
In the following cell, we've loaded the text of _Pride and Prejudice_ by Jane Austen, split it into individual words, and stored these words in an array. Using a `for` loop, assign `longer_than_five` to the number of words in the novel that are more than 5 letters long.\n", "\n", "*Hint*: You can find the number of letters in a word with the `len` function." ] @@ -509,6 +479,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -517,24 +488,29 @@ "p_and_p_words = np.array(austen_string.split())\n", "\n", "longer_than_five = ...\n", - " \n", + "\n", + "# a for loop would be useful here\n", + "\n", + "\n", "longer_than_five" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ - "check('tests/q2_3.py')" + "check('tests/q2_2.py')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Question 2.4**
Using simulation with 10,000 trials, assign `chance_of_all_different` to an estimate of the chance that if you pick three words from Pride and Prejudice uniformly at random (with replacement), they all have different lengths. \n", + "**Question 2.3**
Using simulation with 10,000 trials, assign `chance_of_all_different` to an estimate of the chance that if you pick three words from Pride and Prejudice uniformly at random (with replacement), they all have different lengths. \n", "\n", "*Hint*: Remember that `!=` only checks for non-equality between two items, not three. However, you can use `!=` more than once in the same line. \n", "\n", @@ -545,6 +521,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -560,47 +537,15 @@ "chance_of_all_different" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "check('tests/q2_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 2.5
** Quincy is drafting Basketball Players for his NBA Fantasy League. He chooses 10 times randomly from a list of players, and drafts the player regardless of whether the player has been chosen before (You could have 10 Kevin Durant's on a team!). Count how many times John Wall is chosen in a version of Quincy's draft." - ] - }, { "cell_type": "code", "execution_count": null, "metadata": { - "for_assignment_type": "student" + "collapsed": true }, "outputs": [], "source": [ - "players = [\"John Wall\", \"Steph Curry\", \"Kevin Durant\", \"Jimmy Butler\", \"Russell Westbrook\"]\n", - "draft_picks = ...\n", - "num_wall = ...\n", - "\n", - "for ... in ...:\n", - " ...\n", - "\n", - "num_wall" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "check('tests/q2_5.py')" + "check('tests/q2_3.py')" ] }, { @@ -621,7 +566,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pizza_prob = ..." @@ -630,7 +577,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_1.py')" @@ -646,7 +595,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "all_prob = ..." @@ -655,7 +606,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_2.py')" @@ -671,7 +624,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "something_is_out = ..." 
@@ -680,7 +635,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_3.py')" @@ -703,7 +660,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "winning_prob = ..." @@ -712,7 +671,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_4.py')" @@ -728,7 +689,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For your convenience, you can run this cell to run all the tests at once!\n", @@ -756,10 +719,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/2/lab01/tests/q1_6.py b/materials/x18/lab/2/lab01/tests/q1_6.py deleted file mode 100644 index 6f05f06..0000000 --- a/materials/x18/lab/2/lab01/tests/q1_6.py +++ /dev/null @@ -1,22 +0,0 @@ -test = { - 'name': '', - 'points': 1, - 'suites': [ - { - 'cases': [ - { - 'code': r""" - >>> should_be_true - True - """, - 'hidden': False, - 'locked': False - }, - ], - 'scored': True, - 'setup': '', - 'teardown': '', - 'type': 'doctest' - } - ] -} diff --git a/materials/x18/lab/2/lab01/tests/q1_7.py b/materials/x18/lab/2/lab01/tests/q1_7.py deleted file mode 100644 index c1dff07..0000000 --- a/materials/x18/lab/2/lab01/tests/q1_7.py +++ /dev/null @@ -1,44 +0,0 @@ -test = { - 'name': '', - 'points': 1, - 'suites': [ - { - 'cases': [ - { - 'code': r""" - >>> (result == 'Wow!') or (result == 'Meh.') or (result == 'Okay!') - True - """, - 'hidden': False, - 'locked': False - }, - { - 'code': r""" - >>> ten_nachos = make_array('neither', 'cheese', 'both', 'both', 'cheese', 'salsa', 'both', 'neither', 'cheese', 
'both') - >>> ten_nachos_reactions = Table().with_column('Nachos', ten_nachos) - >>> ten_nachos_reactions = ten_nachos_reactions.with_column('Reactions', ten_nachos_reactions.apply(nacho_reaction, 'Nachos')) - >>> both_or_neither(ten_nachos_reactions) - 'Wow!' - """, - 'hidden': False, - 'locked': False - }, - { - 'code': r""" - >>> seven_nachos = make_array('neither', 'cheese', 'both', 'both', 'neither', 'both', 'neither') - >>> seven_nachos_reactions = Table().with_column('Nachos', seven_nachos) - >>> seven_nachos_reactions = seven_nachos_reactions.with_column('Reactions', seven_nachos_reactions.apply(nacho_reaction, 'Nachos')) - >>> both_or_neither(seven_nachos_reactions) - 'Okay!' - """, - 'hidden': False, - 'locked': False - }, - ], - 'scored': True, - 'setup': '', - 'teardown': '', - 'type': 'doctest' - } - ] -} diff --git a/materials/x18/lab/2/lab01/tests/q2_2.py b/materials/x18/lab/2/lab01/tests/q2_2.py index 287fc0d..2d10ec0 100644 --- a/materials/x18/lab/2/lab01/tests/q2_2.py +++ b/materials/x18/lab/2/lab01/tests/q2_2.py @@ -6,7 +6,7 @@ 'cases': [ { 'code': r""" - >>> 1 <= average_score <= 10 + >>> longer_than_five == 35453 True """, 'hidden': False, diff --git a/materials/x18/lab/2/lab01/tests/q2_3.py b/materials/x18/lab/2/lab01/tests/q2_3.py index 2d10ec0..5466085 100644 --- a/materials/x18/lab/2/lab01/tests/q2_3.py +++ b/materials/x18/lab/2/lab01/tests/q2_3.py @@ -6,7 +6,7 @@ 'cases': [ { 'code': r""" - >>> longer_than_five == 35453 + >>> 0.58 <= chance_of_all_different <= 0.68 True """, 'hidden': False, diff --git a/materials/x18/lab/2/lab01/tests/q2_4.py b/materials/x18/lab/2/lab01/tests/q2_4.py deleted file mode 100644 index 5466085..0000000 --- a/materials/x18/lab/2/lab01/tests/q2_4.py +++ /dev/null @@ -1,22 +0,0 @@ -test = { - 'name': '', - 'points': 1, - 'suites': [ - { - 'cases': [ - { - 'code': r""" - >>> 0.58 <= chance_of_all_different <= 0.68 - True - """, - 'hidden': False, - 'locked': False - }, - ], - 'scored': True, - 'setup': '', - 
'teardown': '', - 'type': 'doctest' - } - ] -} diff --git a/materials/x18/lab/2/lab01/tests/q2_5.py b/materials/x18/lab/2/lab01/tests/q2_5.py deleted file mode 100644 index 490a27e..0000000 --- a/materials/x18/lab/2/lab01/tests/q2_5.py +++ /dev/null @@ -1,22 +0,0 @@ -test = { - 'name': '', - 'points': 1, - 'suites': [ - { - 'cases': [ - { - 'code': r""" - >>> 0 <= num_wall <= 10 - True - """, - 'hidden': False, - 'locked': False - }, - ], - 'scored': True, - 'setup': '', - 'teardown': '', - 'type': 'doctest' - } - ] -} \ No newline at end of file diff --git a/materials/x18/lab/2/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb b/materials/x18/lab/2/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb new file mode 100644 index 0000000..c1f5560 --- /dev/null +++ b/materials/x18/lab/2/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb @@ -0,0 +1,664 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 2: Sampling\n", + "\n", + "Welcome to Lab 2! In this lab, we will learn about sampling strategies. More information about sampling in the textbook can be found [here!](https://www.inferentialthinking.com/chapters/10/sampling-and-empirical-distributions.html)\n", + "\n", + "The data used in this lab will contain salary data and statistics for basketball players from the 2014-2015 NBA season. This data was collected from [basketball-reference](http://www.basketball-reference.com) and [spotrac](http://www.spotrac.com)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell, but please don't change it.\n", + "\n", + "# These lines import the Numpy and Datascience modules.\n", + "import numpy as np\n", + "from datascience import *\n", + "\n", + "# These lines do some fancy plotting magic\n", + "import matplotlib\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plots\n", + "plots.style.use('fivethirtyeight')\n", + "\n", + "# Don't change this cell; just run it. \n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Dungeons and Dragons and Sampling\n", + "In the game Dungeons & Dragons, each player plays the role of a fantasy character.\n", + "\n", + "A player performs actions by rolling a 20-sided die, adding a \"modifier\" number to the roll, and comparing the total to a threshold for success. The modifier depends on her character's competence in performing the action.\n", + "\n", + "For example, suppose Alice's character, a barbarian warrior named Roga, is trying to knock down a heavy door. She rolls a 20-sided die, adds a modifier of 11 to the result (because her character is good at knocking down doors), and succeeds if the total is greater than 15.\n", + "\n", + "** Question 1.1 **
Write code that simulates that procedure. Compute three values: the result of Alice's roll (`roll_result`), the result of her roll plus Roga's modifier (`modified_result`), and a boolean value indicating whether the action succeeded (`action_succeeded`). **Do not fill in any of the results manually**; the entire simulation should happen in code.\n", + "\n", + "*Hint:* A roll of a 20-sided die is a number chosen uniformly from the array `make_array(1, 2, 3, 4, ..., 20)`. So a roll of a 20-sided die *plus 11* is a number chosen uniformly from that array, plus 11." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "possible_rolls = ...\n", + "roll_result = ...\n", + "modified_result = ...\n", + "action_succeeded = ...\n", + "\n", + "# The next line just prints out your results in a nice way\n", + "# once you're done. You can delete it if you want.\n", + "print(\"On a modified roll of {:d}, Alice's action {}.\".format(modified_result, \"succeeded\" if action_succeeded else \"failed\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.2 **
Run your cell 7 times to manually estimate the chance that Alice succeeds at this action. (Don't use math or an extended simulation.) Your answer should be a fraction. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "rough_success_chance = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose we don't know that Roga has a modifier of 11 for this action. Instead, we observe the modified roll (that is, the die roll plus the modifier of 11) from each of 7 of her attempts to knock down doors. We would like to estimate her modifier from these 7 numbers.\n", + "\n", + "** Question 1.3 **
Write a Python function called `simulate_observations`. It should take no arguments, and it should return an array of 7 numbers. Each of the numbers should be the modified roll from one simulation. **Then**, call your function once to compute an array of 7 simulated modified rolls. Name that array `observations`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "modifier = 11\n", + "num_observations = 7\n", + "\n", + "def simulate_observations():\n", + " \"\"\"Produces an array of 7 simulated modified die rolls\"\"\"\n", + " ...\n", + "\n", + "observations = ...\n", + "observations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.4 **
Draw a histogram to display the *probability distribution* of the modified rolls we might see.\n", + "\n", + "Question 1.4 does not have an autograder test, so it is not graded and not in the overall lab grade." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# We suggest using these bins.\n", + "roll_bins = np.arange(1, modifier+2+20, 1)\n", + "\n", + "...\n", + "np.arange(1+modifier, 20+modifier+1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Your histogram should have values 12 to 31 each with a probability of 5%.\n", + "\n", + "Now let's imagine we don't know the modifier and try to estimate it from `observations`.\n", + "\n", + "One straightforward (but clearly suboptimal) way to do that is to find the *smallest* total roll, since the smallest roll on a 20-sided die is 1.\n", + "\n", + "** Question 1.5 **
Using that method, estimate `modifier` from `observations`. Name your estimate `min_estimate`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "min_estimate = ...\n", + "min_estimate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another way to estimate the modifier involves the mean of `observations`.\n", + "\n", + "** Question 1.6 **
Figure out a good estimate based on that quantity. \n", + "\n", + "**Then**, write a function named `mean_based_estimator` that computes your estimate. It should take an array of modified rolls (like the array `observations`) as its argument and return an estimate of `modifier` based on those numbers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def mean_based_estimator(nums):\n", + " \"\"\"Estimate the roll modifier based on observed modified rolls in the array nums.\"\"\"\n", + " ...\n", + "\n", + "# Here is an example call to your function. It computes an estimate\n", + "# of the modifier from our 7 observations.\n", + "mean_based_estimate = mean_based_estimator(observations)\n", + "mean_based_estimate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_6.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Sampling Basketball Data\n", + "\n", + "Run the cell below to load the player and salary data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "player_data = Table().read_table(\"player_data.csv\")\n", + "salary_data = Table().read_table(\"salary_data.csv\")\n", + "full_data = salary_data.join(\"PlayerName\", player_data, \"Name\")\n", + "# The show method immediately displays the contents of a table. \n", + "# This way, we can display the top of two tables using a single cell.\n", + "player_data.show(3)\n", + "salary_data.show(3)\n", + "full_data.show(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rather than getting data on every player, imagine that we had gotten data on only a smaller subset of the players. 
For 492 players, it's not so unreasonable to expect to see all the data, but usually we aren't so lucky. Instead, we often make *statistical inferences* about a large underlying population using a smaller sample.\n", + "\n", + "A statistical inference is a statement about some statistic of the underlying population, such as \"the average salary of NBA players in 2014 was $3\". You may have heard the word \"inference\" used in other contexts. It's important to keep in mind that statistical inferences, unlike, say, logical inferences, can be wrong.\n", + "\n", + "A general strategy for inference using samples is to estimate statistics of the population by computing the same statistics on a sample. This strategy sometimes works well and sometimes doesn't. The degree to which it gives us useful answers depends on several factors, and we'll touch lightly on a few of those today.\n", + "\n", + "One very important factor in the utility of samples is how they were gathered. We have prepared some example sample datasets to simulate inference from different kinds of samples for the NBA player dataset. Later we'll ask you to create your own samples to see how they behave." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save typing and increase the clarity of your code, we will package the loading and analysis code into two functions. This will be useful in the rest of the lab as we will repeatedly need to create histograms and collect summary statistics from that data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.1**.
Complete the `histograms` function, which takes a table with columns `Age` and `Salary` and draws a histogram for each one. Use the min and max functions to pick the bin boundaries so that all data appears for any table passed to your function. Use the same bin widths as before (1 year for `Age` and $1,000,000 for `Salary`).\n", + "\n", + "*Hint*: When creating the bins for the histograms, think critically about what the stop argument should be for `np.arange`. Histograms are inclusive on the left hand side of the interval, but not the right. So, if we have a maximum age of 80, we need an 80-81 bin in order to capture this in the histogram. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def histograms(t):\n", + " ages = t.column('Age')\n", + " salaries = t.column('Salary')\n", + " age_bins = ...\n", + " salary_bins = ...\n", + " t.hist('Age', bins=age_bins, unit='year')\n", + " t.hist('Salary', bins=salary_bins, unit='$')\n", + " return age_bins # Keep this statement so that your work can be checked\n", + " \n", + "histograms(full_data)\n", + "print('Two histograms should be displayed below')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_1.py') # Warning: Charts will be displayed while running this test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.2**.
Create a function called `compute_statistics` that takes a Table containing ages and salaries and:\n", + "- Draws a histogram of ages\n", + "- Draws a histogram of salaries\n", + "- Returns a two-element array containing the average age and average salary\n", + "\n", + "You can call your `histograms` function to draw the histograms!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def compute_statistics(age_and_salary_data):\n", + " ...\n", + " age = ...\n", + " salary = ...\n", + " ...\n", + " \n", + "\n", + "full_stats = compute_statistics(full_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_2.py') # Warning: Charts will be displayed while running this test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convenience sampling\n", + "One sampling methodology, which is **generally a bad idea**, is to choose players who are somehow convenient to sample. For example, you might choose players from one team that's near your house, since it's easier to survey them. This is called, somewhat pejoratively, *convenience sampling*.\n", + "\n", + "Suppose you survey only *relatively new* players with ages less than 22. (The more experienced players didn't bother to answer your surveys about their salaries.)\n", + "\n", + "**Question 2.3**
Assign `convenience_sample` to a subset of `full_data` that contains only the rows for players under the age of 22." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "convenience_sample = ...\n", + "convenience_sample" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.4**
Assign `convenience_stats` to a list of the average age and average salary of your convenience sample, using the `compute_statistics` function. Since they're computed on a sample, these are called *sample averages*. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "convenience_stats = ...\n", + "convenience_stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll compare the convenience sample salaries with the full data salaries in a single histogram. To do that, we'll need to use the `bin_column` option of the `hist` method, which indicates that all columns are counts of the bins in a particular column. The following cell should not require any changes; just run it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def compare_salaries(first, second, first_title, second_title):\n", + " \"\"\"Compare the salaries in two tables.\"\"\"\n", + " max_salary = max(np.append(first.column('Salary'), second.column('Salary')))\n", + " bins = np.arange(0, max_salary+1e6+1, 1e6)\n", + " first_binned = first.bin('Salary', bins=bins).relabeled(1, first_title)\n", + " second_binned = second.bin('Salary', bins=bins).relabeled(1, second_title)\n", + " first_binned.join('bin', second_binned).hist(bin_column='bin')\n", + "\n", + "compare_salaries(full_data, convenience_sample, 'All Players', 'Convenience Sample')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Simple random sampling\n", + "A more principled approach is to sample uniformly at random from the players. 
If we ensure that each player is selected at most once, this is a *simple random sample without replacement*, sometimes abbreviated to \"simple random sample\" or \"SRSWOR\". Imagine writing down each player's name on a card, putting the cards in an urn, and shuffling the urn. Then, pull out cards one by one and set them aside, stopping when the specified *sample size* is reached.\n", + "\n", + "We've produced two samples of the `salary_data` table in this way: `small_srswor_salary.csv` and `large_srswor_salary.csv` contain, respectively, a sample of size 44 (the same as the convenience sample) and a larger sample of size 100. \n", + "\n", + "The `load_data` function below loads a salary table and joins it with `player_data`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def load_data(salary_file):\n", + " return player_data.join('Name', Table.read_table(salary_file), 'PlayerName')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.5**
Run the same analyses on the small and large samples that you previously ran on the full dataset and on the convenience sample. Compare the accuracy of the estimates of the population statistics that we get from the convenience sample, the small simple random sample, and the large simple random sample. (Just notice this for yourself -- the autograder will check your sample statistics but will not validate whatever you do to compare.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Original:\n", + "small_srswor_data = ...\n", + "small_stats = ...\n", + "large_srswor_data = ...\n", + "large_stats = ...\n", + "print('Full data stats: ', full_stats)\n", + "print('Small simple random sample stats:', small_stats)\n", + "print('Large simple random sample stats:', large_stats)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "check('tests/q2_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Producing simple random samples\n", + "Often it's useful to take random samples even when we have a larger dataset available. One reason is to help us understand how inaccurate other samples are.\n", + "\n", + "Tables provide the method `sample()` for producing random samples. Note that its default is to sample with replacement. To see how to call `sample()`, search the [datascience documentation](http://data8.org/datascience/), or enter `full_data.sample?` into a code cell and press Shift + Enter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.6**
\n", + "Produce a simple random sample of size 44 from `full_data`. (You don't need to bother with a join this time -- just use `full_data.sample(...)` directly. That will have the same result as sampling from `salary_data` and joining with `player_data`.) Run your analysis on it again and think about the following questions.\n", + "- Are your results roughly similar to those in the small sample we provided you? Run your code several times to get new samples. \n", + "- How much does the average age change across samples? \n", + "- What about average salary?\n", + "\n", + "Question 2.6 does not have an autograder test, so it is not graded and not in the overall lab grade." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "my_small_srswor_data = ...\n", + "my_small_stats = ...\n", + "my_small_stats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the results are similar, but not identical, to those from the sample we were given. The average age tends to stay around the same value as there is a limited range of ages for NBA players, but the salary changes by a sizeable factor due to larger variability in salary." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.7**
As in the previous question, analyze several simple random samples of size 100 from `full_data`. \n", + "- Do the histogram statistics seem to change more or less across samples of 100 than across samples of size 44? \n", + "- Are the sample averages and histograms closer to their true values for age or for salary? What did you expect to see?\n", + "\n", + "Question 2.7 does not have an autograder test, so it is not graded and not in the overall lab grade." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "my_large_srswor_data = ...\n", + "..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The average and histogram statistics seem to change less across samples of this size. They are closer to their true values, which is what we'd expect to see because we are sampling a larger subset of the population. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You're finished with lab 2! In order to successfully submit your assignment, follow these steps...\n", + "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. Please do this first before running the cell below,\n", + "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", + "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab02.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/2/lab02/lab02.ipynb b/materials/x18/lab/2/lab02/lab02.ipynb index c61b274..c1f5560 100644 --- a/materials/x18/lab/2/lab02/lab02.ipynb +++ b/materials/x18/lab/2/lab02/lab02.ipynb @@ -14,7 +14,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Run this cell, but please don't change it.\n", @@ -53,6 +55,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -70,7 +73,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_1.py')" @@ -86,7 +91,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "rough_success_chance = ..." 
@@ -95,7 +102,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_2.py')" @@ -114,6 +123,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -132,7 +142,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_3.py')" @@ -150,7 +162,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# We suggest using these bins.\n", @@ -176,7 +190,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "min_estimate = ...\n", @@ -186,7 +202,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_5.py')" @@ -206,7 +224,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mean_based_estimator(nums):\n", @@ -222,7 +242,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_6.py')" @@ -232,7 +254,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Sampling\n", + "## 2. Sampling Basketball Data\n", "\n", "Run the cell below to load the player and salary data." ] @@ -240,7 +262,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "player_data = Table().read_table(\"player_data.csv\")\n", @@ -277,13 +301,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Question 2.1**.
Complete the `histograms` function, which takes a table with columns `Age` and `Salary` and draws a histogram for each one. Use the min and max functions to pick the bin boundaries so that all data appears for any table passed to your function. Use the same bin widths as before (1 year for `Age` and $1,000,000 for `Salary`)." + "**Question 2.1**.
Complete the `histograms` function, which takes a table with columns `Age` and `Salary` and draws a histogram for each one. Use the min and max functions to pick the bin boundaries so that all data appears for any table passed to your function. Use the same bin widths as before (1 year for `Age` and $1,000,000 for `Salary`).\n", + "\n", + "*Hint*: When creating the bins for the histograms, think critically about what the stop argument should be for `np.arange`. Histograms are inclusive on the left hand side of the interval, but not the right. So, if we have a maximum age of 80, we need an 80-81 bin in order to capture this in the histogram. " ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def histograms(t):\n", @@ -302,7 +330,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_1.py') # Warning: Charts will be displayed while running this test" @@ -323,7 +353,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def compute_statistics(age_and_salary_data):\n", @@ -339,7 +371,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_2.py') # Warning: Charts will be displayed while running this test" @@ -360,7 +394,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "convenience_sample = ...\n", @@ -370,7 +406,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_3.py')" @@ -386,7 +424,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": 
[], "source": [ "convenience_stats = ...\n", @@ -396,7 +436,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_4.py')" @@ -412,7 +454,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def compare_salaries(first, second, first_title, second_title):\n", @@ -441,7 +485,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def load_data(salary_file):\n", @@ -458,7 +504,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Original:\n", @@ -475,6 +523,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -487,9 +536,9 @@ "metadata": {}, "source": [ "### Producing simple random samples\n", - "Often it's useful to take random samples even when we have a larger dataset available. The randomized response technique was one example we saw in lecture. Another is to help us understand how inaccurate other samples are.\n", + "Often it's useful to take random samples even when we have a larger dataset available. One reason is to help us understand how inaccurate other samples are.\n", "\n", - "Tables provide the method `sample()` for producing random samples. Note that its default is to sample with replacement. To see how to call `sample()`, search the documentation on the [resources page](http://data8.org/su17/resources.html) of the course website, or enter `full_data.sample?` into a code cell and press Shift + Enter." + "Tables provide the method `sample()` for producing random samples. Note that its default is to sample with replacement. 
To see how to call `sample()`, search the [datascience documentation](http://data8.org/datascience/), or enter `full_data.sample?` into a code cell and press Shift + Enter." ] }, { @@ -509,6 +558,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -540,6 +590,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -575,7 +626,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For your convenience, you can run this cell to run all the tests at once!\n", @@ -589,9 +642,9 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "data8x", + "display_name": "Python 3", "language": "python", - "name": "data8x" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -603,10 +656,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/2/lab05/.ipynb_checkpoints/lab05-checkpoint.ipynb b/materials/x18/lab/2/lab05/.ipynb_checkpoints/lab05-checkpoint.ipynb new file mode 100644 index 0000000..42d7cd6 --- /dev/null +++ b/materials/x18/lab/2/lab05/.ipynb_checkpoints/lab05-checkpoint.ipynb @@ -0,0 +1,667 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 5: Resampling and the Bootstrap\n", + "\n", + "Welcome to Lab 5!\n", + "\n", + "We will attempt to estimate the number `N`, a *population parameter*, that represents the number of elements in a population. We get to observe a uniform random sample of the elements, and for each one we can observe its serial number (from 1 to `N`). All elements are labeled with consecutive serial numbers from 1 to `N`, so `N` is the total number of elements. 
\n", + "\n", + "Given *just* a random sample of elements, we'll estimate `N`, and then we'll use simulation to find a confidence interval around our estimate, all without ever looking at the whole population. This is an example of *statistical inference*.\n", + "\n", + "As usual, **run the cell below** to prepare the lab and the automatic tests." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell to set up the notebook, but please don't change it.\n", + "\n", + "# These lines import the Numpy and Datascience modules.\n", + "import numpy as np\n", + "from datascience import *\n", + "\n", + "# These lines do some fancy plotting magic.\n", + "import matplotlib\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "plt.style.use('fivethirtyeight')\n", + "import warnings\n", + "warnings.simplefilter('ignore', UserWarning)\n", + "\n", + "# Don't change this cell; just run it. \n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Preliminaries\n", + "The setup: We want to know the number of elements in the population. That number is `N`. Each element is numbered from 1 to `N`.\n", + "\n", + "We only see a small number of elements (assumed to be a uniform random sample with replacement from among all the elements), so we have to use estimation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 1.1\n", + "Is `N` a population parameter or a statistic? If we compute a number using our random sample that's an estimate of `N`, is that a population parameter or a statistic?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Write your answer here, replacing this text.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check your answer by posting on the discussion forum.\n", + "\n", + "To make the situation realistic, we're going to hide the true number of elements from you. You'll have access only to this random sample:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "observations = Table.read_table(\"serial_numbers.csv\")\n", + "num_observations = observations.num_rows\n", + "observations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 1.2\n", + "Define a function named `plot_serial_numbers` to make a histogram of any table of serial numbers. It should take one argument, a table like `observations` with one column called `\"serial number\"`. It should plot a histogram of the values in the column **using bins of width 1** ranging from **1 to 200** but return nothing. Then, call that function to make a histogram of `observations`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def plot_serial_numbers(numbers):\n", + " ...\n", + " \n", + " # Assuming the lines above produce a histogram, this next\n", + " # line may make your histograms look nicer. Feel free to\n", + " # delete it if you want.\n", + " plt.ylim(0, .25)\n", + "\n", + "plot_serial_numbers(observations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 1.3\n", + "By looking at the histogram, what can we say about `N` immediately? (Hint: What is the relationship between `N` and the largest serial number in `observations`?) What does each little bar in the histogram represent? Why are all the bars the same height?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Write your answer here, replacing this text.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 1.4\n", + "One way to estimate `N` is to take twice the mean of the serial numbers we observe. Write a function that computes that statistic. It should take as its argument an array of serial numbers and return twice their mean. Call it `mean_based_estimator`. \n", + "\n", + "After that, use it to compute an estimate of `N` called `mean_based_estimate`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def mean_based_estimator(nums):\n", + " ...\n", + "\n", + "mean_based_estimate = ...\n", + "mean_based_estimate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 1.5\n", + "We can also estimate `N` using the biggest serial number in the sample. Compute it, giving it the name `max_estimate`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "max_estimate = ...\n", + "max_estimate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 1.6\n", + "Look at the values of `max_estimate` and `mean_based_estimate` that we happened to get for our dataset. The value of `max_estimate` tells you something about `mean_based_estimate`. For these specific values, is it possible for our value of `mean_based_estimate` to be equal to `N` (at least, if we round it to the nearest integer)? 
If not, is it definitely higher, definitely lower, or can we not tell? Can you make a statement like the value of our \"`mean_based_estimate` is at least *[fill in a number]* away from `N`\"?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Write your answer here, replacing this text.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check your answer by posting on the discussion forum.\n", + "\n", + "We can't just confidently proclaim that `max_estimate` or `mean_based_estimate` is equal to `N`. What if we're really far off? So we want to get a sense of the accuracy of our estimates." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Resampling\n", + "To do this, we'll use resampling. That is, we won't exactly simulate new observations. Rather we sample from our current sample, or \"resample\" the data.\n", + "\n", + "Why does that make any sense?\n", + "\n", + "When we tried to estimate `N`, we would have liked to use the whole population. Since we had only a sample, we used that to estimate `N` instead.\n", + "\n", + "This time, we would like to use the population of serial numbers to *run a simulation* about estimates of `N`. But we still only have our sample. 
We use our sample in place of the population to run the simulation.\n", + "\n", + "So there is a simple analogy between estimating `N` and simulating the variability of estimates.\n", + "\n", + "$$\\text{computing }N\\text{ from the population}$$\n", + "$$:$$\n", + "$$\\text{computing an estimate of }N\\text{ from a sample}$$\n", + "\n", + "$$\\text{as}$$\n", + "\n", + "$$\\text{simulating the distribution of estimates of }N\\text{ using samples from the population}$$\n", + "$$:$$\n", + "$$\\text{simulating an (approximate) distribution of estimates of }N\\text{ using resamples from a sample}$$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 2.1\n", + "Write a function called `simulate_resample`. It should generate a resample from the observed serial numbers in `observations` and return that resample. (The resample should be a table like `observations`.) It should take no arguments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def simulate_resample():\n", + " ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's make one resample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# This line is a little magic to make sure that you see the same results\n", + "# we did.\n", + "np.random.seed(123)\n", + "\n", + "one_resample = simulate_resample()\n", + "one_resample" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Later, we'll use many resamples at once to see what estimates typically look like. We don't often pay attention to single resamples, so it's easy to misunderstand them. 
Let's examine some individual resamples before we start using them." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 2.2\n", + "In preparation for answering the next question, generate a histogram of your resample using the plotting function you defined earlier in this lab, **and** generate a separate histogram of the original observations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "...\n", + "..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 2.3\n", + "Which of the following are true:\n", + "1. In the plot of the resample, there are no bars at locations that weren't there in the plot of the original observations.\n", + "2. In the plot of the original observations, there are no bars at locations that weren't there in the plot of the resample.\n", + "3. The resample has exactly one copy of each serial number.\n", + "4. The sample has exactly one copy of each serial number.\n", + "\n", + "Assign `true_statements` to a list of the correct statements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "true_statements = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 2.4\n", + "Create two more resamples using the function `simulate_resample` from above. For each resample, plot it and compute its max- and mean-based estimates." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "resample_0 = ...\n", + "...\n", + "mean_based_estimate_0 = ...\n", + "max_based_estimate_0 = ...\n", + "print(\"Mean-based estimate for resample 0:\", mean_based_estimate_0)\n", + "print(\"Max-based estimate for resample 0:\", max_based_estimate_0)\n", + "\n", + "resample_1 = ...\n", + "...\n", + "mean_based_estimate_1 = ...\n", + "max_based_estimate_1 = ...\n", + "print(\"Mean-based estimate for resample 1:\", mean_based_estimate_1)\n", + "print(\"Max-based estimate for resample 1:\", max_based_estimate_1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may find that the max-based estimates from the resamples are both exactly 135. You will probably find that the two mean-based estimates do differ from the sample mean-based estimate (and from each other).\n", + "\n", + "#### Question 2.5\n", + "Using probability that you've learned, compute the exact chance that a max-based estimate from *one* resample is 135.\n", + "\n", + "Using your intuition, explain why a mean-based estimate from a resample is less often exactly equal to the mean-based estimate from the original sample as compared to a max-based estimate.\n", + "\n", + "As a refresher, here are some rules of probability that may be helpful:\n", + "\n", + "- When all outcomes are equally likely: P(event happens) $=$ $\\frac{\\text{# outcomes that make event happen}}{\\text{# of all outcomes}}$\n", + "\n", + "- When an event can happen in 2 ways: P(event) $=$ P(event happening first way) $+$ P(event happening second way)\n", + "\n", + "- When 2 events must both happen: P(2 events both happen) $=$ P(one event happens) $*$ P(other event happens, given the first one happened)\n", + "\n", + "- When an event doesn't happen: P(event doesn't happen) $=$ 1 $-$ P(event does happen)\n", + "\n", + "- P(at least one success) $= 1 - $ P(no successes)" + ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Write your answer here, replacing this text.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Discuss your answers on the edX discussion forums. If you have difficulty with the probability calculation, ask for help; don't stay stuck on it for too long." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Simulating with resampling\n", + "\n", + "**Note**: *The last part of this lab is difficult to check automatically, so it will not be graded. We strongly suggest that you try to complete it. We will release solutions to this lab so that you can compare to them.*\n", + "\n", + "Since resampling from a sample is very similar to sampling from a population, the code should look almost the same. That means we can write a function that simulates either sampling from a population or resampling from a sample. If we pass it a population as its argument, it will do the former; if we pass it a sample, it will do the latter.\n", + "\n", + "#### Question 3.1\n", + "Write a function called `simulate_estimates`. It should take 4 arguments:\n", + "1. A table from which the data should be sampled. The table will have 1 column named `\"serial number\"`.\n", + "2. The size of each sample from that table, an integer. (For example, to do resampling, we would pass for this argument the number of rows in the table.)\n", + "3. A function that computes a statistic of a sample. This argument is a *function* that takes an array of serial numbers as its argument and returns a number.\n", + "4. The number of replications to perform.\n", + "\n", + "It should simulate many samples with replacement from the given table. (The number of samples is the 4th argument.) For each of those samples, it should compute the statistic on that sample. Then it should return an array containing each of those statistics. 
The code below provides an example use of your function and describes how you can verify that you've written it correctly.\n", + "\n", + "**Hint**: Your implementation should contain the following line, which extracts the \"serial number\" column from some table ``t`` and calls the `statistic` function on it, storing the result in the name `s`.\n", + "\n", + "``s = statistic(t.column(\"serial number\"))``" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "def simulate_estimates(original_table, sample_size, statistic, num_replications):\n", + " # Our implementation of this function took 5 short lines of code.\n", + " ...\n", + "\n", + "# This should generate an empirical histogram of twice-mean estimates\n", + "# of N from samples of size 50 if N is 1000. This should be a bell-shaped\n", + "# curve centered at 1000 with most of its mass in [800, 1200]. To verify your\n", + "# answer, make sure that's what you see!\n", + "example_estimates = simulate_estimates(\n", + " Table().with_column(\"serial number\", np.arange(1, 1000+1)),\n", + " 50,\n", + " mean_based_estimator,\n", + " 10000)\n", + "Table().with_column(\"mean-based estimate\", example_estimates).hist(bins=np.arange(0, 1500, 25))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can go back to the sample we actually observed (the table `observations`) and estimate how much our mean-based estimate of `N` would have varied from sample to sample." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 3.2\n", + "Using the bootstrap and the sample `observations`, simulate the approximate distribution of *mean-based estimates* of `N`. Use 5,000 replications. \n", + "We have provided code that plots a histogram, allowing you to visualize the simulated estimates." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "bootstrap_estimates = ...\n", + "Table().with_column(\"mean-based estimate\", bootstrap_estimates).hist(bins=np.arange(0, 200, 4)) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 3.3\n", + "Compute an interval that covers the middle 95% of the bootstrap estimates. Verify that your interval looks like it covers 95% of the area in the histogram above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "left_end = ...\n", + "right_end = ...\n", + "print(\"Middle 95% of bootstrap estimates: [{:f}, {:f}]\".format(left_end, right_end))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 3.4\n", + "Your mean-based estimate of `N` should have been around 122. Given the above calculations, is it likely that `N` is exactly 122? If not, what is the typical range of values of the mean-based estimates of `N` for samples of size 17?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Write your answer here, replacing this text.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check your solutions with someone on the edX discussion forums" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 3.5\n", + "`N` was actually 150! Write code that simulates the sampling and bootstrapping process again, as follows:\n", + "\n", + "1. Generate a new set of random observations by sampling from the population table we have created for you below. \n", + "2. Compute an estimate of `N` from these new observations, using `mean_based_estimator`.\n", + "3. Using only the new observations, compute 5,000 bootstrap estimates of `N`.\n", + "4. 
Plot these bootstrap estimates and compute an interval covering the middle 95%." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "population = Table().with_column(\"serial number\", np.arange(1, 150+1))\n", + "\n", + "new_observations = ...\n", + "new_mean_based_estimate = ...\n", + "new_bootstrap_estimates = ...\n", + "...\n", + "new_left_end = ...\n", + "new_right_end = ...\n", + "\n", + "print(\"New mean-based estimate: {:f}\".format(new_mean_based_estimate))\n", + "print(\"Middle 95% of bootstrap estimates: [{:f}, {:f}]\".format(new_left_end, new_right_end))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question 3.6\n", + "Does the interval covering the middle 95% of the new bootstrap estimates include `N`? If you ran that cell many times, what is the probability that it will include `N`?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Write your answer here, replacing this text.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check your solutions with someone on the edX discussion forums" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congratulations! You're finished with lab 5 and Data 8.2x! In order to successfully submit your assignment, follow these steps...\n", + "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. Please do this first before running the cell below,\n", + "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", + "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab05.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/2/lab05/lab05.ipynb b/materials/x18/lab/2/lab05/lab05.ipynb index 55767f7..f5061c2 100644 --- a/materials/x18/lab/2/lab05/lab05.ipynb +++ b/materials/x18/lab/2/lab05/lab05.ipynb @@ -8,9 +8,13 @@ "\n", "Welcome to Lab 5!\n", "\n", - "We will attempt to estimate the number `N`, a *population parameter*, that represents the number of elements in a population. We get to observe a uniform random sample of the elements, and for each one we can observe its serial number (from 1 to `N`). All elements are labeled with consecutive serial numbers from 1 to `N`, so `N` is the total number of elements. \n", + "The British Royal Air Force wanted to know how many warplanes the Germans had (some number `N`, which is a *population parameter*), and they needed to estimate that quantity knowing only a random sample of the planes' serial numbers (from 1 to `N`). We know that the German's warplanes are labeled consecutively from 1 to `N`, so `N` would be the total number of warplanes they have. 
\n", "\n", - "Given *just* a random sample of elements, we'll estimate `N`, and then we'll use simulation to find a confidence interval around our estimate, all without ever looking at the whole population. This is an example of *statistical inference*.\n", + "We normally investigate the random variation amongst our estimates by simulating a sampling procedure from the population many times and computing estimates from each sample that we generate. In real life, if the RAF had known what the population looked like, they would have known `N` and would not have had any reason to think about random sampling. However, they didn't know what the population looked like, so they couldn't have run the simulations that we normally do. \n", + "\n", + "Simulating a sampling procedure many times was a useful exercise in *understanding random variation* for an estimate, but it's not as useful as a tool for practical data analysis.\n", + "\n", + "Let's flip that sampling idea on its head to make it practical. Given *just* a random sample of serial numbers, we'll estimate `N`, and then we'll use simulation to find out how accurate our estimate probably is, without ever looking at the whole population. This is an example of *statistical inference*.\n", "\n", "As usual, **run the cell below** to prepare the lab and the automatic tests." 
] @@ -78,7 +82,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "observations = Table.read_table(\"serial_numbers.csv\")\n", @@ -97,7 +103,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def plot_serial_numbers(numbers):\n", @@ -139,7 +147,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mean_based_estimator(nums):\n", @@ -152,7 +162,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_4.py')" @@ -169,7 +181,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "max_estimate = ...\n", @@ -179,7 +193,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_5.py')" @@ -265,7 +281,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# This line is a little magic to make sure that you see the same results\n", @@ -279,7 +297,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_1.py')" @@ -303,7 +323,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "...\n", @@ -338,7 +360,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_3.py')" @@ -355,7 +379,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + 
"collapsed": true + }, "outputs": [], "source": [ "resample_0 = ...\n", @@ -419,7 +445,7 @@ "\n", "**Note**: *The last part of this lab is difficult to check automatically, so it will not be graded. We strongly suggest that you try to complete it. We will release solutions to this lab so that you can compare to them.*\n", "\n", - "Since resampling from a sample is very similar to sampling from a population, the code should look almost the same. That means we can write a function that simulates either sampling from a population or resampling from a sample. If we pass it a population as its argument, it will do the former; if we pass it a sample, it will do the latter.\n", + "Since resampling from a sample looks just like sampling from a population, the code should look almost the same. That means we can write a function that simulates the process of either sampling from a population or resampling from a sample. If we pass in a population as its argument, it will do the former; if we pass in a sample, it will do the latter.\n", "\n", "#### Question 3.1\n", "Write a function called `simulate_estimates`. It should take 4 arguments:\n", @@ -428,11 +454,7 @@ "3. A function that computes a statistic of a sample. This argument is a *function* that takes an array of serial numbers as its argument and returns a number.\n", "4. The number of replications to perform.\n", "\n", - "It should simulate many samples with replacement from the given table. (The number of samples is the 4th argument.) For each of those samples, it should compute the statistic on that sample. Then it should return an array containing each of those statistics.
The code below provides an example use of your function and describes how you can verify that you've written it correctly.\n", - "\n", - "**Hint**: Your implementation should contain the following line, which extracts the \"serial number\" column from some table ``t`` and calls the `statistic` function on it, storing the result in the name `s`.\n", - "\n", - "``s = statistic(t.column(\"serial number\"))``" + "It should simulate many samples with replacement from the given table. (The number of samples is the 4th argument.) For each of those samples, it should compute the statistic on that sample. Then it should return an array containing each of those statistics. The code below provides an example use of your function and describes how you can verify that you've written it correctly." ] }, { @@ -479,7 +501,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "bootstrap_estimates = ...\n", @@ -497,7 +521,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "left_end = ...\n", @@ -543,7 +569,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "population = Table().with_column(\"serial number\", np.arange(1, 150+1))\n", @@ -631,10 +659,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.3" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/2/lab06/.ipynb_checkpoints/lab06-checkpoint.ipynb b/materials/x18/lab/2/lab06/.ipynb_checkpoints/lab06-checkpoint.ipynb new file mode 100644 index 0000000..d373c5f --- /dev/null +++ b/materials/x18/lab/2/lab06/.ipynb_checkpoints/lab06-checkpoint.ipynb @@ -0,0 +1,881 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 6: Conditional 
Probability\n", + "\n", + "This lab is an introduction to conditional probabilities. \n", + "\n", + "The lab includes a visualization called an *icon array*. It's meant to be an instructional part of the lab to help build intuitions about conditional probability. These visualizations do not appear in the textbook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell to set up the notebook, but please don't change it.\n", + "\n", + "# These lines import the Numpy and Datascience modules.\n", + "import numpy as np\n", + "from datascience import *\n", + "\n", + "# These lines do some fancy plotting magic.\n", + "import matplotlib\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "plt.style.use('fivethirtyeight')\n", + "import warnings\n", + "warnings.simplefilter('ignore', UserWarning)\n", + "\n", + "# This line loads the visualization code for this lab.\n", + "import visualizations\n", + "\n", + "# Don't change this cell; just run it. \n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. What is conditional probability good for?\n", + "\n", + "Suppose we have a known population, like all dogs in California. So far, we've seen 3 ways of *predicting* something about an individual in that population, given incomplete knowledge about the identity of the individual:\n", + "\n", + "* If we know nothing about the individual dog, we could predict that its speed is the *average* or *median* of all the speeds in the population.\n", + "* If we know the dog's height but not its speed, we could use *linear regression* to predict its speed from its height.
The resulting prediction is still imperfect, but it might be more accurate than the population average.\n", + "* If we know the dog's breed, height, and age, we could use *nearest-neighbor classification* (or *multiple regression*) to predict its speed by comparing to a collection of dogs with known speed.\n", + "\n", + "Computing conditional probabilities is a different way of making predictions. It differs in at least two important ways from the methods we've seen:\n", + "1. We will obtain a probability for each outcome \n", + "2. In the simple (but important) cases we'll look at today, conditional probabilities can be calculated exactly from assumptions, rather than being estimated from data. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Icon arrays\n", + "Parts 3 and 4 of this lab are about disease, but first let's start with a simple, contrived example.\n", + "\n", + "Imagine you are a marble. You don't know what you look like (since you obviously have no eyes), but you know that Samantha drew you **uniformly at random** from a bag that contained the following marbles:\n", + "* 4 large shiny marbles,\n", + "* 1 large dull marble,\n", + "* 6 small shiny marbles, and\n", + "* 2 small dull marbles." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.1.**
Knowing only what we've told you so far, what's the probability that you're a large shiny marble?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "probability_large_shiny = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q21.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's a table with those marbles:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "marbles = Table.read_table(\"marbles.csv\")\n", + "marbles.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are the counts of each type of marble in a pivot table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "marbles.pivot('surface', 'size')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are all the different combinations of surface and size, with counts, where each type of marble appears in its own row." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "marbles.group(['surface', 'size'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We've included some code to display something called an *icon array*. The functions in the cell below create icon arrays from various kinds of tables. Make sure to read the doc strings for each function so you understand what they do! Refer back to this cell later when you need to make an icon array."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell.\n", + "\n", + "#######################################################################\n", + "# The functions you'll need to actually use are in here. Each is a\n", + "# way of making an icon array from a differently-formatted table.\n", + "#######################################################################\n", + "\n", + "def display_icon_array(table, groups, individuals_name):\n", + " \"\"\"\n", + " Given a table and some columns to group it on, displays an icon array\n", + " of the groups.\n", + " \n", + " groups should be an array of labels of columns in table.\n", + " \n", + " individuals_name is your name for the individual rows of table.\n", + " For example, if we're talking about a population of people,\n", + " individuals_name should be \"people\".\n", + " \n", + " For example:\n", + " \n", + " display_icon_array(marbles, [\"surface\", \"size\"], \"marbles\")\n", + " \"\"\"\n", + " display_grouped_icon_array(table.group(groups), individuals_name)\n", + "\n", + "def display_grouped_icon_array(grouped_data, individuals_name):\n", + " \"\"\"\n", + " Given a table with counts for data grouped by 1 or more categories,\n", + " displays an icon array of the groups represented in the table.\n", + " \n", + " grouped_data should be a table of frequencies or counts, such as\n", + " a table created by calling the groups method on some table.\n", + " \n", + " individuals_name is your name for the individual members of the\n", + " dataset. 
For example, if we're talking about a population of\n", + " people, individuals_name should be \"people\".\n", + " \n", + " For example:\n", + " \n", + " display_grouped_icon_array(marbles.group([\"surface\", \"size\"]), \"marbles\")\n", + " \"\"\"\n", + " visualizations.display_combinations(grouped_data, individuals_name=individuals_name)\n", + "\n", + "def display_crosstab_icon_array(crosstabulation, x_label, individuals_name):\n", + " \"\"\"\n", + " Given a crosstabulation table, displays an icon array of the groups\n", + " represented in the table.\n", + " \n", + " crosstabulation should be a table of frequencies or counts created by\n", + " calling pivot on some table.\n", + " \n", + " x_label should be the label of the categories listed as columns (on\n", + " the \"x axis\" when the crosstabulation table is printed).\n", + " \n", + " individuals_name is your name for the individual members of the\n", + " dataset. For example, if we're talking about a population of\n", + " people, individuals_name should be \"people\".\n", + " \n", + " For example:\n", + " \n", + " display_crosstab_icon_array(marbles.pivot(\"surface\", \"size\"), \"surface\", \"marbles\")\n", + " \"\"\"\n", + " display_grouped_icon_array(visualizations.pivot_table_to_groups(crosstabulation, x_label), individuals_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's an icon array of all the marbles, grouped by surface and size:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell.\n", + "display_grouped_icon_array(marbles.group([\"surface\", \"size\"]), \"marble\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You (the marble) should imagine that you are a random draw from these 13 icons.\n", + "\n", + "The following is an icon array of the marbles, grouped **only by their surface (shiny/dull)**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "display_grouped_icon_array(marbles.group(\"surface\"), \"marble\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Knowing nothing else about yourself, you're equally likely to be any of the marbles pictured.\n", + "\n", + "**Question 2.2.**
What's the probability that you're a shiny marble? Calculate this by hand (using Python for arithmetic) by looking at your icon array." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "probability_shiny = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q22.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1. Conditional probability\n", + "\n", + "Suppose you overhear Samantha saying that you're a large marble. (Little-known fact: though marbles cannot see, they can sense sound from surface vibrations.) Does this somehow change the chance that you're shiny? Let's find out.\n", + "\n", + "Go back to the full icon array, displayed below for convenience." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "display_grouped_icon_array(marbles.group([\"surface\", \"size\"]), \"marble\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In question 2.2, we assumed you were equally likely to be any of the marbles, because we didn't know any better. That's why we looked at all the marbles to compute the probability you were shiny.\n", + "\n", + "But assuming you're a large marble, we can eliminate some of these possibilities. In particular, you can't be a small shiny marble or a small dull marble.\n", + "\n", + "You're still equally likely to be any of the remaining marbles, because you don't know anything that says otherwise.
So here's an icon array of those remaining possibilities:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just run this cell.\n", + "display_grouped_icon_array(marbles.where(\"size\", \"large\").group(\"surface\"), \"large marble\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.1.1.** What's the probability you're a shiny marble, knowing that you're a large marble? Calculate it by hand, using the icon array." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "probability_shiny_given_large = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q211.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should have found that this is different from the probability that you're a shiny marble (given no size information), which you computed earlier. The distribution of surfaces among the large marbles is a little different from the distribution of surfaces among all the marbles." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.1.2.**
Suppose instead Samantha had said you're a **shiny** marble (hooray!). What's the probability you're large? Make an icon array to help you compute this probability, then compute it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# An icon array to help you compute the answer.\n", + "display_grouped_icon_array(marbles.where(\"surface\", \"shiny\").group(\"size\"), \"shiny marbles\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Now compute the answer.\n", + "probability_large_given_shiny = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q212.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.1.3.**
Can you answer the last two questions just by looking at the full icon array? (You can run the cell below to see it again.). If you can, how? If not, why not? Check with your lab peers to see if you are on the right track. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just run this cell. The next cell is where you should write your answer.\n", + "display_grouped_icon_array(marbles.group([\"surface\", \"size\"]), \"marble\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Write your answer here, replacing this text.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Cancer screening\n", + "Now let's look at a much more realistic application.\n", + "\n", + "### Background\n", + "Medical tests are an important but surprisingly controversial topic. For years, women have been advised to get regular mammograms (tests for breast cancer). Today, there is controversy over whether the tests are useful at all.\n", + "\n", + "Part of the problem with such tests is that they are not perfectly reliable. Someone without cancer, or with only a benign form of cancer, can see a positive result on a test for cancer. Someone with cancer can receive a negative result. (\"Positive\" means \"pointing toward cancer,\" so in this context it's bad!) Doctors and patients often deal poorly with the first case, called *false positives*. For example, a patient may receive dangerous treatment like chemotherapy or radiation despite having no cancer or, as happens more frequently, having a cancer that would not have impacted her health.\n", + "\n", + "Conditional probability is a good way to think about such situations. For example, you can compute the chance that you have cancer, given the result of a test, by combining information from different probability distributions. 
You'll see that the chance you have cancer can be far from 100% even if you have a positive test result from a test that is usually accurate." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1. Basic cancer statistics\n", + "Suppose that, in a representative group of 10,000 people who are tested for cancer (\"representative\" meaning that the frequencies of different things are the same as the frequencies in the whole population):\n", + "1. 100 have cancer.\n", + "2. Among those 100, 90 have positive results on a cancer test and 10 have negative results. (\"Negative\" means \"not pointing toward cancer.\")\n", + "3. The other 9,900 don't have cancer.\n", + "4. Among these, 198 have positive results on a cancer test and the other 9,702 have negative results. (So 198 see \"false positive\" results.)\n", + "\n", + "Below we've generated a table with data from these 10,000 hypothetical people." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "people = Table().with_columns(\n", + " \"cancer status\", [\"sick\", \"sick\", \"healthy\", \"healthy\"],\n", + " \"test status\", [\"positive\", \"negative\", \"positive\", \"negative\"],\n", + " \"count\", [90, 10, 198, 9702])\n", + "people" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One way to visualize this dataset is with a contingency table, which you've seen before.\n", + "\n", + "**Question 3.1.1.**
Create a contingency table that looks like this:\n", + "\n", + "|cancer status|negative|positive|\n", + "|-|-|-|\n", + "|sick|||\n", + "|healthy|||\n", + "\n", + "...with the **count** of each group filled in, according to what we've told you above. The counts in the 4 boxes should sum to 10,000.\n", + "\n", + "*Hint:* Use `pivot` with the `sum` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "cancer = ...\n", + "cancer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q311.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the `people` data in an icon array." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's think about how you can use this kind of information when you're tested for cancer.\n", + "\n", + "Before you know any information about yourself, you could imagine yourself as a **uniform random sample** of one of the 10,000 people in this imaginary population of people who have been tested.\n", + "\n", + "What's the chance that you have cancer, knowing nothing else about yourself? It's $\\frac{100}{10000}$, or 1%. We can see that more directly with this icon array:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "by_health = people.select(0, 2).group(0, sum).relabeled(1, 'count')\n", + "display_grouped_icon_array(by_health, \"people who've taken a cancer test\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.1.3.**
What's the chance that you have a positive test result, knowing nothing else about yourself?\n", + "\n", + "*Hint:* Make an icon array." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#First, make an icon table similar to the one above\n", + "#by_test should be almost the same thing as by_health above\n", + "#The only difference is the columns we need from the people table\n", + "by_test = ...\n", + "display_grouped_icon_array(by_test, \"people who've taken a cancer test\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Fill in the probability of having a positive test result.\n", + "probability_positive_test = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q313.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2. Interpreting test results\n", + "Suppose you have a positive test result. This means you can now narrow yourself down to being part of one of two groups:\n", + "1. The people with cancer who have a positive test result.\n", + "2. The people without cancer who have a positive test result.\n", + "\n", + "Here's an icon array for those two groups:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just run this cell.\n", + "display_grouped_icon_array(people.where(\"test status\", are.equal_to(\"positive\")).drop(1), \"people who have a positive test result\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The *conditional probability* that you have cancer *given* your positive test result is the chance that you're in the first group, assuming you're in one of these two groups.\n", + "\n", + "**Question 3.2.1.**
Eyeballing it, is the conditional probability that you have cancer given your positive test result closest to:\n", + "\n", + "9/10\n", + "\n", + "2/3\n", + "\n", + "1/2\n", + "\n", + "1/3\n", + "\n", + "1/100\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Set this to one of the probabilities above.\n", + "rough_prob_sick_given_positive = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q321.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.2.2.**
Now write code to calculate that probability exactly, using the original contingency table you wrote (the `cancer` table)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#Run this cell to take another look at the cancer table\n", + "cancer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "prob_sick_given_positive = ...\n", + "prob_sick_given_positive" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q322.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.2.3.**
Look at the full icon array again. Using that, how would you compute (roughly) the conditional probability of cancer given a positive test?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# The full icon array is given here for your convenience.\n", + "# Write your answer in the next cell.\n", + "display_grouped_icon_array(people, \"people who've taken a cancer test\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Write your answer here, replacing this text.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.2.4.**
Is your answer to question 3.2.2 bigger than the overall proportion of people in the population who have cancer? Does that make sense? Check with your peers in lab to see if you have the right idea. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Write your answer here, replacing this text.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Tree diagrams\n", + "A tree diagram is another useful visualization for conditional probability. It is easiest to draw a tree diagram when the probabilities are presented in a slightly different way. For example, people often summarize the information in your `cancer` table using 3 numbers:\n", + "\n", + "1. The overall probability of having cancer is **`p_cancer`**. (This is called the *base rate* or *marginal probability* of the disease.)\n", + "2. Given that you have cancer, the probability of a positive test result is **`p_pos_given_cancer`**. (This is called the *sensitivity* of the test. Higher values of `p_pos_given_cancer` mean the test is more useful.)\n", + "3. Given that you don't have cancer, the probability of a positive test result is **`p_pos_given_nocancer`**. (This is called the *false positive rate* of the test. Higher values of `p_pos_given_nocancer` mean the test is less useful.)\n", + "\n", + "You already saw that the base rate of cancer was .01 in the previous section. `p_pos_given_cancer` and `p_pos_given_nocancer` can be computed using the same method you used to compute the conditional probability of cancer given a positive test result.\n", + "\n", + "Use the tree diagram below and think about \n", + "\n", + "This corresponds to this tree diagram:\n", + "\n", + " / \\\n", + " .99 / \\ .01\n", + " / \\ \n", + " / \\\n", + " no cancer cancer\n", + " / \\ / \\\n", + " / \\ / \\\n", + " / \\ / \\\n", + " + - + -\n", + "\n", + "**Question 4.1.**
Compute `p_pos_given_cancer` and `p_pos_given_nocancer` for the data in section 3. \n", + "\n", + "**Use Bayes Rule**\n", + "\n", + "You can read about a technique called Bayes Rule in the [course textbook](https://www.inferentialthinking.com/chapters/18/1/more-likely-than-not-binary-classifier.html#Bayes%27-Rule)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Hint: You may find these two tables useful:\n", + "has_cancer = cancer.where(\"cancer status\", are.equal_to(\"sick\"))\n", + "no_cancer = cancer.where(\"cancer status\", are.equal_to(\"healthy\"))\n", + "\n", + "p_cancer = .01\n", + "p_pos_given_cancer = ...\n", + "p_pos_given_nocancer = ...\n", + "\n", + "print('Probability of Cancer:', p_cancer, '\\nProbability of a positive test given cancer:', p_pos_given_cancer, \n", + " '\\nProbability of a positive test given no cancer:', p_pos_given_nocancer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q41.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you are done...\n", + "- **Save and Checkpoint** from the `File` menu." 
+ ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + }, + "timetravel": { + "allowedContentTypes": [ + "text/plain" + ], + "enabled": true, + "version": "1.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/2/lab06/lab06.ipynb b/materials/x18/lab/2/lab06/lab06.ipynb index 7d8aa3e..876d1b4 100644 --- a/materials/x18/lab/2/lab06/lab06.ipynb +++ b/materials/x18/lab/2/lab06/lab06.ipynb @@ -463,9 +463,9 @@ "### Background\n", "Medical tests are an important but surprisingly controversial topic. For years, women have been advised to get regular mammograms (tests for breast cancer). Today, there is controversy over whether the tests are useful at all.\n", "\n", - "Part of the problem with such tests is that they are not perfectly reliable. Someone without cancer, or with only a benign form of cancer, can see a positive result on a test for cancer. Someone with cancer can receive a negative result. (\"Positive\" means \"pointing toward cancer,\" so in this context it's bad!) Doctors and patients often deal poorly with the first case, called *false positives*. For example, a patient may receive dangerous treatment like chemotherapy or radiation despite having no cancer or, as happens more frequently, having a cancer that would not have impacted her health.\n", + "Part of the problem with such tests is that they are not perfectly reliable. Someone without cancer, or with only a benign form of cancer, can see a positive result on a test for cancer. Someone with cancer can receive a negative result. (\"Positive\" means \"pointing toward cancer,\" so in this context it's bad!) 
Doctors and patients often deal poorly with the first case, called *false positives*. For example, a patient may receive dangerous treatment like chemotherapy or radiation despite having no cancer or, as happens more frequently, having a cancer that would not have impacted their health.\n", "\n", - "Conditional probability is a good way to think about such situations. For example, you can compute the chance that you have cancer, given the result of a test, by combining information from different probability distributions. You'll see that the chance you have cancer can be far from 100% even if you have a positive test result from a test that is usually accurate." + "Conditional probability is a good way to think about such situations. For example, you can compute the chance that you have cancer **given the results of a diagnostic test** by combining information from different probability distributions. You'll see that the chance you have cancer can be far from 100% even if you have a positive test result from a test that is usually accurate." 
] }, { @@ -866,7 +866,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.1" }, "timetravel": { "allowedContentTypes": [ @@ -879,4 +879,3 @@ "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/3/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb b/materials/x18/lab/3/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb new file mode 100644 index 0000000..58f5ad1 --- /dev/null +++ b/materials/x18/lab/3/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb @@ -0,0 +1,772 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 1: Variance of Sample Means and Correlation\n", + "\n", + "Welcome to Lab 1 and Data 8.3x!\n", + "\n", + "In this lab we will learn about [the variance of sample means](https://www.inferentialthinking.com/chapters/14/5/variability-of-the-sample-mean.html) as well as ways to understand and quantify [the association between two variables](https://www.inferentialthinking.com/chapters/15/1/correlation.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell, but please don't change it.\n", + "\n", + "# These lines import the Numpy and Datascience modules.\n", + "import numpy as np\n", + "from datascience import *\n", + "\n", + "# These lines do some fancy plotting magic.\n", + "import matplotlib\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plots\n", + "plots.style.use('fivethirtyeight')\n", + "import warnings\n", + "warnings.simplefilter('ignore', FutureWarning)\n", + "warnings.simplefilter('ignore', UserWarning)\n", + "\n", + "# These lines load the tests.\n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. How Faithful is Old Faithful? 
\n", + "\n", + "(Note: clever title comes from [here](http://web.pdx.edu/~jfreder/M212/oldfaithful.pdf).)\n", + "\n", + "Old Faithful is a geyser in Yellowstone National Park in the central United States. It's famous for erupting on a fairly regular schedule. You can see a video below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For the curious: this is how to display a YouTube video in a\n", + "# Jupyter notebook. The argument to YouTubeVideo is the part\n", + "# of the URL (called a \"query parameter\") that identifies the\n", + "# video. For example, the full URL for this video is:\n", + "# https://www.youtube.com/watch?v=wE8NDuzt8eg\n", + "from IPython.display import YouTubeVideo\n", + "YouTubeVideo(\"wE8NDuzt8eg\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some of Old Faithful's eruptions last longer than others. When it has a long eruption, there's generally a longer wait until the next eruption.\n", + "\n", + "If you visit Yellowstone, you might want to predict when the next eruption will happen, so you can see the rest of the park and come to see the geyser when it erupts. To predict one variable from another, the first step is to understand the association between them.\n", + "\n", + "The dataset has one row for each observed eruption. It includes the following columns:\n", + "- **duration**: Eruption duration, in minutes\n", + "- **wait**: Time between this eruption and the next, also in minutes\n", + "\n", + "Run the next cell to load the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "faithful = Table.read_table(\"faithful.csv\")\n", + "faithful" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.1 **\n", + "
\n", + "Make a scatter plot of the data. It's conventional to put the column we will try to predict on the vertical axis and the other column on the horizontal axis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look at the scatter plot. Does the association between wait times and eruption durations appear to be linear? \n", + "\n", + "There's more going on than just a linear association. The eruption durations seem to cluster; there are a bunch of short eruptions and a bunch of longer ones. Within each of the clusters, these values appear to be roughly linearly correlated, but perhaps with a different correlation coefficient.\n", + "\n", + "The overall relationship is positive, which means that longer eruptions have longer waiting times. Even when the association is more nuanced than a simple linear association, we can still compute the correlation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we'll plot the data in standard units. Recall that, if `nums` is an array of numbers, then\n", + "\n", + " (nums - np.mean(nums)) / np.std(nums)\n", + "\n", + "is an array of those numbers in standard units." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.2 **\n", + "
\n", + "Compute the mean and standard deviation of the eruption durations and waiting times. **Then** create a table called `faithful_standard` containing the eruption durations and waiting times in standard units. (The columns should be named `\"duration (standard units)\"` and `\"wait (standard units)\"`.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "duration_mean = ...\n", + "duration_std = ...\n", + "wait_mean = ...\n", + "wait_std = ...\n", + "\n", + "faithful_standard = Table().with_columns(\n", + " \"duration (standard units)\", ...,\n", + " \"wait (standard units)\", ...)\n", + "faithful_standard" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.3 **\n", + "
\n", + "Plot the data again, but this time in standard units." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You'll notice that this plot looks exactly the same as the last one! The data really are different, but the axes are scaled differently. (The method `scatter` scales the axes so the data fill up the available space.) So it's important to read the ticks on the axes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.4 **\n", + "Among the following numbers, which would you guess is closest to the correlation between eruption duration and waiting time in this dataset?\n", + "\n", + "* -1\n", + "* 0\n", + "* 1\n", + "\n", + "Assign your answer to `closest_correlation`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "closest_correlation = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.5 **\n", + "
\n", + "Compute the correlation `r`. *Hint:* Use `faithful_standard`. Section [15.1](https://www.inferentialthinking.com/chapters/15/1/correlation.html) explains how to do this." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "r = ...\n", + "r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Variability of the Sample Mean\n", + "\n", + "By the Central Limit Theorem, the probability distribution of the mean of a large random sample is roughly normal. The bell curve is centered at the population mean. Some of the sample means are higher, and some lower, but the deviations from the population mean are roughly symmetric on either side, as we have seen repeatedly. Formally, probability theory shows that the sample mean is an unbiased estimate of the population mean.\n", + "\n", + "In our simulations, we also noticed that the means of larger samples tend to be more tightly clustered around the population mean than means of smaller samples. In this section, we will quantify the variability of the sample mean and develop a relation between the variability and the sample size.\n", + "\n", + "Let's take a look at the salaries of employees of the City of San Francisco in 2014. The mean salary reported by the city government was about $75463.92." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "salaries = Table.read_table('sf_salaries_2014.csv').select(\"salary\")\n", + "salaries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "salary_mean = np.mean(salaries.column('salary'))\n", + "salary_mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "salaries.hist('salary', bins=np.arange(0, 300000+10000*2, 10000))\n", + "plots.scatter(salary_mean, 0, marker='^', color='red', s=100);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 2.1 **\n", + "
\n", + "Clearly, the population does not follow a normal distribution. Keep that in mind as we progress through these exercises.\n", + "\n", + "Let's take random samples and look at the probability distribution of the sample mean. As usual, we will use simulation to get an empirical approximation to this distribution.\n", + "\n", + "We will define a function `simulate_sample_mean` to do this, because we are going to vary the sample size later. The arguments are the name of the table, the label of the column containing the variable, the sample size, and the number of simulations.\n", + "\n", + "Complete the function `simulate_sample_mean`. It will not be graded, but if you haven't implemented it correctly, the rest of the lab won't work properly, so this step is crucial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "\"\"\"Empirical distribution of random sample means\"\"\"\n", + "\n", + "def simulate_sample_mean(table, label, sample_size, repetitions):\n", + " \n", + " means = []\n", + "\n", + " for i in np.arange(repetitions):\n", + " new_sample = ...\n", + " new_sample_mean = ...\n", + " ...\n", + "\n", + " sample_means = Table().with_column('Sample Means', means)\n", + " \n", + " # Display empirical histogram and print all relevant quantities – don't change this!\n", + " sample_means.hist(bins=20)\n", + " plots.xlabel('Sample Means')\n", + " plots.title('Sample Size ' + str(sample_size))\n", + " print(\"Sample size: \", sample_size)\n", + " print(\"Population mean:\", np.mean(table.column(label)))\n", + " print(\"Average of sample means: \", np.mean(means))\n", + " print(\"Population SD:\", np.std(table.column(label)))\n", + " print(\"SD of sample means:\", np.std(means))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 2.2 **\n", + "
\n", + "In the following cell, we will create a sample of size 100 from the salaries table and graph it using our new `simulate_sample_mean` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 10000) \n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the following two cells, simulate the mean of a random sample of 400 salaries and 625 salaries, respectively. In each case, perform 10,000 repetitions of each of these processes. Don't worry about the `plots.xlim` line – it just makes sure that all of the plots have the same x-axis. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "simulate_sample_mean(..., ..., ..., ...)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "simulate_sample_mean(..., ..., ..., ...)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see the Central Limit Theorem in action – the histograms of the sample means are roughly normal, even though the histogram of the salaries themselves is far from normal.\n", + "\n", + "We can also see that each of the three histograms of the sample means is centered very close to the population mean. In each case, the \"average of sample means\" is very close to the population mean. Both values are provided in the printout above each histogram. As expected, the sample mean is an unbiased estimate of the population mean." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 2.3 **\n", + "
\n", + "Below, we'll look at what happens when we take a fixed sample, then bootstrap from it with different numbers of resamples. How does the distribution of the resampled means change?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 1000)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 5000)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 10000)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assign the variable `bootstrap_sampled_SD` to the integer corresponding to your answer to the following question:\n", + "\n", + "When I increase the number of bootstrap samples that I take, for a fixed sample size, the SD of my sample mean will...\n", + "\n", + "1. Increase\n", + "2. Decrease\n", + "3. Stay about the same\n", + "4. Vary wildly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "bootstrap_sampled_SD = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, we'll look at what happens when we take a fixed sample, then bootstrap from it with different numbers of resamples. How does the distribution of the resampled means change?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 500)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 1000)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 5000)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 10000)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What did you notice about the sample means of the four bootstrapped samples above?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 2.4 **\n", + "
\n", + "Next, let's think about how the relationships between population SD, sample SD, and SD of sample means change with varying sample size. Which of the following is true? Again, assign the variable `pop_vs_sample` to the integer corresponding to your answer. To gain some intuition, you can run the simulation cells below.\n", + "\n", + "1. Sample SD gets smaller with increasing sample size, SD of sample means gets smaller with increasing sample size\n", + "2. Sample SD gets larger with increasing sample size, SD of sample means stays the same with increasing sample size\n", + "3. Sample SD becomes more consistent with population SD with increasing sample size, SD of sample means gets smaller with increasing sample size\n", + "4. Sample SD becomes more consistent with population SD with increasing sample size, SD of sample means stays the same with increasing sample size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pop_vs_sample = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what happens: First, we calculate the population SD so that we can compare the SD of each sample to the SD of the population." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pop_sd = np.std(salaries.column(\"salary\"))\n", + "pop_sd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's then see how a small sample behaves. Run the following cells multiple times to see how the SD of the sample changes from sample to sample. Adjust the bins as necessary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sample_10 = salaries.sample(10)\n", + "sample_10.hist(\"salary\")\n", + "print(\"Sample SD: \", np.std(sample_10.column(\"salary\")))\n", + "simulate_sample_mean(sample_10, 'salary', 10, 1000)\n", + "plots.xlim(5,120000)\n", + "plots.ylim(0, .0001);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sample_200 = salaries.sample(200)\n", + "sample_200.hist(\"salary\")\n", + "print(\"Sample SD: \", np.std(sample_200.column(\"salary\")))\n", + "simulate_sample_mean(sample_200, 'salary', 200, 1000)\n", + "plots.xlim(5,100000)\n", + "plots.ylim(0, .00015);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sample_1000 = salaries.sample(1000)\n", + "sample_1000.hist(\"salary\")\n", + "print(\"Sample SD: \", np.std(sample_1000.column(\"salary\")))\n", + "simulate_sample_mean(sample_1000, 'salary', 1000, 1000)\n", + "plots.xlim(5,100000)\n", + "plots.ylim(0, .00025);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's illustrate this trend. Below, you will see how the average absolute error of SD from the population changes with sample size (N)." 
 + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Don't change this cell, just run it!\n", + "sample_n_errors = make_array()\n", + "for i in np.arange(10, 200, 10):\n", + " sample_n_errors = np.append(sample_n_errors, np.average([abs(np.std(salaries.sample(i).column(\"salary\"))-pop_sd)\n", + " for d in np.arange(100)]))\n", + "Table().with_column(\"Average absolute error in SD\", sample_n_errors, \"N\", np.arange(10, 200, 10)).plot(\"N\", \"Average absolute error in SD\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should notice that the distribution of means gets spikier, and that the distribution of the sample increasingly looks like the distribution of the population as we get to larger sample sizes. \n", + "\n", + "Is there a relationship between the sample size and absolute error in standard deviation? Identify this relationship – if you're having trouble, take a look at this [section](https://www.inferentialthinking.com/chapters/14/5/variability-of-the-sample-mean.html) in our textbook about the variability of sample means." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You're finished with lab 1! In order to successfully submit your assignment, follow these steps...\n", + "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. Please do this first before running the cell below,\n", + "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", + "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab01.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/3/lab01/lab01.ipynb b/materials/x18/lab/3/lab01/lab01.ipynb index d567a8a..58f5ad1 100644 --- a/materials/x18/lab/3/lab01/lab01.ipynb +++ b/materials/x18/lab/3/lab01/lab01.ipynb @@ -8,7 +8,7 @@ "\n", "Welcome to Lab 1 and Data 8.3x!\n", "\n", - "In this week's lab, we will cover two relatively orthogonal concepts. First, we will investigate the variance of sample means, found in [Section 14.5](https://www.inferentialthinking.com/chapters/14/5/Variability_of_the_Sample_Mean) of our textbook. We will also get some hands-on practice with understanding the association between two variables, which you can read more about in [Section 15.1](https://www.inferentialthinking.com/chapters/15/1/Correlation)." + "In this lab we will learn about [the variance of sample means](https://www.inferentialthinking.com/chapters/14/5/variability-of-the-sample-mean.html) as well as ways to understand and quantify [the association between two variables](https://www.inferentialthinking.com/chapters/15/1/correlation.html)." 
] }, { @@ -52,7 +52,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For the curious: this is how to display a YouTube video in a\n", @@ -82,7 +84,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "faithful = Table.read_table(\"faithful.csv\")\n", @@ -101,7 +105,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "..." @@ -126,7 +132,7 @@ "\n", " (nums - np.mean(nums)) / np.std(nums)\n", "\n", - "...is an array of those numbers in standard units." + "is an array of those numbers in standard units." ] }, { @@ -142,6 +148,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -160,7 +167,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_2.py')" @@ -178,7 +187,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "..." 
@@ -219,7 +230,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_4.py')" @@ -237,7 +250,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "r = ...\n", @@ -247,7 +262,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_5.py')" @@ -269,7 +286,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "salaries = Table.read_table('sf_salaries_2014.csv').select(\"salary\")\n", @@ -279,7 +298,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "salary_mean = np.mean(salaries.column('salary'))\n", @@ -289,7 +310,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "salaries.hist('salary', bins=np.arange(0, 300000+10000*2, 10000))\n", @@ -355,7 +378,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "simulate_sample_mean(salaries, 'salary', 100, 10000) \n", @@ -373,6 +398,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -473,6 +499,24 @@ "bootstrap_sampled_SD = ..." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, we'll look at what happens when we take a fixed sample, then bootstrap from it with different numbers of resamples. 
How does the distribution of the resampled means change?" + ] + }, { "cell_type": "code", "execution_count": null, @@ -493,7 +537,39 @@ }, "outputs": [], "source": [ - "check('tests/q2_3.py')" + "simulate_sample_mean(salaries, 'salary', 100, 1000)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 5000)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "simulate_sample_mean(salaries, 'salary', 100, 10000)\n", + "plots.xlim(50000, 100000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What did you notice about the sample means of the four bootstrapped samples above?" ] }, { @@ -635,7 +711,7 @@ "source": [ "You should notice that the distribution of means gets spiker, and that the distribution of the sample increasingly looks like the distribution of the population as we get to larger sample sizes. \n", "\n", - "Is there a relationship between the sample size and absolute error in standard deviation? Identify this relationship – if you're having trouble, take a look at [Section 14.5](https://www.inferentialthinking.com/chapters/14/5/Variability_of_the_Sample_Mean) in our textbook." + "Is there a relationship between the sample size and absolute error in standard deviation? Identify this relationship – if you're having trouble, take a look at this [section](https://www.inferentialthinking.com/chapters/14/5/variability-of-the-sample-mean.html) in our textbook about the variability of sample means." 
] }, { @@ -688,10 +764,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/3/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb b/materials/x18/lab/3/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb new file mode 100644 index 0000000..9742ba3 --- /dev/null +++ b/materials/x18/lab/3/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb @@ -0,0 +1,716 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 2: Regression\n", + "\n", + "Welcome to Lab 2 of Data 8.3x!\n", + "\n", + "Today we will get some hands-on practice with linear regression. You can find more information about this topic in\n", + "[section 15.2](https://www.inferentialthinking.com/chapters/15/2/Regression_Line)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell, but please don't change it.\n", + "\n", + "# These lines import the Numpy and Datascience modules.\n", + "import numpy as np\n", + "from datascience import *\n", + "\n", + "# These lines do some fancy plotting magic.\n", + "import matplotlib\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plots\n", + "plots.style.use('fivethirtyeight')\n", + "import warnings\n", + "warnings.simplefilter('ignore', FutureWarning)\n", + "warnings.simplefilter('ignore', UserWarning)\n", + "\n", + "# These lines load the tests.\n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. How Faithful is Old Faithful? Revisited\n", + "\n", + "Let's revisit a question from lab 1. Last lab, we investigated Old Faithful, a geyser in Yellowstone National Park in the central United States. It's famous for erupting on a fairly regular schedule.\n", + "\n", + "To recap, some of Old Faithful's eruptions last longer than others. 
Today, we will use the same dataset on eruption durations and waiting times to see if we can predict the wait time from the eruption duration using linear regression.\n", + "\n", + "The dataset has one row for each observed eruption. It includes the following columns:\n", + "- **duration**: Eruption duration, in minutes\n", + "- **wait**: Time between this eruption and the next, also in minutes\n", + "\n", + "Run the next cell to load the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "faithful = Table.read_table(\"faithful.csv\")\n", + "faithful" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remember from last lab that we concluded eruption time and waiting time are positively correlated. The table below called `faithful_standard` contains the eruption durations and waiting times in standard units." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "duration_mean = np.mean(faithful.column(\"duration\"))\n", + "duration_std = np.std(faithful.column(\"duration\"))\n", + "wait_mean = np.mean(faithful.column(\"wait\"))\n", + "wait_std = np.std(faithful.column(\"wait\"))\n", + "\n", + "faithful_standard = Table().with_columns(\n", + " \"duration (standard units)\", (faithful.column(\"duration\") - duration_mean) / duration_std,\n", + " \"wait (standard units)\", (faithful.column(\"wait\") - wait_mean) / wait_std\n", + ")\n", + "faithful_standard" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell computes the correlation `r`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "r = np.mean(faithful_standard.column(0) * faithful_standard.column(1))\n", + "r" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +
"## 2. The regression line\n", + "The correlation coefficient is the slope of the regression line when the data are expressed in standard units.\n", + "\n", + "The next cell plots the regression line in standard units:\n", + "\n", + "$$\\text{waiting time (standard units)} = r \\times \\text{eruption duration (standard units)}.$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def plot_data_and_line(dataset, x, y, point_0, point_1):\n", + " \"\"\"Makes a scatter plot of the dataset, along with a line passing through two points.\"\"\"\n", + " dataset.scatter(x, y, label=\"data\")\n", + " xs, ys = zip(point_0, point_1)\n", + " plots.plot(xs, ys, label=\"regression line\")\n", + " plots.legend(bbox_to_anchor=(1.5,.8))\n", + "\n", + "plot_data_and_line(faithful_standard, \n", + " \"duration (standard units)\", \n", + " \"wait (standard units)\", \n", + " [-2, -2*r], \n", + " [2, 2*r])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How would you take a point in standard units and convert it back to original units? We'd have to \"stretch\" its horizontal position by `duration_std` and its vertical position by `wait_std`.\n", + "\n", + "That means the same thing would happen to the slope of the line.\n", + "\n", + "Stretching a line horizontally makes it less steep, so we divide the slope by the stretching factor. Stretching a line vertically makes it more steep, so we multiply the slope by the stretching factor.\n", + "\n", + "** Question 2.1 **
\n", + "What is the slope of the regression line in original units?\n", + "\n", + "(If the \"stretching\" explanation is unintuitive, consult section [15.2](https://www.inferentialthinking.com/chapters/15/2/Regression_Line) in the textbook.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "slope = ...\n", + "slope" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We know that the regression line passes through the point `(duration_mean, wait_mean)`. You might recall from high-school algebra that the equation for the line is therefore:\n", + "\n", + "$$\\text{waiting time} - \\verb|wait_mean| = \\texttt{slope} \\times (\\text{eruption duration} - \\verb|duration_mean|)$$\n", + "\n", + "After rearranging that equation slightly, the intercept turns out to be:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "intercept = slope*(-duration_mean) + wait_mean\n", + "intercept" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q2_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Investigating the regression line\n", + "The slope and intercept tell you exactly what the regression line looks like. To predict the waiting time for an eruption, multiply the eruption's duration by `slope` and then add `intercept`.\n", + "\n", + "** Question 3.1 **
\n", + "Compute the predicted waiting time for an eruption that lasts 2 minutes, and for an eruption that lasts 5 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "two_minute_predicted_waiting_time = ...\n", + "five_minute_predicted_waiting_time = ...\n", + "\n", + "# Here is a helper function to print out your predictions\n", + "# (you don't need to modify it):\n", + "def print_prediction(duration, predicted_waiting_time):\n", + " print(\"After an eruption lasting\", duration,\n", + " \"minutes, we predict you'll wait\", predicted_waiting_time,\n", + " \"minutes until the next eruption.\")\n", + "\n", + "print_prediction(2, two_minute_predicted_waiting_time)\n", + "print_prediction(5, five_minute_predicted_waiting_time)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell plots the line that goes between those two points, which is (a segment of) the regression line." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "plot_data_and_line(faithful, \"duration\", \"wait\", \n", + " [2, two_minute_predicted_waiting_time], \n", + " [5, five_minute_predicted_waiting_time])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 3.2 **
\n", + "Make predictions for the waiting time after each eruption in the `faithful` table. (Of course, we know exactly what the waiting times were! We are doing this so we can see how accurate our predictions are.) Put these numbers into a column in a new table called `faithful_predictions`. Its first row should look like this:\n", + "\n", + "|duration|wait|predicted wait|\n", + "|-|-|-|\n", + "|3.6|79|72.1011|\n", + "\n", + "*Hint:* Your answer can be just one line. There is no need for a `for` loop; use array arithmetic instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "faithful_predictions = ...\n", + "faithful_predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 3.3 **
\n", + "How close were we? We computed the *residual* for each eruption in the dataset. The residual is the difference (not the absolute difference) between the actual waiting time and the predicted waiting time. Add the residuals to `faithful_predictions` as a new column called `\"residual\"`, naming the resulting table `faithful_residuals`.\n", + "\n", + "*Hint:* Again, your code will be much simpler if you don't use a `for` loop." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "residual = faithful_predictions.column(1) - faithful_predictions.column(2)\n", + "faithful_residuals = ...\n", + "faithful_residuals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q3_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is a plot of the residuals you computed. Each point corresponds to one eruption. It shows how much our prediction over- or under-estimated the waiting time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "faithful_residuals.scatter(\"duration\", \"residual\", color=\"r\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There isn't too much of a pattern in the residuals, which confirms that it's reasonable to use linear regression for prediction. It's true that there are two separate clouds; the eruption durations seemed to fall into two distinct clusters. But that's just a pattern in the eruption durations, not a pattern in the relationship between eruption durations and waiting times. A larger concern is that there may be more positive than negative residuals in a particular region of the horizontal axis. 
For both clusters, the points are distributed fairly evenly above and below zero, which is a confirmation that the association is mostly linear." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. How accurate are different predictions?\n", + "The correlation coefficient is close to 1, implying that the observed values are tightly clustered around the regression line. The residuals are overall small (close to 0) in comparison to the waiting times.\n", + "\n", + "We can see that visually by plotting the waiting times and residuals together:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "faithful_residuals.scatter(\"duration\", \"wait\", label=\"actual waiting time\", color=\"blue\")\n", + "plots.scatter(faithful_residuals.column(\"duration\"), faithful_residuals.column(\"residual\"), label=\"residual\", color=\"r\")\n", + "plots.plot([2, 5], [two_minute_predicted_waiting_time, five_minute_predicted_waiting_time], label=\"regression line\")\n", + "plots.legend(bbox_to_anchor=(1.7,.8));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 4.1 **
\n", + "In `faithful`, no eruption lasted exactly 0, 2.5, or 60 minutes. Using this line, what is the predicted waiting time for an eruption that lasts 0 minutes? 2.5 minutes? An hour?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "zero_minute_predicted_waiting_time = ...\n", + "two_point_five_minute_predicted_waiting_time = ...\n", + "hour_predicted_waiting_time = ...\n", + "\n", + "print_prediction(0, zero_minute_predicted_waiting_time)\n", + "print_prediction(2.5, two_point_five_minute_predicted_waiting_time)\n", + "print_prediction(60, hour_predicted_waiting_time)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q4_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Divide and Conquer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what happens if we treat the two clusters of observations differently. It appears from the scatter diagram that there are two clusters of points: one for durations around 2 and another for durations between 3.5 and 5. A vertical line at 3 divides the two clusters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "faithful.scatter(\"duration\", \"wait\", label=\"actual waiting time\", color=\"blue\")\n", + "plots.plot([3, 3], [40, 100]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `standardize` function from lecture appears below, which returns a table of values in standard units." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def standard_units(any_numbers):\n", + " \"Convert any array of numbers to standard units.\"\n", + " return (any_numbers - np.mean(any_numbers)) / np.std(any_numbers) \n", + "\n", + "def standardize(t):\n", + " \"\"\"Return a table in which all columns of t are converted to standard units.\"\"\"\n", + " t_su = Table()\n", + " for label in t.labels:\n", + " t_su = t_su.with_column(label + ' (su)', standard_units(t.column(label)))\n", + " return t_su" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.1**
\n", + "Separately compute the regression coefficients *r* for all the points with a duration below 3 **and then** for all the points with a duration above 3. To do so, create a function that computes `r` from a table and pass it two different tables of points, `below_3` and `above_3`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def reg_coeff(t):\n", + " \"\"\"Return the regression coefficient for columns 0 & 1.\"\"\"\n", + " t_su = standardize(t)\n", + " ...\n", + "\n", + "below_3 = ...\n", + "above_3 = ...\n", + "below_3_r = reg_coeff(below_3)\n", + "above_3_r = reg_coeff(above_3)\n", + "print(\"For points below 3, r is\", below_3_r, \"; for points above 3, r is\", above_3_r)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.2**
\n", + "Write functions `slope_of` and `intercept_of` below. \n", + "\n", + "When you're done, the functions `wait_below_3` and `wait_above_3` should each use a different regression line to predict a wait time for a duration. The first function should use the regression line for all points with duration below 3. The second function should use the regression line for all points with duration above 3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def slope_of(t, r):\n", + " \"\"\"Return the slope of the regression line for t in original units.\n", + " \n", + " Assume that column 0 contains x values and column 1 contains y values.\n", + " r is the regression coefficient for x and y.\n", + " \"\"\"\n", + " ...\n", + "\n", + "def intercept_of(t, r):\n", + " \"\"\"Return the intercept of the regression line for t in original units.\"\"\"\n", + " s = slope_of(t, r)\n", + " ...\n", + " \n", + "below_3_a = slope_of(below_3, below_3_r)\n", + "below_3_b = intercept_of(below_3, below_3_r)\n", + "above_3_a = slope_of(above_3, above_3_r)\n", + "above_3_b = intercept_of(above_3, above_3_r)\n", + "\n", + "def wait_below_3(duration):\n", + " return below_3_a * duration + below_3_b\n", + "\n", + "def wait_above_3(duration):\n", + " return above_3_a * duration + above_3_b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The plot below shows two different regression lines, one for each cluster!"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "faithful.scatter(0, 1)\n", + "plots.plot([1, 3], [wait_below_3(1), wait_below_3(3)])\n", + "plots.plot([3, 6], [wait_above_3(3), wait_above_3(6)]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 5.3**
\n", + "Write a function `predict_wait` that takes a `duration` and returns the predicted wait time using the appropriate regression line, depending on whether the duration is below 3 or greater than (or equal to) 3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "def predict_wait(duration):\n", + " \"\"\"Return the wait predicted by the appropriate one of the two regression lines above.\"\"\"\n", + " ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q5_3.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The predicted wait times for each point appear below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "faithful.with_column('predicted', faithful.apply(predict_wait, 'duration')).scatter(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Further Exploration (ungraded)**: When drawing a line through each cluster separately, we discovered two different but similar lines. Here are some natural questions to explore, if you want to continue working with these data:\n", + " * How much more accurate do we expect predictions to be using two lines instead of one? Can we measure this improvement using residuals?\n", + " * Are the lines really different, or did they just come out different due to chance because we have only a small number of observations? How could we tell?\n", + " * Could it be that the slopes of the lines are the same, but the intercepts are different? " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You're finished with lab 2! 
In order to successfully submit your assignment, follow these steps...\n", + "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. Please do this first before running the cell below,\n", + "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", + "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab02.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/3/lab02/lab02.ipynb b/materials/x18/lab/3/lab02/lab02.ipynb index 114ac62..9742ba3 100644 --- a/materials/x18/lab/3/lab02/lab02.ipynb +++ b/materials/x18/lab/3/lab02/lab02.ipynb @@ -59,7 +59,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "faithful = Table.read_table(\"faithful.csv\")\n", @@ -76,7 +78,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "duration_mean = np.mean(faithful.column(\"duration\"))\n", @@ -101,7 +105,9 @@ { 
"cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "r = np.mean(faithful_standard.column(0) * faithful_standard.column(1))\n", @@ -123,7 +129,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def plot_data_and_line(dataset, x, y, point_0, point_1):\n", @@ -159,7 +167,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "slope = ...\n", @@ -180,7 +190,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "intercept = slope*(-duration_mean) + wait_mean\n", @@ -190,7 +202,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q2_1.py')" @@ -210,7 +224,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "two_minute_predicted_waiting_time = ...\n", @@ -230,7 +246,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_1.py')" @@ -247,6 +265,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": false }, "outputs": [], @@ -273,7 +292,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "faithful_predictions = ...\n", @@ -283,7 +304,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_2.py')" @@ -302,7 +325,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": 
[], "source": [ "residual = faithful_predictions.column(1) - faithful_predictions.column(2)\n", @@ -313,7 +338,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q3_3.py')" @@ -329,7 +356,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "faithful_residuals.scatter(\"duration\", \"residual\", color=\"r\")" @@ -355,7 +384,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "faithful_residuals.scatter(\"duration\", \"wait\", label=\"actual waiting time\", color=\"blue\")\n", @@ -375,7 +406,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "zero_minute_predicted_waiting_time = ...\n", @@ -390,7 +423,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q4_1.py')" @@ -413,7 +448,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "faithful.scatter(\"duration\", \"wait\", label=\"actual waiting time\", color=\"blue\")\n", @@ -458,7 +495,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def reg_coeff(t):\n", @@ -476,7 +515,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_1.py')" @@ -528,7 +569,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_2.py')" @@ -544,7 +587,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": 
{}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "faithful.scatter(0, 1)\n", @@ -577,7 +622,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q5_3.py')" @@ -593,7 +640,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "faithful.with_column('predicted', faithful.apply(predict_wait, 'duration')).scatter(0)" @@ -659,10 +708,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 } - diff --git a/materials/x18/lab/3/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb b/materials/x18/lab/3/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb new file mode 100644 index 0000000..669974b --- /dev/null +++ b/materials/x18/lab/3/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb @@ -0,0 +1,601 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 3: Regression Inference\n", + "\n", + "Welcome to Lab 3 of Data 8.3x!\n", + "\n", + "Sometimes, the primary purpose of regression analysis is to learn something about the slope or intercept of the best-fitting line. When we use a sample of data to estimate the slope or intercept, our estimate is subject to random error, just as in the simpler case of the mean of a random sample.\n", + "\n", + "In this lab, we'll use regression to get an accurate estimate for the age of the universe, using pictures of exploding stars. Our estimate will come from a sample of all exploding stars. We'll compute a confidence interval to quantify the error caused by sampling." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell, but please don't change it.\n", + "\n", + "# These lines import the Numpy and Datascience modules.\n", + "import numpy as np\n", + "from datascience import *\n", + "\n", + "# These lines do some fancy plotting magic\n", + "import matplotlib\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "plt.style.use('fivethirtyeight')\n", + "import warnings\n", + "warnings.simplefilter('ignore', FutureWarning)\n", + "warnings.simplefilter('ignore', UserWarning)\n", + "from matplotlib import patches\n", + "from ipywidgets import interact, interactive, fixed\n", + "import ipywidgets as widgets\n", + "\n", + "# These lines load the tests.\n", + "from gofer.ok import check" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. The Age of the Universe\n", + "\n", + "### The Actual Big Bang Theory\n", + "In the early 20th century, the most popular cosmological theory suggested that the universe had always existed at a fixed size. Today, the Big Bang theory prevails: Our universe started out very small and is still expanding.\n", + "\n", + "A consequence of this is Hubble's Law, which states that every celestial object that's reasonably far away from Earth (for example, another galaxy) is moving away from us at a constant speed. If we extrapolate that motion backwards to the time when everything in the universe was in the same place, that time is (roughly) the beginning of the universe!\n", + "\n", + "Scientists have used this fact, along with measurements of the current *location* and *movement speed* of other celestial objects, to estimate when the universe started.\n", + "\n", + "The cell below simulates a universe in which our sun is the center and every other star is moving away from us. Each star starts at the same place as the sun, then moves away from it over time. 
Different stars have different directions *and speeds*; the arrows indicate the direction and speed of travel.\n", + "\n", + "Run the cell, then move the slider to see how things change over time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just run this cell. (The simulation is actually not\n", + "# that complicated; it just takes a lot of code to draw\n", + "# everything. So you don't need to read this unless you\n", + "# have time and are curious about more advanced plotting.)\n", + "\n", + "num_locations = 15\n", + "example_velocities = Table().with_columns(\n", + " \"x\", np.random.normal(size=num_locations),\n", + " \"y\", np.random.normal(size=num_locations))\n", + "start_of_time = -2\n", + "\n", + "def scatter_after_time(t, start_of_time, end_of_time, velocities, center_name, other_point_name, make_title):\n", + " max_location = 1.1*(end_of_time-start_of_time)*max(max(abs(velocities.column(\"x\"))), max(abs(velocities.column(\"y\"))))\n", + " new_locations = velocities.with_columns(\n", + " \"x\", (t-start_of_time)*velocities.column(\"x\"),\n", + " \"y\", (t-start_of_time)*velocities.column(\"y\"))\n", + " plt.scatter(make_array(0), make_array(0), label=center_name, s=100, c=\"yellow\")\n", + " plt.scatter(new_locations.column(\"x\"), new_locations.column(\"y\"), label=other_point_name)\n", + " for i in np.arange(new_locations.num_rows):\n", + " plt.arrow(\n", + " new_locations.column(\"x\").item(i),\n", + " new_locations.column(\"y\").item(i),\n", + " velocities.column(\"x\").item(i),\n", + " velocities.column(\"y\").item(i),\n", + " fc='black',\n", + " ec='black',\n", + " head_width=0.025*max_location,\n", + " lw=.15)\n", + " plt.xlim(-max_location, max_location)\n", + " plt.ylim(-max_location, max_location)\n", + " plt.gca().set_aspect('equal', adjustable='box')\n", + " plt.gca().set_position(make_array(0, 0, 1, 1))\n", + " plt.legend(bbox_to_anchor=(1.6, 
.7))\n", + " plt.title(make_title(t))\n", + " plt.show()\n", + "\n", + "interact(\n", + " scatter_after_time,\n", + " t=widgets.FloatSlider(min=start_of_time, max=5, step=.05, value=0, msg_throttle=1),\n", + " start_of_time=fixed(start_of_time),\n", + " end_of_time=fixed(5),\n", + " velocities=fixed(example_velocities),\n", + " center_name=fixed(\"our sun\"),\n", + " other_point_name=fixed(\"other star\"),\n", + " make_title=fixed(lambda t: \"The world {:01g} year{} in the {}\".format(abs(t), \"\" if abs(t) == 1 else \"s\", \"past\" if t < 0 else \"future\")));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analogy: driving\n", + "Here's an analogy to illustrate how scientists use information about stars to estimate the age of the universe.\n", + "\n", + "Suppose that at some point in the past, our friend Mei started driving in a car going at a steady speed of 60 miles per hour straight east. We're still standing where she started." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Run this cell to see a picture of Mei's locations over time.\n", + "\n", + "mei_velocity = Table().with_columns(\"x\", make_array(60), \"y\", make_array(0))\n", + "interact(\n", + " scatter_after_time,\n", + " t=widgets.FloatSlider(min=-2, max=1, step=.05, value=0, msg_throttle=1),\n", + " start_of_time=fixed(-2),\n", + " end_of_time=fixed(1),\n", + " velocities=fixed(mei_velocity),\n", + " center_name=fixed(\"Us\"),\n", + " other_point_name=fixed(\"Mei\"),\n", + " make_title=fixed(lambda t: \"Mei's position {:01g} hour{} in the {}\".format(abs(t), \"\" if abs(t) == 1 else \"s\", \"past\" if t < 0 else \"future\")));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to know how long she's been driving, but we forgot to record the time when she left. 
If we find out that she's 120 miles away, and she's been going 60 miles per hour the whole time, we can infer that she left 2 hours ago.\n", + "\n", + "One way we can compute that number is by fitting a line to a scatter plot of our locations and speeds. It turns out that the *slope* of that line is the amount of time that has passed. Run the next cell to see a picture:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just run this cell.\n", + "small_driving_example = Table().with_columns(\n", + " \"Name\", make_array(\"Us\", \"Mei\"),\n", + " \"Speed moving away from us (miles per hour)\", make_array(0, 60),\n", + " \"Current distance from us (miles)\", make_array(0, 120))\n", + "\n", + "small_driving_example.scatter(1, 2, s=200, fit_line=True)\n", + "\n", + "# Fancy magic to draw each person's name with their dot.\n", + "with_slope_indicator = small_driving_example.with_row(\n", + " [\"Slope = 2\\ hours\", small_driving_example.column(1).mean(), small_driving_example.column(2).mean()])\n", + "for i in range(with_slope_indicator.num_rows):\n", + " name = with_slope_indicator.column(0).item(i)\n", + " x = with_slope_indicator.column(1).item(i)\n", + " y = with_slope_indicator.column(2).item(i)\n", + " plt.scatter(make_array(x - 15), make_array(y + 15), s=1000*len(name), marker=\"$\\mathrm{\" + name + \"}$\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The slope of the line is 2 hours. (The units are vertical-axis units divided by horizontal-axis units, which are $\\frac{\\texttt{miles}}{\\texttt{miles} / \\texttt{hour}}$, or hours.) So that's our answer.\n", + "\n", + "Imagine that you don't know Mei's exact distance or speed, only rough estimates. Then if you drew this line, you'd get a slightly bad estimate of the time since she left. 
But if you measured the distance and speed of hundreds of people who left you at the same time going different speeds, and drew a line through them, the slope of that line would be a pretty good estimate of the time they left, even if the individual measurements weren't exactly right.\n", + "\n", + "The `drivers.csv` dataset contains the speeds and distances-from-start of 100 drivers. They all left the same starting location at the same time, driving at a fixed speed on a straight line away from the start. The measurements aren't exact, so they don't fit exactly on a line. We've created a scatter plot and drawn a line through the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just run this cell.\n", + "Table.read_table(\"drivers.csv\").scatter(0, 1, fit_line=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.1 **
\n", + "By looking at the fit line, estimate how long ago (in hours) Mei left." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Fill in the start time you infer from the above line.\n", + "driving_start_time_hours = ...\n", + "driving_start_time_hours" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "check('tests/q1_1.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Back to cosmology\n", + "To do the same thing for the universe, we need to know the distance-from-Earth and speed-away-from-Earth of many celestial objects. Using pictures taken by very accurate telescopes and a lot of physics, astronomers have been able to estimate both. It turns out that *nearby supernovae* -- stars that have recently died and exploded -- are among the best sources of this data, because they are very easy to see. This picture taken by the Hubble telescope shows an entire galaxy, with a single supernova - as bright by itself as billions of stars - at the bottom left.\n", + "\n", + "\n", + "\n", + "Our astronomical data for today will come from the [Supernova Cosmology Project](http://supernova.lbl.gov/union/) at Lawrence Berkeley Lab. The original dataset is [here](http://supernova.lbl.gov/union/figures/SCPUnion2.1_mu_vs_z.txt), with (brief) documentation [here](http://supernova.lbl.gov/union/descriptions.html#Magvsz). Each row in the table corresponds to a supernova near Earth that was observed by astronomers. From pictures like the one above, the astronomers deduced how far away each supernova was from Earth and how fast it was moving away from Earth. Their deductions were good, but not perfect.\n", + "\n", + "Run the cell below to load the data into a table called `close_novas` and make a scatter plot. 
(If you prefer, you can also use the name `close_novae`; both are correct.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just run this cell.\n", + "close_novas = Table.read_table(\"close_novas.csv\")\n", + "close_novae = close_novas\n", + "\n", + "close_novas.scatter(0, 1, fit_line=True)\n", + "close_novas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.2 **
\n", + "Looking this plot, make a guess at the age of the universe.\n", + "\n", + "**Note**: Make sure you get the units right! In case you need to know what a parsec is, it's a big unit of distance, equivalent to 30.86 trillion kilometers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Fill this in manually by examining the line above.\n", + "first_guess_universe_age_years = ...\n", + "\n", + "# This just shows your guess as a nice string, in billions of years.\n", + "\"{:,} billion years\".format(round(first_guess_universe_age_years / 1e9, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_2.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fitting the line yourself\n", + "`fit_line=True` is convenient, but we need to be able to calculate the slope as a number. Recall that the least-squares regression line for our supernova data is:\n", + "* the line\n", + "* with the smallest average (over all the supernovae we observe)\n", + "* error,\n", + "* squared,\n", + "* where the error is\n", + "\n", + "$$\\text{the supernova's actual distance from Earth} - \\text{the height of the line at that supernova's speed.}$$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.3 **
\n", + "Define a function called `errors`. It should take three arguments:\n", + "1. a table like `close_novas` (with the same column names and meanings, but not necessarily the same data)\n", + "2. the slope of a line (a number)\n", + "3. the intercept of a line (a number).\n", + "\n", + "It should return an array of the errors made when a line with that slope and intercept is used to predict distance from speed for each supernova in the given table. (The error is the actual distance minus the predicted distance.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "def errors(t, slope, intercept):\n", + " ...\n", + " return ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.4 **
\n", + "Using `errors`, compute the errors for the line with slope `16000` and intercept `0` on the `close_novas` dataset. Name that array `example_errors`. Then make a scatter plot of the errors.\n", + "\n", + "**Hint:** To make a scatter plot of the errors, plot the error for each supernova in the dataset. Put the actual speed on the horizontal axis and the error on the vertical axis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "example_errors = ...\n", + "..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_4.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should find that the errors are almost all negative. That means our line is a little bit too steep. Let's find a better one." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.5 **
\n", + "Define a function called `fit_line`. It should take a table like `close_novas` (with the same column names and meanings) as its argument. It should return an array containing the slope (as item 0) and intercept (as item 1) of the least-squares regression line predicting distance from speed for that table.\n", + "\n", + "Note: If you haven't tried to use the [`minimize` function](http://data8.org/datascience/util.html#datascience.util.minimize) yet, now is a great time to practice. Here's an [example from the textbook](https://www.inferentialthinking.com/chapters/15/3/Method_of_Least_Squares)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def fit_line(tbl):\n", + " # Your code may need more than 1 line below here.\n", + " def mse(..., ...):\n", + " ... \n", + " return ... \n", + " \n", + "# Here is an example call to your function. To test your function,\n", + "# figure out the right slope and intercept by hand.\n", + "example_table = Table().with_columns(\n", + " \"Speed (parsecs/year)\", make_array(0, 1),\n", + " \"Distance (million parsecs)\", make_array(1, 3))\n", + "fit_line(example_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_5.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.6 **
\n", + "Use your function to fit a line to `close_novas`.\n", + "\n", + "Then, set `new_errors` equal to the errors that we get calling `errors` with our new line. The cell below will graph the corresponding residual plot with a best fit line.\n", + "\n", + "Make sure that the residual plot makes sense (Hint: what qualities should the best fit line of a residual plot have?)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "best_line = ...\n", + "best_line_slope = ...\n", + "best_line_intercept = ...\n", + "\n", + "new_errors = ...\n", + "\n", + "# This code displays the residual plot, given your values for the best_line_slope and best_line_intercept\n", + "Table().with_column(\"Speed (parsecs/year)\", \n", + " close_novas.column(\"Speed (parsecs/year)\"), \n", + " \"Distance errors (million parsecs)\", \n", + " new_errors\n", + " ).scatter(0, 1, fit_line=True)\n", + "\n", + "# This just shows your answer as a nice string, in billions of years.\n", + "\"Slope: {:g} (corresponding to an estimated age of {:,} billion years)\".format(best_line_slope, round(best_line_slope/1000, 4))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That slope (multiplied by 1 million) is an estimate of the age of the universe. The current best estimate of the age of the universe (using slightly more sophisticated techniques) is 13.799 billion years. Did we get close?\n", + "\n", + "One reason our answer might be a little off is that we are using a sample of only some of the supernovae in the universe. Our sample isn't exactly random, since astronomers presumably chose the novae that were easiest to measure (or used some other nonrandom criteria). But let's assume it is. How can we produce a confidence interval for the age of the universe?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Question 1.7 **
\n", + "It's time to bootstrap so that we can quantify the variability in our estimate! Simulate 1000 resamples from `close_novas`. For each resample, compute the slope of the least-squares regression line, and multiply it by 1 million to compute an estimate of the age of the universe. Store these ages in an array called `bootstrap_ages`, and then use them to compute a 95% confidence interval for the age of the universe.\n", + "\n", + "**Note:** This might take up to a minute, and more repetitions will take even longer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "bootstrap_ages = make_array()\n", + "for i in np.arange(1000):\n", + " bootstrap_ages = ...\n", + "\n", + "lower_end = ...\n", + "upper_end = ...\n", + "Table().with_column(\"Age estimate\", bootstrap_ages*1e-9).hist(bins=np.arange(12, 16, .1), unit=\"billion years\")\n", + "print(\"95% confidence interval for the age of the universe: [{:g}, {:g}] billion years\".format(lower_end*1e-9, upper_end*1e-9))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "check('tests/q1_7.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nice work, data astronomer! You can compare your result to the [Planck project 2015 results](https://arxiv.org/pdf/1502.01589.pdf), which estimated the age of the universe to be 13.799±0.021 billion years. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You're finished with lab 3! In order to successfully submit your assignment, follow these steps...\n", + "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. 
Please do this first before running the cell below,\n", + "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", + "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For your convenience, you can run this cell to run all the tests at once!\n", + "import glob\n", + "from gofer.ok import grade_notebook\n", + "if not globals().get('__GOFER_GRADER__', False):\n", + " display(grade_notebook('lab03.ipynb', sorted(glob.glob('tests/q*.py'))))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + }, + "widgets": { + "state": { + "6c09ba2474d24e10bdd21db7b9699237": { + "views": [ + { + "cell_index": 9 + } + ] + }, + "ef0a0194fbdd498787d3894efa009a7e": { + "views": [ + { + "cell_index": 3 + } + ] + } + }, + "version": "1.2.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/materials/x18/lab/3/lab03/lab03.ipynb b/materials/x18/lab/3/lab03/lab03.ipynb index a6f7006..669974b 100644 --- a/materials/x18/lab/3/lab03/lab03.ipynb +++ b/materials/x18/lab/3/lab03/lab03.ipynb @@ -8,15 +8,17 @@ "\n", "Welcome to Lab 3 of Data 8.3x!\n", "\n", - "Sometimes, the primary purpose of regression analysis is to learn something about the slope or intercept of the best-fitting line. 
When we use a sample of data to estimate the slope or intercept, our estimate is subject to random error, just like our estimates of population means and medians.\n", + "Sometimes, the primary purpose of regression analysis is to learn something about the slope or intercept of the best-fitting line. When we use a sample of data to estimate the slope or intercept, our estimate is subject to random error, just as in the simpler case of the mean of a random sample.\n", "\n", - "In this lab, we'll use linear regression to estimate the age of the universe using pictures of exploding stars. Our estimate will come from a sample of all exploding stars. We'll compute a confidence interval to quantify the error caused by sampling." + "In this lab, we'll use regression to get an accurate estimate for the age of the universe, using pictures of exploding stars. Our estimate will come from a sample of all exploding stars. We'll compute a confidence interval to quantify the error caused by sampling." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Run this cell, but please don't change it.\n", @@ -50,7 +52,7 @@ "### The Actual Big Bang Theory\n", "In the early 20th century, the most popular cosmological theory suggested that the universe had always existed at a fixed size. Today, the Big Bang theory prevails: Our universe started out very small and is still expanding.\n", "\n", - "A consequence of this is Hubble's Law, which says that the expansion of the universe creates the appearance that every celestial object that's reasonably far away from Earth (for example, another galaxy) is moving away from us at a constant speed. 
If we extrapolate that motion backwards to the time when everything in the universe was in the same place, that time is (roughly) the beginning of the universe!\n", + "A consequence of this is Hubble's Law, which states that every celestial object that's reasonably far away from Earth (for example, another galaxy) is moving away from us at a constant speed. If we extrapolate that motion backwards to the time when everything in the universe was in the same place, that time is (roughly) the beginning of the universe!\n", "\n", "Scientists have used this fact, along with measurements of the current *location* and *movement speed* of other celestial objects, to estimate when the universe started.\n", "\n", @@ -62,7 +64,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Just run this cell. (The simulation is actually not\n", @@ -125,7 +129,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Run this cell to see a picture of Mei's locations over time.\n", @@ -154,7 +160,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Just run this cell.\n", @@ -189,7 +197,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Just run this cell.\n", @@ -207,7 +217,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Fill in the start time you infer from the above line.\n", @@ -219,6 +231,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -243,7 +256,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], 
"source": [ "# Just run this cell.\n", @@ -267,7 +282,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Fill this in manually by examining the line above.\n", @@ -280,7 +297,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_2.py')" @@ -318,6 +337,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "for_assignment_type": "student" }, "outputs": [], @@ -341,6 +361,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -352,7 +373,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_4.py')" @@ -378,17 +401,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def fit_line(tbl):\n", " # Your code may need more than 1 line below here.\n", - " # Rather than using the regression line formulas, try\n", - " # calling minimize on the mean squared error.\n", - " ...\n", - " slope = ...\n", - " intercept = ...\n", - " return make_array(slope, intercept)\n", + " def mse(..., ...):\n", + " ... \n", + " return ... \n", " \n", "# Here is an example call to your function. 
To test your function,\n", "# figure out the right slope and intercept by hand.\n", @@ -401,7 +423,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_5.py')" @@ -423,6 +447,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": false }, "outputs": [], @@ -466,7 +491,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "bootstrap_ages = make_array()\n", @@ -482,7 +509,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "check('tests/q1_7.py')" @@ -515,7 +544,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For your convenience, you can run this cell to run all the tests at once!\n", @@ -543,7 +574,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.1" }, "widgets": { "state": { @@ -568,4 +599,3 @@ "nbformat": 4, "nbformat_minor": 1 } - From 6f7a331db01122b8e5d0daff4f9ab18f162f3174 Mon Sep 17 00:00:00 2001 From: Yanay Rosen Date: Mon, 26 Nov 2018 14:39:45 -0800 Subject: [PATCH 2/8] Removed checkpoints --- .../.ipynb_checkpoints/lab00-checkpoint.ipynb | 264 --- .../.ipynb_checkpoints/lab01-checkpoint.ipynb | 1084 ------------ .../.ipynb_checkpoints/lab02-checkpoint.ipynb | 1548 ----------------- .../.ipynb_checkpoints/lab03-checkpoint.ipynb | 1071 ------------ .../.ipynb_checkpoints/lab04-checkpoint.ipynb | 1128 ------------ .../.ipynb_checkpoints/lab01-checkpoint.ipynb | 729 -------- .../.ipynb_checkpoints/lab02-checkpoint.ipynb | 664 ------- .../.ipynb_checkpoints/lab05-checkpoint.ipynb | 667 ------- .../.ipynb_checkpoints/lab06-checkpoint.ipynb | 881 ---------- 
.../.ipynb_checkpoints/lab01-checkpoint.ipynb | 772 -------- .../.ipynb_checkpoints/lab02-checkpoint.ipynb | 716 -------- .../.ipynb_checkpoints/lab03-checkpoint.ipynb | 601 ------- 12 files changed, 10125 deletions(-) delete mode 100644 materials/x18/lab/1/lab00/.ipynb_checkpoints/lab00-checkpoint.ipynb delete mode 100644 materials/x18/lab/1/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb delete mode 100644 materials/x18/lab/1/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb delete mode 100644 materials/x18/lab/1/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb delete mode 100644 materials/x18/lab/1/lab04/.ipynb_checkpoints/lab04-checkpoint.ipynb delete mode 100644 materials/x18/lab/2/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb delete mode 100644 materials/x18/lab/2/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb delete mode 100644 materials/x18/lab/2/lab05/.ipynb_checkpoints/lab05-checkpoint.ipynb delete mode 100644 materials/x18/lab/2/lab06/.ipynb_checkpoints/lab06-checkpoint.ipynb delete mode 100644 materials/x18/lab/3/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb delete mode 100644 materials/x18/lab/3/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb delete mode 100644 materials/x18/lab/3/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb diff --git a/materials/x18/lab/1/lab00/.ipynb_checkpoints/lab00-checkpoint.ipynb b/materials/x18/lab/1/lab00/.ipynb_checkpoints/lab00-checkpoint.ipynb deleted file mode 100644 index c75742c..0000000 --- a/materials/x18/lab/1/lab00/.ipynb_checkpoints/lab00-checkpoint.ipynb +++ /dev/null @@ -1,264 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 0: Introduction and Practice with Jupyter Notebooks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In Lab 0, you will learn how to navigate a Jupyter Notebook (like this one). All of the required lab assignments in this course are published as jupyter notebooks. 
You follow the instructions in the notebook to complete the assignment.\n", - "\n", - "This one isn't graded, but you should complete it anyway for practice. Let's get started!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## 1. Jupyter notebooks\n", - "This webpage is called a Jupyter notebook. A notebook is a place to write programs and view their results.\n", - "\n", - "### 1.1. Text cells\n", - "In a notebook, each rectangle containing text or code is called a *cell*.\n", - "\n", - "Text cells (like this one) can be edited by double-clicking on them. They're written in a simple format called [Markdown](http://daringfireball.net/projects/markdown/syntax) to add formatting and section headings. You don't need to learn Markdown, but you might want to.\n", - "\n", - "After you edit a text cell, select the \"run cell\" button at the top that looks like ▶| to confirm any changes." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.1.1.**
\n", - "This paragraph is in its own text cell. Try editing it so that **this** sentence is the last sentence in the paragraph, and then select the \"run cell\" ▶| button on the top. This sentence, for example, should be deleted. So should this one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"Hello, World!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And this one:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"\\N{WAVING HAND SIGN}, \\N{EARTH GLOBE ASIA-AUSTRALIA}!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The fundamental building block of Python code is an expression. Cells can contain multiple lines with multiple expressions. When you run a cell, the lines of code are executed in the order in which they appear. Every `print` expression prints a line. Run the next cell and notice the order of the output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"First this line is printed,\")\n", - "print(\"and then this one.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.2.1.**
\n", - "Change the cell above so that it prints out:\n", - "\n", - " First this line,\n", - " then the whole 🌏,\n", - " and then this one.\n", - "\n", - "*Hint:* If you're stuck on how to print the Earth symbol, try looking at the print expressions above." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.3. Writing Jupyter notebooks\n", - "You can use Jupyter notebooks for your own projects or documents. They are among the world's most popular programming environments for data science. When you make your own notebook, you'll need to create your own cells for text and code.\n", - "\n", - "To add a cell, select the + button in the menu bar. A new cell starts out as text. You can change it to a code cell by selecting it so that it's highlighted, then selecting the drop-down box next to the restart (⟳) button in the menu bar, and choosing Code instead of Markdown.\n", - "\n", - "**Question 1.3.1.**
\n", - "Add a code cell below this one. Write code in it that prints out:\n", - " \n", - " A whole new cell! \n", - " ♪🌏♪\n", - "\n", - "(That musical note symbol is like the Earth symbol. Its long-form name is `\\N{EIGHTH NOTE}`.)\n", - "\n", - "Run your cell to verify that it works." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.4. Errors\n", - "Python is a language, and like natural human languages, it has rules. It differs from natural language in two important ways:\n", - "1. The rules are *simple*. You can learn most of them in a few weeks and gain reasonable proficiency with the language in a semester.\n", - "2. The rules are *rigid*. If you're proficient in a natural language, you can understand a non-proficient speaker, glossing over small mistakes. A computer running Python code is not smart enough to do that.\n", - "\n", - "Whenever you write code, you'll make mistakes. When you run a code cell that has errors, Python will sometimes produce error messages to tell you what you did wrong.\n", - "\n", - "Errors are okay; even experienced programmers make many errors. When you make an error, you just have to find the source of the problem, fix it, and move on.\n", - "\n", - "We have made an error in the next cell. Run it and see what happens." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"This line is missing something.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You should see something like this (minus our annotations):\n", - "\n", - "\"\"/\n", - "\n", - "The last line of the error output attempts to tell you what went wrong. The *syntax* of a language is its structure, and this `SyntaxError` tells you that you have created an illegal structure. 
\"`EOF`\" means \"end of file,\" so the message is saying Python expected you to write something more (in this case, a right parenthesis) before finishing the cell.\n", - "\n", - "There's a lot of terminology in programming languages. You'll learn as you go. If you are ever having trouble understanding an error message, search the discussion forum. If you don't find an answer, post a question about the error yourself.\n", - "\n", - "Try to fix the code above so that you can run the cell and see the intended message instead of an error." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.5. The Kernel\n", - "The kernel is a program that executes the code inside your notebook and outputs the results. In the top right of your window, you can see a circle that indicates the status of your kernel. If the circle is empty (⚪), the kernel is idle and ready to execute code. If the circle is filled in (⚫), the kernel is busy running some code. \n", - "\n", - "You may run into problems where your kernel is stuck for an excessive amount of time, your notebook is very slow and unresponsive, or your kernel loses its connection. If this happens, try the following steps:\n", - "1. At the top of your screen, select **Kernel**, then **Interrupt**.\n", - "2. If that doesn't help, select **Kernel**, then **Restart**. If you do this, you will have to run your code cells from the start of your notebook up until where you paused your work.\n", - "3. If that doesn't help, restart your server. First, save your work by selecting **File** at the top left of your screen, then **Save and Checkpoint**. Next, select **Control Panel** at the top right. Choose **Stop My Server** to shut it down, then **My Server** to start it back up. Then, navigate back to the notebook you were working on." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.6. 
Completing a lab\n", - "All assignments in the course will be distributed as notebooks like this one. At the top of each assignment, you'll see a cell like the one below that imports autograder tests. Run it to import the autograder tests.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Don't change this cell, just run it\n", - "# Import autograder tests\n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When you finish a question, you need to check your answer by running the check command below. It's OK to grade multiple times; Gofer will only try to grade your final submission for each question. There are no hidden autograder tests. If you pass all the given autograder tests for a question, you will receive full credit for that question." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check(\"tests/q0.py\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The notebook resides on a server that is run by the course staff, and so we have access to it as well. Once you're finished with a lab, use the File menu within the notebook page (below the Jupyter logo) to \"Save and Checkpoint\" and you're done. You may also check your notebook in its entirety with the following command." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab00.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/1/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb b/materials/x18/lab/1/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb deleted file mode 100644 index 75b2c77..0000000 --- a/materials/x18/lab/1/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb +++ /dev/null @@ -1,1084 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 1: Introduction to Python\n", - "\n", - "Welcome to Lab 1! Each week you will complete a lab assignment like this one. In this lab, you'll get started with the Python programming language through numbers, names, and expressions.\n", - "\n", - "As you go, please regularly select **Save and Checkpoint** from the `File` menu below the Jupyter logo to save your work." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Numbers\n", - "\n", - "Quantitative information arises everywhere in data science. In addition to representing commands to print out lines, expressions can represent numbers and methods of combining numbers. The expression `3.2500` evaluates to the number 3.25. 
(Run the cell and see.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "3.2500" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that we didn't have to `print`. When you run a notebook cell, if the last line has a value, then Jupyter helpfully prints out that value for you. However, it won't print out prior lines automatically. If you want to print out a prior line, you need to add the `print` statement. Run the cell below to check." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "print(2)\n", - "3\n", - "4" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Above, you should see that 4 is the value of the last expression, 2 is printed, but 3 is lost forever because it was neither printed nor last.\n", - "\n", - "You don't want to print everything all the time anyway. But if you feel sorry for 3, change the cell above to print it." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.1. Arithmetic\n", - "The line in the next cell subtracts. Its value is what you'd expect. Run it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "3.25 - 1.5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Many basic arithmetic operations are built in to Python. The textbook section on [Expressions](http://www.inferentialthinking.com/chapters/03/1/expressions.html) describes all the arithmetic operators used in the course. The common operator that differs from typical math notation is `**`, which raises one number to the power of the other. So, `2**3` stands for $2^3$ and evaluates to 8. 
\n", - "\n", - "The order of operations is what you learned in elementary school, and Python also has parentheses. For example, compare the outputs of the cells below. Use parentheses for a happy new year!" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "-724.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "3+6*5-6*3**2*2**3/4*7" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "2018.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "3+(6*5-(6*3))**2*((2**3)/4*7)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In standard math notation, the first expression is\n", - "\n", - "$$3 + 6 \\times 5 - 6 \\times 3^2 \\times \\frac{2^3}{4} \\times 7,$$\n", - "\n", - "while the second expression is\n", - "\n", - "$$3 + (6 \\times 5 - (6 \\times 3))^2 \\times (\\frac{(2^3)}{4} \\times 7).$$\n", - "\n", - "**Question 1.1.1.**
Write a Python expression in this next cell that's equal to $5 \\times (3 \\frac{10}{11}) - 49 \\frac{1}{3} + 2^{.5 \\times 22} - \\frac{7}{33}$. That's five times three and ten elevenths, minus 49 and a third, plus two to the power of half of 22, minus 7 33rds. By \"$3 \\frac{10}{11}$\" we mean $3+\\frac{10}{11}$, not $3 \\times \\frac{10}{11}$.\n", - "\n", - "Replace the ellipses (`...`) with your expression. Try to use parentheses only when necessary.\n", - "\n", - "*Hint:* The correct output should be a familiar number." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Names\n", - "In natural language, we have terminology that lets us quickly reference very complicated concepts. We don't say, \"That's a large mammal with brown fur and sharp teeth!\" Instead, we just say, \"Bear!\"\n", - "\n", - "Similarly, an effective strategy for writing code is to define names for data as we compute it, like a lawyer would define terms for complex ideas at the start of a legal document to simplify the rest of the writing.\n", - "\n", - "In Python, we do this with *assignment statements*. An assignment statement has a name on the left side of an `=` sign and an expression to be evaluated on the right." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "ten = 3 * 2 + 4" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When you run that cell, Python first evaluates the first line. It computes the value of the expression `3 * 2 + 4`, which is the number 10. Then it gives that value the name `ten`. 
At that point, the code in the cell is done running.\n", - "\n", - "After you run that cell, the value 10 is bound to the name `ten`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "ten" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The statement `ten = 3 * 2 + 4` is not asserting that `ten` is already equal to `3 * 2 + 4`, as we might expect by analogy with math notation. Rather, that line of code changes what `ten` means; it now refers to the value 10, whereas before it meant nothing at all.\n", - "\n", - "If the designers of Python had been ruthlessly pedantic, they might have made us write\n", - "\n", - " define the name ten to hereafter have the value of 3 * 2 + 4 \n", - "\n", - "instead. You will probably appreciate the brevity of \"`=`\"! But keep in mind that this is the real meaning.\n", - "\n", - "**Question 2.1.**
Try writing code that uses a name (like `eleven`) that hasn't been assigned to anything. You'll see an error!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A common pattern in Jupyter notebooks is to assign a value to a name and then immediately evaluate the name in the last line in the cell so that the value is displayed as output. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "close_to_pi = 355/113\n", - "close_to_pi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Another common pattern is that a series of lines in a single cell will build up a complex computation in stages, naming the intermediate results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "bimonthly_salary = 840\n", - "monthly_salary = 2 * bimonthly_salary\n", - "number_of_months_in_a_year = 12\n", - "yearly_salary = number_of_months_in_a_year * monthly_salary\n", - "yearly_salary" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Names in Python can have letters (upper- and lower-case letters are both okay and count as different letters), underscores, and numbers. The first character can't be a number (otherwise a name might look like a number). And names can't contain spaces, since spaces are used to separate pieces of code from each other.\n", - "\n", - "Other than those rules, what you name something doesn't matter *to Python*. 
For example, this cell does the same thing as the above cell, except everything has a different name:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "a = 840\n", - "b = 2 * a\n", - "c = 12\n", - "d = c * b\n", - "d" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**However**, names are very important for making your code *readable* to yourself and others. The cell above is shorter, but it's totally useless without an explanation of what it does.\n", - "\n", - "According to a famous joke among computer scientists, naming things is one of the two hardest problems in computer science. (The other two are cache invalidation and \"off-by-one\" errors. And people say computer scientists have an odd sense of humor...)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.2.**
Assign the name `seconds_in_a_decade` to the number of seconds between midnight January 1, 2010 and midnight January 1, 2020. Use Python to perform any required arithmetic.\n", - "\n", - "*Hint:* If you're stuck, the next section shows you how to get hints." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# Change the next line so that it computes the number of\n", - "# seconds in a decade and assigns that number the name\n", - "# seconds_in_a_decade.\n", - "seconds_in_a_decade = ...\n", - "\n", - "# We've put this line in this cell so that it will print\n", - "# the value you've given to seconds_in_a_decade when you\n", - "# run it. You don't need to change this.\n", - "seconds_in_a_decade" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1. Checking your code\n", - "Now that you know how to name things, you can start using the built-in *tests* to check whether your work is correct. Try not to change the contents of the test cells. \n", - "\n", - "The cell below appears only once in the notebook and loads all of the tests so that they can be run later. You can load all of the tests before you answer all questions in the notebook. You will run tests as you go to check your work along the way, and you can also run all of the tests at the end to make sure that you will receive full credit on the lab." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# These lines load the tests.\n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Running the following cell will test whether you have assigned `seconds_in_a_decade` correctly in Question 2.2. \n", - "\n", - "Sometimes the tests will give hints about what went wrong. 
If the test doesn't pass, read the output, adjust your answer to the question, run the answer cell again to update the name `seconds_in_a_decade`, then run this test cell again.\n", - "\n", - "Sometimes the tests will tell you the answer. Rather than copying the answer, try to understand how it was reached. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# Test cell; please do not change!\n", - "check('tests/q22.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.2. Comments\n", - "You may have noticed this line in the cell above:\n", - "\n", - " # Test cell; please do not change!\n", - "\n", - "That is called a *comment*. It doesn't make anything happen in Python; Python ignores anything on a line after a #. Instead, it's there to communicate something about the code to you, the human reader. Comments are extremely useful.\n", - "\n", - "\"comic" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.3. Application: A physics experiment\n", - "\n", - "On the Apollo 15 mission to the Moon, astronaut David Scott famously replicated Galileo's physics experiment in which he showed that gravity accelerates objects of different mass at the same rate. Because there is no air resistance for a falling object on the surface of the Moon, even two objects with very different masses and densities should fall at the same rate. David Scott compared a feather and a hammer.\n", - "\n", - "You can run the following cell to watch a video of the experiment." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "from IPython.display import YouTubeVideo\n", - "# The original URL is:\n", - "# https://www.youtube.com/watch?v=U7db6ZeLR5s\n", - "YouTubeVideo(\"U7db6ZeLR5s\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's the transcript of the video:\n", - "\n", - "**167:22:06 Scott**: Well, in my left hand, I have a feather; in my right hand, a hammer. And I guess one of the reasons we got here today was because of a gentleman named Galileo, a long time ago, who made a rather significant discovery about falling objects in gravity fields. And we thought where would be a better place to confirm his findings than on the Moon. And so we thought we'd try it here for you. The feather happens to be, appropriately, a falcon feather for our Falcon. And I'll drop the two of them here and, hopefully, they'll hit the ground at the same time. \n", - "\n", - "**167:22:43 Scott**: How about that!\n", - "\n", - "**167:22:45 Allen**: How about that! (Applause in Houston)\n", - "\n", - "**167:22:46 Scott**: Which proves that Mr. Galileo was correct in his findings." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Newton's Law.** Using this footage, we can also attempt to confirm another famous bit of physics: Newton's law of universal gravitation. Newton's laws predict that any object dropped near the surface of the Moon should fall\n", - "\n", - "$$\\frac{1}{2} G \\frac{M}{R^2} t^2 \\text{ meters}$$\n", - "\n", - "after $t$ seconds, where $G$ is a universal constant, $M$ is the moon's mass in kilograms, and $R$ is the moon's radius in meters. 
So if we know $G$, $M$, and $R$, then Newton's laws let us predict how far an object will fall over any amount of time.\n", - "\n", - "To verify the accuracy of this law, we will calculate the difference between the predicted distance the hammer drops and the actual distance. (If they are different, it might be because Newton's laws are wrong, or because our measurements are imprecise, or because there are other factors affecting the hammer for which we haven't accounted.)\n", - "\n", - "Someone studied the video and estimated that the hammer was dropped 113 cm from the surface. Counting frames in the video, the hammer falls for 1.2 seconds (36 frames)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.3.1.**
Complete the code in the next cell to fill in the *data* from the experiment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# t, the duration of the fall in the experiment, in seconds.\n", - "# Fill this in.\n", - "time = ...\n", - "\n", - "# The estimated distance the hammer actually fell, in meters.\n", - "# Fill this in.\n", - "estimated_distance_m = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "check('tests/q231.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.3.2.**
Now, complete the code in the next cell to compute the difference between the predicted and estimated distances (in meters) that the hammer fell in this experiment.\n", - "\n", - "This just means translating the formula above ($\\frac{1}{2}G\\frac{M}{R^2}t^2$) into Python code. You'll have to replace each variable in the math formula with the name we gave that number in Python code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# First, we've written down the values of the 3 universal\n", - "# constants that show up in Newton's formula.\n", - "\n", - "# G, the universal constant measuring the strength of gravity.\n", - "gravity_constant = 6.674 * 10**-11\n", - "\n", - "# M, the moon's mass, in kilograms.\n", - "moon_mass_kg = 7.34767309 * 10**22\n", - "\n", - "# R, the radius of the moon, in meters.\n", - "moon_radius_m = 1.737 * 10**6\n", - "\n", - "# The distance the hammer should have fallen over the\n", - "# duration of the fall, in meters, according to Newton's\n", - "# law of gravity. The text above describes the formula\n", - "# for this distance given by Newton's law.\n", - "# **YOU FILL THIS PART IN.**\n", - "predicted_distance_m = ...\n", - "\n", - "# Here we've computed the difference between the predicted\n", - "# fall distance and the distance we actually measured.\n", - "# If you've filled in the above code, this should just work.\n", - "difference = predicted_distance_m - estimated_distance_m\n", - "difference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "check('tests/q232.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Calling functions\n", - "\n", - "The most common way to combine or manipulate values in Python is by calling functions. 
Python comes with many built-in functions that perform common operations.\n", - "\n", - "For example, the `abs` function takes a single number as its argument and returns the absolute value of that number. The absolute value of a number is its distance from 0 on the number line, so `abs(5)` is 5 and `abs(-5)` is also 5." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "abs(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "abs(-5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3.1. Application: Computing walking distances\n", - "Chunhua is on the corner of 7th Avenue and 42nd Street in Midtown Manhattan, and she wants to know far she'd have to walk to get to Gramercy School on the corner of 10th Avenue and 34th Street.\n", - "\n", - "She can't cut across blocks diagonally, since there are buildings in the way. She has to walk along the sidewalks. Using the map below, she sees she'd have to walk 3 avenues (long blocks) and 8 streets (short blocks). In terms of the given numbers, she computed 3 as the difference between 7 and 10, *in absolute value*, and 8 similarly. \n", - "\n", - "Chunhua also knows that blocks in Manhattan are all about 80m by 274m (avenues are farther apart than streets). So in total, she'd have to walk $(80 \\times |42 - 34| + 274 \\times |7 - 10|)$ meters to get to the park.\n", - "\n", - "\"visual\n", - "\n", - "**Question 3.1.1.**
Finish the line `num_avenues_away = ...` in the next cell so that the cell calculates the distance Chunhua must walk and gives it the name `manhattan_distance`. Everything else has been filled in for you. **Use the `abs` function.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# Here's the number of streets away:\n", - "num_streets_away = abs(42-34)\n", - "\n", - "# Compute the number of avenues away in a similar way:\n", - "num_avenues_away = ...\n", - "\n", - "street_length_m = 80\n", - "avenue_length_m = 274\n", - "\n", - "# Now we compute the total distance Chunhua must walk.\n", - "manhattan_distance = street_length_m*num_streets_away + avenue_length_m*num_avenues_away\n", - "\n", - "# We've included this line so that you see the distance\n", - "# you've computed when you run this cell. You don't need\n", - "# to change it, but you can if you want.\n", - "manhattan_distance" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Be sure to run the next cell to test your code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "check('tests/q311.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Multiple arguments\n", - "Some functions take multiple arguments, separated by commas. For example, the built-in `max` function returns the maximum argument passed to it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "max(2, -3, 4, -5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Understanding nested expressions\n", - "Function calls and arithmetic expressions can themselves contain expressions. 
You saw an example in the last question:\n", - "\n", - " abs(42-34)\n", - "\n", - "has 2 number expressions in a subtraction expression in a function call expression. And you probably wrote something like `abs(7-10)` to compute `num_avenues_away`.\n", - "\n", - "Nested expressions can turn into complicated-looking code. However, the way in which complicated expressions break down is very regular.\n", - "\n", - "Suppose we are interested in heights that are very unusual. We'll say that a height is unusual to the extent that it's far away on the number line from the average human height. [An estimate](http://press.endocrine.org/doi/full/10.1210/jcem.86.9.7875?ck=nck&) of the average adult human height (averaging, we hope, over all humans on Earth today) is 1.688 meters.\n", - "\n", - "So if Aditya is 1.21 meters tall, then his height is $|1.21 - 1.688|$, or $.478$, meters away from the average. Here's a picture of that:\n", - "\n", - "\"number\n", - "\n", - "And here's how we'd write that in one line of Python code:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "abs(1.21 - 1.688)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What's going on here? `abs` takes just one argument, so the stuff inside the parentheses is all part of that *single argument*. Specifically, the argument is the value of the expression `1.21 - 1.688`. The value of that expression is `-.478`. That value is the argument to `abs`. The absolute value of that is `.478`, so `.478` is the value of the full expression `abs(1.21 - 1.688)`.\n", - "\n", - "Picture simplifying the expression in several steps:\n", - "\n", - "1. `abs(1.21 - 1.688)`\n", - "2. `abs(-.478)`\n", - "3. `.478`\n", - "\n", - "In fact, that's basically what Python does to compute the value of the expression." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.1.**
Say that Botan's height is 1.85 meters. In the next cell, use `abs` to compute the absolute value of the difference between Botan's height and the average human height. Give that value the name `botan_distance_from_average_m`.\n", - "\n", - "\"number" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# Replace the ... with an expression to compute the absolute\n", - "# value of the difference between Botan's height (1.85m) and\n", - "# the average human height.\n", - "botan_distance_from_average_m = ...\n", - "\n", - "# Again, we've written this here so that the distance you\n", - "# compute will get printed when you run this cell.\n", - "botan_distance_from_average_m" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "check('tests/q41.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4.1. More nesting\n", - "Now say that we want to compute the most unusual height among Aditya's and Botan's heights. We'll use the function `max`, which (again) takes two numbers as arguments and returns the larger of the two arguments. 
Combining that with the `abs` function, we can compute the biggest distance from the average among the two heights:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# Just read and run this cell.\n", - "\n", - "aditya_height_m = 1.21\n", - "botan_height_m = 1.85\n", - "average_adult_human_height_m = 1.688\n", - "\n", - "# The biggest distance from the average human height, among the two heights:\n", - "biggest_distance_m = max(abs(aditya_height_m - average_adult_human_height_m), abs(botan_height_m - average_adult_human_height_m))\n", - "\n", - "# Print out our results in a nice readable format:\n", - "print(\"The biggest distance from the average height among these two people is\", biggest_distance_m, \"meters.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The line where `biggest_distance_m` is computed looks complicated, but we can break it down into simpler components just like we did before.\n", - "\n", - "The basic recipe is repeated simplification of small parts of the expression:\n", - "* We start with the simplest components whose values we know, like plain names or numbers. (Examples: `aditya_height_m` or `5`.)\n", - "* **Find a simple-enough group of expressions:** We look for a group of simple expressions that are directly connected to each other in the code, for example by arithmetic or as arguments to a function call.\n", - "* **Evaluate that group:** We evaluate the arithmetic expressions or function calls they're part of, and replace the whole group with whatever we compute. (Example: `aditya_height_m - average_adult_human_height_m` becomes `-.478`.)\n", - "* **Repeat:** We continue this process, using the values of the glommed-together stuff as our new basic components. 
(Example: `abs(-.478)` becomes `.478`, and `max(.478, .162)` later becomes `.478`.)\n", - "* We keep doing that until we've evaluated the whole expression.\n", - "\n", - "You can run the next cell to see a slideshow of that process." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "from IPython.display import IFrame\n", - "IFrame('https://docs.google.com/presentation/d/1urkX-nRsD8VJvcOnJsjmCy0Jpv752Ssn5Pphg2sMC-0/embed?start=false&loop=false&delayms=3000', 800, 600)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ok, your turn. \n", - "\n", - "**Question 4.1.1.**
Given the heights of the Splash Triplets from the Golden State Warriors, write an expression that computes the smallest difference between any two of the three heights. Your expression shouldn't have any numbers in it, only function calls and the names `klay`, `steph`, and `kevin`. Give the value of your expression the name `min_height_difference`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# The three players' heights, in meters:\n", - "klay = 2.01 # Klay Thompson is 6'7\"\n", - "steph = 1.91 # Steph Curry is 6'3\"\n", - "kevin = 2.06 # Kevin Durant is officially 6'9\", but many suspect that he is taller.\n", - " # (Further complicating matters, membership of the \"Splash Triplets\" \n", - " # is disputed, since it was originally used in reference to \n", - " # Klay Thompson, Steph Curry, and Draymond Green.)\n", - "\n", - "# We'd like to look at all 3 pairs of heights, compute the absolute\n", - "# difference between each pair, and then find the smallest of those\n", - "# 3 absolute differences. This is left to you! If you're stuck,\n", - "# try computing the value for each step of the process (like the\n", - "# difference between Klay's height and Steph's height) on a separate\n", - "# line and giving it a name (like klay_steph_height_diff).\n", - "min_height_difference = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "check('tests/q411.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Tables" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A website called [Gapminder](https://www.gapminder.org/) collects a large variety of measurements of human health, education, and progress. 
Each measurement is published in a table that has one row per country and one column per year, describing how the measurement varies over time and place.\n", - "\n", - "For example, [this table](https://docs.google.com/spreadsheets/d/1kmnYQzXLGVF9RbKB3Y-WuUsJFumnE4s2UWdmlskv6r4/pub#) describes the average number of years of school attended by all women 25 and older. The table has a row for each of 175 countries and a column for each year from 1970 through 2009. The data were estimated for a study by the [Institute for Health Metrics and Evaluation](http://www.healthmetricsandevaluation.org/) called \"Increased educational attainment and its impact on child mortality: a systematic analysis in 175 countries from 1970 to 2009\" ([link](http://www.healthmetricsandevaluation.org/resources/datasets/2010/education_attainment/education_attainment.html&sa=D&ust=1522644678563000&usg=AFQjCNG-Rn_hO868jLLBz6FRLT8LSqwUVA)).\n", - "\n", - "To load tables into Python, you must first import the `datascience` module. The second line below makes sure that charts appear on the screen when you create them. You only need to execute these lines once per notebook (and each time you restart your kernel)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Don't change this cell\n", - "from datascience import *\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, run the next cell in order to load the table describing years of school attended by women around the world and over time. Only the first 10 rows of the table will be displayed." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "school = Table.read_table('school.csv')\n", - "school" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.1.**\n", - "Assign the name `top_1970` to a two-column table that has the column of country names (labeled `\"Row Labels\"`) and the years in school in 1970, sorted by the second column in decreasing order. Notice the large difference between the country with the most average years of school and the rest in the top 10.\n", - "\n", - "*Hint*: Even though 1970 is a number, treat it as text by placing it within quotation marks when using it as a label. For example, `school.select(\"1970\")` rather than `school.select(1970)`. Column labels are always text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "top_1970 = ...\n", - "top_1970" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q51.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can create a bar chart of all the countries in the data set using the expression below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], - "source": [ - "top_1970.barh('Row Labels')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.2** Now, to see how much these numbers have changed, assign `top_1970_with_2009` to a table with the rows in the same order, but include a third column for 2009 as well. The differences between countries are much smaller in 2009." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "top_1970_with_2009 = ...\n", - "top_1970_with_2009" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q52.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A bar chart for this three-column table will compare 1970 to 2009 for each country. Everywhere in the world, the average number of years that women attend school has increased, in some cases dramatically!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], - "source": [ - "top_1970_with_2009.barh('Row Labels')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The [Gapminder data browser](https://www.gapminder.org/data/) includes many other tables that you can explore as well. For more information on how to load a table from the web, try the course discussion forum." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Completion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Congratulations, you're done with lab 1! You can check that all tests pass by running the next cell. If all the tests are passing in your notebook when we score everybody's assignment, then you will receive full credit." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab01.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, select **Save and Checkpoint** from the `File` menu below the Jupyter logo to save your work." - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/1/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb b/materials/x18/lab/1/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb deleted file mode 100644 index d5184af..0000000 --- a/materials/x18/lab/1/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb +++ /dev/null @@ -1,1548 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 2: Data Types, Arrays, and Tables\n", - "Welcome to Lab 2! \n", - "\n", - "Last time, we had our first look at Python and Jupyter notebooks. So far, we've only used Python to manipulate numbers. There's a lot more to life than numbers, so Python lets us represent many other types of data in programs.\n", - "\n", - "In this lab, you'll first see how to represent and manipulate another fundamental type of data: text. A piece of text is called a *string* in Python.\n", - "\n", - "You'll also see how to invoke *methods*. 
A method is very similar to a function. Calling a method looks different because the method is tied to a particular piece of data.\n", - "\n", - "Last, you'll learn more about working with datasets in Python." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, initialize the grader. Each time you come back to this site to work on the lab, you will need to run this cell again." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Review: The building blocks of Python code\n", - "\n", - "The two building blocks of Python code are *expressions* and *statements*. An **expression** is a piece of code that\n", - "\n", - "* is self-contained, meaning it would make sense to write it on a line by itself, and\n", - "* usually has a value.\n", - "\n", - "\n", - "Here are two expressions that both evaluate to 3\n", - "\n", - " 3\n", - " 5 - 2\n", - " \n", - "One important form of an expression is the **call expression**, which first names a function and then describes its arguments. The function returns some value, based on its arguments. 
Some important mathematical functions are\n", - "\n", - "| Function | Description |\n", - "|----------|---------------------------------------------------------------|\n", - "| `abs` | Returns the absolute value of its argument |\n", - "| `max` | Returns the maximum of all its arguments |\n", - "| `min` | Returns the minimum of all its arguments |\n", - "| `pow` | Raises its first argument to the power of its second argument |\n", - "| `round` | Rounds its argument to the nearest integer |\n", - "\n", - "Here are two call expressions that both evaluate to 3\n", - "\n", - " abs(2 - 5)\n", - " max(round(2.8), min(pow(2, 10), -1 * pow(2, 10)))\n", - "\n", - "All these expressions but the first are **compound expressions**, meaning that they are actually combinations of several smaller expressions. `5 - 2` combines the expressions `5` and `2` by subtraction. In this case, `5` and `2` are called **subexpressions** because they're expressions that are part of a larger expression. Any expression can be used as part of a larger expression.\n", - "\n", - "A **statement** is a piece of code that *makes something happen* rather than *having a value*. For example, an **assignment statement** assigns a value to a name. \n", - "\n", - "Every assignment statement has one `=` sign. The whole statement is executed by **evaluating the expression on the right-hand side** of the equals sign and then **assigning its value to the name on the left-hand side**. Here are some assignment statements:\n", - " \n", - " height = 1.3\n", - " the_number_five = abs(-5)\n", - " absolute_height_difference = abs(height - 1.688)\n", - "\n", - "A key idea in programming is that large, interesting things can be built by combining many simple, uninteresting things. The key to understanding a complicated piece of code is breaking it down into its simple components.\n", - "\n", - "For example, a lot is going on in the last statement above, but it's really just a combination of a few things. 
This picture describes what's going on.\n", - "\n", - "\"Explanation\n", - "\n", - "Any names that you assign in one cell are available in later cells and can be used in place of the value assigned to them." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.1.**
In the next cell, assign the name `new_year` to the larger number among the following two numbers:\n", - "\n", - "1. the absolute value of $2^{5}-2^{11}-2^{1}-2^{0}$, and \n", - "2. $5 \\times 13 \\times 31 + 4$.\n", - "\n", - "Try to use just one statement (one line of code)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "new_year = ...\n", - "new_year" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check your work by executing the next cell." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q11.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Text\n", - "Programming doesn't just concern numbers. Text is one of the most common types of values used in programs. \n", - "\n", - "A snippet of text is represented by a **string value** in Python. The word \"*string*\" is a programming term for a sequence of characters. A string might contain a single character, a word, a sentence, or a whole book.\n", - "\n", - "To distinguish text data from actual code, we demarcate strings by putting quotation marks around them. Single quotes (`'`) and double quotes (`\"`) are both valid, but the types of opening and closing quotation marks must match. The contents can be any sequence of characters, including numbers and symbols. \n", - "\n", - "We've seen strings before in `print` statements. Below, two different strings are passed as arguments to the `print` function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"I <3\", 'Data Science')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Just like names can be given to numbers, names can be given to string values. 
The names and strings aren't required to be similar in any way. Any name can be assigned to any string." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "one = 'two'\n", - "plus = '*'\n", - "print(one, plus, one)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.1.**
Yuri Gagarin was the first person to travel through outer space. When he emerged from his capsule upon landing on Earth, he [reportedly](https://en.wikiquote.org/wiki/Yuri_Gagarin) had the following conversation with a woman and girl who saw the landing:\n", - "\n", - " The woman asked: \"Can it be that you have come from outer space?\"\n", - " Gagarin replied: \"As a matter of fact, I have!\"\n", - "\n", - "The cell below contains unfinished code. Fill in the `...`s so that it prints out this conversation *exactly* as it appears above." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "woman_asking = ...\n", - "woman_quote = '\"Can it be that you have come from outer space?\"'\n", - "gagarin_reply = 'Gagarin replied:'\n", - "gagarin_quote = ...\n", - "\n", - "print(woman_asking, woman_quote)\n", - "print(gagarin_reply, gagarin_quote)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q21.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1. String Methods" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Strings can be transformed using **methods**, which are functions that involve an existing string and some other arguments. One example is the `replace` method, which replaces all instances of some part of a string with some alternative. \n", - "\n", - "A method is invoked on a string by placing a `.` after the string value, then the name of the method, and finally parentheses containing the arguments. Here's a sketch, where the `<` and `>` symbols aren't part of the syntax; they just mark the boundaries of sub-expressions.\n", - "\n", - " <expression that evaluates to a string>.<method name>(<argument>, <argument>, ...)\n", - "\n", - "Try to predict the output of these examples, then execute them." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Replace one letter\n", - "'Hello'.replace('H', 'C')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Replace a sequence of letters, which appears twice\n", - "'hitchhiker'.replace('hi', 'ma')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once a name is bound to a string value, methods can be invoked on that name as well. The name is still bound to the original string, so a new name is needed to capture the result. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sharp = 'edged'\n", - "hot = sharp.replace('ed', 'ma')\n", - "print('sharp:', sharp)\n", - "print('hot:', hot)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can call functions on the results of other functions. For example,\n", - "\n", - " max(abs(-5), abs(3))\n", - "\n", - "has value 5. Similarly, you can invoke methods on the results of other method (or function) calls." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Calling replace on the output of another call to replace\n", - "'train'.replace('t', 'ing').replace('in', 'de')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's a picture of how Python evaluates a \"chained\" method call like that:\n", - "\n", - "\"In" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.1.1.**
Assign strings to the names `you` and `this` so that the final expression evaluates to a 10-letter English word with three double letters in a row.\n", - "\n", - "*Hint:* The call to `print` is there to print out the intermediate result called `the`. This should be an English word with two double letters in a row.\n", - "\n", - "*Hint 2:* Run the tests if you're stuck. They'll give you some hints." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "you = ...\n", - "this = ...\n", - "a = 'beeper'\n", - "the = a.replace('p', you) \n", - "print('the:', the)\n", - "the.replace('bee', this)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q211.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Other string methods do not take any arguments at all, because the original string is all that's needed to compute the result. In these cases, parentheses are still needed, but there's nothing in between the parentheses. Here are some methods that take no arguments:\n", - "\n", - "|Method name|Value|\n", - "|-|-|\n", - "|`lower`|a lowercased version of the string|\n", - "|`upper`|an uppercased version of the string|\n", - "|`capitalize`|a version with the first letter capitalized|\n", - "|`title`|a version with the first letter of every word capitalized||\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "'unIverSITy of caliFORnia'.title()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All these string methods are useful, but most programmers don't memorize their names or how to use them. Instead, people usually just search the internet for documentation and examples. 
A complete [list of string methods](https://docs.python.org/3/library/stdtypes.html#string-methods) appears in the Python language documentation. [Stack Overflow](http://stackoverflow.com) has a huge database of answered questions that often demonstrate how to use these methods to achieve various ends." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.2. Converting to and from Strings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Strings and numbers are different *types* of values, even when a string contains the digits of a number. For example, evaluating the following cell causes an error because an integer cannot be added to a string." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "8 + \"8\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, there are built-in functions to convert numbers to strings and strings to numbers. \n", - "\n", - "|Function name|Effect|Example|\n", - "|-|-|-|\n", - "|`int` |Converts a string of digits and perhaps a negative sign to an integer (`int`) value|`int(\"42\")`|\n", - "|`float`|Converts a string of digits and perhaps a negative sign and decimal point to a decimal (`float`) value|`float(\"4.2\")`|\n", - "|`str` | Converts any value to a string (`str`) value|`str(42)`|" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Try to predict what the following cell will evaluate to, then evaluate it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "8 + int(\"8\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Suppose you're writing a program that looks for dates in a text, and you want your program to find the amount of time that elapsed between two years it has identified. 
It doesn't make sense to subtract two texts, but you can first convert the text containing the years into numbers.\n", - "\n", - "**Question 2.2.1.**
Finish the code below to compute the number of years that elapsed between `one_year` and `another_year`. Don't just write the numbers `1618` and `1648` (or `30`); use a conversion function to turn the given text data into numbers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Some text data:\n", - "one_year = \"1618\"\n", - "another_year = \"1648\"\n", - "\n", - "# Complete the next line. Note that we can't just write:\n", - "# another_year - one_year\n", - "# If you don't see why, try seeing what happens when you\n", - "# write that here.\n", - "difference = ...\n", - "difference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q221.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.2.2.** Use `replace` and `int` together to compute the difference between the year 753 BC ([the founding of Rome](https://en.wikipedia.org/wiki/Ancient_Rome)) and the year 410 AD ([the sack of Rome](https://en.wikipedia.org/wiki/Sack_of_Rome_(410))). Try not to use any numbers in your solution, but instead manipulate the strings that are provided.\n", - "\n", - "*Hint*: It's ok to be off by one year. In historical calendars, there is no year zero, but astronomical calendars do include [year zero](https://en.wikipedia.org/wiki/Year_zero) to simplify calculations." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "founded = 'BC 753'\n", - "sacked = 'AD 410'\n", - "start = ...\n", - "end = ...\n", - "print('Ancient Rome lasted for about', end-start, 'years from', founded, 'to', sacked)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q222.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.3. Strings as function arguments\n", - "\n", - "String values, like numbers, can be arguments to functions and can be returned by functions. The function `len` takes a single string as its argument and returns the number of characters in the string: its **len**gth. \n", - "\n", - "Note that it doesn't count *words*. `len(\"one small step for man\")` is 22, not 5.\n", - "\n", - "**Question 2.3.1.**
Use `len` to find out the number of characters in the very long string in the next cell. (It's the first sentence of the English translation of the French [Declaration of the Rights of Man](http://avalon.law.yale.edu/18th_century/rightsof.asp).) The length of a string is the total number of characters in it, including things like spaces and punctuation. Assign `sentence_length` to that number." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "a_very_long_sentence = \"The representatives of the French people, organized as a National Assembly, believing that the ignorance, neglect, or contempt of the rights of man are the sole cause of public calamities and of the corruption of governments, have determined to set forth in a solemn declaration the natural, unalienable, and sacred rights of man, in order that this declaration, being constantly before all the members of the Social body, shall remind them continually of their rights and duties; in order that the acts of the legislative power, as well as those of the executive power, may be compared at any moment with the objects and purposes of all political institutions and may thus be more respected, and, lastly, in order that the grievances of the citizens, based hereafter upon simple and incontestable principles, shall tend to the maintenance of the constitution and redound to the happiness of all.\"\n", - "sentence_length = ...\n", - "sentence_length" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q231.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Importing code\n", - "\n", - "> What has been will be again, \n", - "> what has been done will be done again; \n", - "> there is nothing new under the sun.\n", - "\n", - "Most programming involves work that is very similar to work that has been done before. Since writing code is time consuming, it's good to rely on others' published code when you can. Rather than copy-pasting, Python allows us to **import** other code, creating a **module** that contains all of the names created by that code.\n", - "\n", - "Python includes many useful modules that are just an `import` away. We'll look at the `math` module as a first example. The `math` module is extremely useful in computing mathematical expressions in Python. \n", - "\n", - "Suppose we want to very accurately compute the area of a circle with radius 5 meters. For that, we need the constant $\\pi$, which is roughly 3.14. Conveniently, the `math` module has `pi` defined for us:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import math\n", - "radius = 5\n", - "area_of_circle = radius**2 * math.pi\n", - "area_of_circle" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`pi` is defined inside `math`, and the way that we access names that are inside modules is by writing the module's name, then a dot, then the name of the thing we want:\n", - "\n", - " <module name>.<name>\n", - " \n", - "In order to use a module at all, we must first write the statement `import <module name>`. That statement creates a module object with things like `pi` in it and then assigns the name `math` to that module. Above we have done that for `math`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.1.**
`math` also provides the name `e` for the base of the natural logarithm, which is roughly 2.71. Compute $e^{\\pi}-\\pi$, giving it the name `near_twenty`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "near_twenty = ...\n", - "near_twenty" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q31.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![XKCD](http://imgs.xkcd.com/comics/e_to_the_pi_minus_pi.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3.1. Importing functions\n", - "\n", - "**Modules** can provide other named things, including **functions**. For example, `math` provides the name `sin` for the sine function. Having imported `math` already, we can write `math.sin(3)` to compute the sine of 3. (Note that this sine function considers its argument to be in [radians](https://en.wikipedia.org/wiki/Radian), not degrees. 180 degrees are equivalent to $\\pi$ radians.)\n", - "\n", - "**Question 3.1.1.**
A $\\frac{\\pi}{4}$-radian (45-degree) angle forms a right triangle with equal base and height, pictured below. If the hypotenuse (the radius of the circle in the picture) is 1, then the height is $\\sin(\\frac{\\pi}{4})$. Compute that using `sin` and `pi` from the `math` module. Give the result the name `sine_of_pi_over_four`.\n", - "\n", - "![Right triangle with a pi/4 angle](http://mathworld.wolfram.com/images/eps-gif/TrigonometryAnglesPi4_1000.gif)\n", - "(Source: [Wolfram MathWorld](http://mathworld.wolfram.com/images/eps-gif/TrigonometryAnglesPi4_1000.gif))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sine_of_pi_over_four = ...\n", - "sine_of_pi_over_four" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q311.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For your reference, here are some more examples of functions from the `math` module.\n", - "\n", - "Note how different methods take in different numbers of arguments. Often, the documentation of the module will provide information on how many arguments are required for each method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Calculating factorials.\n", - "math.factorial(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Calculating logarithms (the logarithm of 8 in base 2).\n", - "# The result is 3 because 2 to the power of 3 is 8.\n", - "math.log(8, 2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Calculating square roots.\n", - "math.sqrt(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are many variations of how we can import methods from outside sources.
For example, we can import just a specific method from an outside source, we can rename a library we import, and we can import every single method from a whole library. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Importing just cos and pi from math.\n", - "# Now, we don't have to use \"math.\" before these names.\n", - "from math import cos, pi\n", - "print(cos(pi))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# We can nickname math as something else, if we don't want to type the name math\n", - "import math as m\n", - "m.log(m.pi)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Lastly, we can import everything from math and use all of its names without \"math.\"\n", - "from math import *\n", - "log(pi)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### A function that displays a picture\n", - "People have written Python functions that do very cool and complicated things, like crawling web pages for data, transforming videos, or learning functions from data. Now that you can import things, when you want to do something with code, first check to see if someone else has done it for you.\n", - "\n", - "Let's see an example of a function that's used for downloading and displaying pictures.\n", - "\n", - "The module `IPython.display` provides a function called `Image`. The `Image` function takes a single argument, a string that is the URL of the image on the web. It returns an *image* value that this Jupyter notebook understands how to display. To display an image, make it the value of the last expression in a cell, just like you'd display a number or a string.\n", - "\n", - "**Question 3.1.2.**
In the next cell, import the module `IPython.display` and use its `Image` function to display the image at this URL:\n", - "\n", - " https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/David_-_The_Death_of_Socrates.jpg/1024px-David_-_The_Death_of_Socrates.jpg\n", - "\n", - "Give the name `art` to the output of the call to `Image`. (It might take a few seconds to load the image. It's a painting called *The Death of Socrates* by Jacques-Louis David, depicting events from a philosophical text by Plato.)\n", - "\n", - "*Hint*: A link isn't any special type of data type in Python. You can't just write a link into Python and expect it to work; you need to type the link in as a specific data type. Which one makes the most sense?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Import the module IPython.display. Watch out for capitalization.\n", - "import IPython.display\n", - "# Replace the ... with a call to the Image function\n", - "# in the IPython.display module, which should produce\n", - "# a picture.\n", - "art = ...\n", - "art" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q312.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Arrays\n", - "\n", - "Up to now, we haven't done much that you couldn't do yourself by hand, without going through the trouble of learning Python. Computers are most useful when a small amount of code performs a lot of work by *performing the same action* to *many different things*.\n", - "\n", - "For example, in the time it takes you to calculate the 18% tip on a restaurant bill, a laptop can calculate 18% tips for every restaurant bill paid by every human on Earth that day. 
(That's if you're pretty fast at doing arithmetic in your head!)\n", - "\n", - "**Arrays** are how we put many values in one place so that we can operate on them as a group. For example, if `billions_of_numbers` is an array of numbers, the expression\n", - "\n", - " .18 * billions_of_numbers\n", - "\n", - "gives a new array of numbers that's the result of multiplying each number in `billions_of_numbers` by .18 (18%). Arrays are not limited to numbers; we can also put all the words in a book into an array of strings.\n", - "\n", - "Concretely, an array is a **collection of values of the same type**, like a column in an Excel spreadsheet. \n", - "\n", - "\"In" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4.1. Making arrays\n", - "You can type in the data that goes in an array yourself, but that's not typically how programs work. Normally, we create arrays by loading them from an external source, like a data file.\n", - "\n", - "First, though, let's learn how to start from scratch. Execute the following cell so that all the names from the `datascience` module are available to you. The documentation for this module is available at [http://data8.org/datascience](http://data8.org/datascience/)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from datascience import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, to create an array, call the function `make_array`. Each argument you pass to `make_array` will be in the array it returns. 
Run this cell to see an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "make_array(0.125, 4.75, -1.3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Each value in an array (in the above case, the numbers 0.125, 4.75, and -1.3) is called an *element* or *item* of that array.\n", - "\n", - "Arrays themselves are also values, just like numbers and strings. That means you can assign them names or use them as arguments to functions.\n", - "\n", - "**Question 4.1.1.**
Make an array containing the numbers 1, 2, and 3, in that order. Name it `small_numbers`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "small_numbers = ...\n", - "small_numbers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q411.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.1.2.**
Make an array containing the numbers 0, 1, -1, $\\pi$, and $e$, in that order. Name it `interesting_numbers`. *Hint:* How did you get the values $\\pi$ and $e$ earlier? You can refer to them in exactly the same way here." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "interesting_numbers = ...\n", - "interesting_numbers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q412.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.1.3.**
Make an array containing the five strings `\"Hello\"`, `\",\"`, `\" \"`, `\"world\"`, and `\"!\"`. (The third one is a single space inside quotes.) Name it `hello_world_components`.\n", - "\n", - "*Note:* If you print `hello_world_components`, you'll notice some extra information in addition to its contents: `dtype=' Assign `separator` to a string so that the name `hello` is bound to the string `'Hello, world!'` in the cell below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "separator = ...\n", - "hello = separator.join(hello_world_components)\n", - "hello" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q414.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 4.1.1. `np.arange`\n", - "Arrays are provided by a package called [NumPy](http://www.numpy.org/) (pronounced \"NUM-pie\" or, if you prefer to pronounce things incorrectly, \"NUM-pee\"). The package is called `numpy`, but it's standard to rename it `np` for brevity. You can do that with:\n", - "\n", - " import numpy as np\n", - "\n", - "Very often in data science, we want to work with many numbers that are evenly spaced within some range. NumPy provides a special function for this called `arange`. `np.arange(start, stop, space)` produces an array with all the numbers starting at `start` and counting up by `space`, stopping before `stop` is reached.\n", - "\n", - "For example, the value of `np.arange(1, 6, 2)` is an array with elements 1, 3, and 5 -- it starts at 1 and counts up by 2, then stops before 6. In other words, it's equivalent to `make_array(1, 3, 5)`.\n", - "\n", - "`np.arange(4, 9, 1)` is an array with elements 4, 5, 6, 7, and 8. (It doesn't contain 9 because `np.arange` stops *before* the stop value is reached.)\n", - "\n", - "**Question 4.1.1.1.**
Import `numpy` as `np` and then use `np.arange` to create an array with the multiples of 99 from 0 up to (**and including**) 9999. (So its elements are 0, 99, 198, 297, etc.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "...\n", - "multiples_of_99 = ...\n", - "multiples_of_99" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q4111.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Temperature readings\n", - "NOAA (the US National Oceanic and Atmospheric Administration) operates weather stations that measure surface temperatures at different sites around the United States. The hourly readings are [publicly available](http://www.ncdc.noaa.gov/qclcd/QCLCD?prior=N).\n", - "\n", - "Suppose we download all the hourly data from the Oakland, California site for the month of December 2015. To analyze the data, we want to know when each reading was taken, but we find that the data don't include the timestamps of the readings (the time at which each one was taken).\n", - "\n", - "However, we know the first reading was taken at the first instant of December 2015 (midnight on December 1st) and each subsequent reading was taken exactly 1 hour after the last.\n", - "\n", - "**Question 4.1.1.2.**
Create an array of the *time, in seconds, since the start of the month* at which each hourly reading was taken. Name it `collection_times`.\n", - "\n", - "*Hint 1:* There were 31 days in December, which is equivalent to ($31 \\times 24$) hours or ($31 \\times 24 \\times 60 \\times 60$) seconds. So your array should have $31 \\times 24$ elements in it.\n", - "\n", - "*Hint 2:* The `len` function works on arrays, too. If your `collection_times` isn't passing the tests, check its length and make sure it has $31 \\times 24$ elements." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "collection_times = ...\n", - "collection_times" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q4112.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4.2. Working with single elements of arrays (\"indexing\")\n", - "Let's work with a more interesting dataset. The next cell creates an array called `population` that includes estimated world populations in every year from **1950** to roughly the present. (The estimates come from the [US Census Bureau website](http://www.census.gov/population/international/data/worldpop/table_population.php).)\n", - "\n", - "Rather than type in the data manually, we've loaded them from a file on your computer called `world_population.csv`. You'll learn how to do that next week." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Don't worry too much about what goes on in this cell.\n", - "from datascience import *\n", - "population = Table.read_table(\"world_population.csv\").column(\"Population\")\n", - "population" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's how we get the first element of `population`, which is the world population in the first year in the dataset, 1950." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "population.item(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The value of that expression is the number 2557628654 (around 2.5 billion), because that's the first thing in the array `population`.\n", - "\n", - "Notice that we wrote `.item(0)`, not `.item(1)`, to get the first element. This is a weird convention in computer science. 0 is called the *index* of the first item. It's the number of elements that appear *before* that item. So 3 is the index of the 4th item.\n", - "\n", - "Here are some more examples. In the examples, we've given names to the things we get out of `population`. Read and run each cell." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# The third element in the array is the population\n", - "# in 1952.\n", - "population_1952 = population.item(2)\n", - "population_1952" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# The thirteenth element in the array is the population\n", - "# in 1962 (which is 1950 + 12).\n", - "population_1962 = population.item(12)\n", - "population_1962" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# The 66th element is the population in 2015.\n", - "population_2015 = population.item(65)\n", - "population_2015" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# The array has only 66 elements, so this doesn't work.\n", - "# (There's no element with 66 other elements before it.)\n", - "population_2016 = population.item(66)\n", - "population_2016" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Since make_array returns an array, we can call .item(3)\n", - "# on its output to get its 4th element, just like we\n", - "# \"chained\" together calls to the method \"replace\" earlier.\n", - "make_array(-1, -3, 4, -2).item(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.2.1.**
Set `population_1973` to the world population in 1973, by getting the appropriate element from `population` using `item`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "population_1973 = ...\n", - "population_1973" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q421.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4.3. Doing something to every element of an array\n", - "Arrays are primarily useful for doing the same operation many times, so we don't often have to use `.item` and work with single elements.\n", - "\n", - "##### Logarithms\n", - "Here is one simple question we might ask about world population:\n", - "\n", - "> How big was the population in *orders of magnitude* in each year?\n", - "\n", - "The logarithm function is one way of measuring how big a number is. The logarithm (base 10) of a number increases by 1 every time we multiply the number by 10. It's like a measure of how many decimal digits the number has, or how big it is in orders of magnitude.\n", - "\n", - "We could try to answer our question like this, using the `log10` function from the `math` module and the `item` method you just saw:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import math\n", - "\n", - "population_1950_magnitude = math.log10(population.item(0))\n", - "population_1951_magnitude = math.log10(population.item(1))\n", - "population_1952_magnitude = math.log10(population.item(2))\n", - "population_1953_magnitude = math.log10(population.item(3))\n", - "..." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But this is tedious and doesn't really take advantage of the fact that we are using a computer.\n", - "\n", - "Instead, NumPy provides its own version of `log10` that takes the logarithm of each element of an array. It takes a single array of numbers as its argument. It returns an array of the same length, where the first element of the result is the logarithm of the first element of the argument, and so on.\n", - "\n", - "**Question 4.3.1.**
Use it to compute the logarithms of the world population in every year. Give the result (an array of 66 numbers) the name `population_magnitudes`. Your code should be very short." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "population_magnitudes = ...\n", - "population_magnitudes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q431.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Elementwise\n", - "\n", - "This is called *elementwise* application of the function, since it operates separately on each element of the array it's called on. The textbook's section on arrays has a useful list of NumPy functions that are designed to work elementwise, like `np.log10`.\n", - "\n", - "##### Arithmetic\n", - "Arithmetic also works elementwise on arrays. For example, you can divide all the population numbers by 1 billion to get numbers in billions:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "population_in_billions = population / 1000000000\n", - "population_in_billions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can do the same with addition, subtraction, multiplication, and exponentiation (`**`). 
For example, you can calculate a tip on several restaurant bills at once (in this case just 3):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "restaurant_bills = make_array(20.12, 39.90, 31.01)\n", - "print(\"Restaurant bills:\\t\", restaurant_bills)\n", - "tips = .2 * restaurant_bills\n", - "print(\"Tips:\\t\\t\\t\", tips)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Elementwise\n", - "\n", - "**Question 4.3.2.**
Suppose the total charge at a restaurant is the original bill plus the tip. That means we can multiply the original bill by 1.2 to get the total charge. Compute the total charge for each bill in `restaurant_bills`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "total_charges = ...\n", - "total_charges" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q432.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.3.3.**
`more_restaurant_bills.csv` contains 100,000 bills! Compute the total charge for each one. How is your code different?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "more_restaurant_bills = Table.read_table(\"more_restaurant_bills.csv\").column(\"Bill\")\n", - "more_total_charges = ...\n", - "more_total_charges" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q433.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The function `sum` takes a single array of numbers as its argument. It returns the sum of all the numbers in that array (so it returns a single number, not an array).\n", - "\n", - "**Question 4.3.4.**
What was the sum of all the bills in `more_restaurant_bills`, *including tips*?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sum_of_bills = ...\n", - "sum_of_bills" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q434.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.3.5.**
The powers of 2 ($2^0 = 1$, $2^1 = 2$, $2^2 = 4$, etc.) arise frequently in computer science. (For example, you may have noticed that storage on smartphones or USBs comes in powers of 2, like 16 GB, 32 GB, or 64 GB.) Use `np.arange` and the exponentiation operator `**` to compute the first 15 powers of 2, starting from $2^0$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "powers_of_2 = ...\n", - "powers_of_2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q435.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Success!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Congratulations, you're done with lab 2! Be sure to \n", - "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", - "- **Review the notebook one last time, we will be grading the final state of your notebook after the deadline**,\n", - "- **Save and Checkpoint** from the `File` menu." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab02.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python",
- "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/1/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb b/materials/x18/lab/1/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb deleted file mode 100644 index db5a783..0000000 --- a/materials/x18/lab/1/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb +++ /dev/null @@ -1,1071 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 3: Tables\n", - "\n", - "Welcome to lab 3! \n", - "\n", - "This week, we will focus on manipulating tables. Tables are described in [Chapter 6](http://www.inferentialthinking.com/chapters/06/tables.html) of the text.\n", - "\n", - "First, set up the tests and imports by running the cell below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "from datascience import *\n", - "\n", - "# These lines load the tests.\n", - "\n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Introduction\n", - "\n", - "For a collection of things in the world, an array is useful for describing a single attribute of each thing. For example, among the collection of US States, an array could describe the land area of each. Tables extend this idea by describing multiple attributes for each element of a collection.\n", - "\n", - "In most data science applications, we have data about many entities, but we also have several kinds of data about each entity.\n", - "\n", - "For example, in the cell below we have two arrays. The first one contains the world population in each year (estimated by the US Census Bureau), and the second contains the years themselves. These elements are in order, so the year and the world population for that year have the same index in their corresponding arrays." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "population_amounts = Table.read_table(\"world_population.csv\").column(\"Population\")\n", - "years = np.arange(1950, 2015+1)\n", - "print(\"Population column:\", population_amounts)\n", - "print(\"Years column:\", years)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Suppose we want to answer this question:\n", - "\n", - "> When did world population cross 6 billion?\n", - "\n", - "You could technically answer this question just from staring at the arrays, but it's a bit convoluted, since you would have to count the position where the population first crossed 6 billion, then find the corresponding element in the years array. In cases like these, it might be easier to put the data into a *`Table`*, a 2-dimensional type of dataset. \n", - "\n", - "The expression below:\n", - "\n", - "- creates an empty table using the expression `Table()`,\n", - "- adds two columns by calling `with_columns` with four arguments,\n", - "- assigns the result to the name `population`, and finally\n", - "- evaluates `population` so that we can see the table.\n", - "\n", - "The strings `\"Year\"` and `\"Population\"` are column labels that we have chosen. The names `population_amounts` and `years` were assigned above to two arrays of the same length. The function `with_columns` (you can find the documentation [here](http://data8.org/datascience/tables.html)) takes in alternating strings (to represent column labels) and arrays (representing the data in those columns), which are all separated by commas."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "population = Table().with_columns(\n", - " \"Population\", population_amounts,\n", - " \"Year\", years\n", - ")\n", - "population" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now the data are all together in a single table! It's much easier to parse this data--if you need to know what the population was in 1959, for example, you can tell from a single glance. We'll revisit this table later." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Creating Tables\n", - "\n", - "**Question 2.1.**
In the cell below, we've created 2 arrays. Using the steps above, assign `top_10_movies` to a table that has two columns called \"Rating\" and \"Name\", which hold `top_10_movie_ratings` and `top_10_movie_names` respectively." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "top_10_movie_ratings = make_array(9.2, 9.2, 9., 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.8)\n", - "top_10_movie_names = make_array(\n", - " 'The Shawshank Redemption (1994)',\n", - " 'The Godfather (1972)',\n", - " 'The Godfather: Part II (1974)',\n", - " 'Pulp Fiction (1994)',\n", - " \"Schindler's List (1993)\",\n", - " 'The Lord of the Rings: The Return of the King (2003)',\n", - " '12 Angry Men (1957)',\n", - " 'The Dark Knight (2008)',\n", - " 'Il buono, il brutto, il cattivo (1966)',\n", - " 'The Lord of the Rings: The Fellowship of the Ring (2001)')\n", - "\n", - "top_10_movies = ...\n", - "# We've put this next line here so your table will get printed out when you\n", - "# run this cell.\n", - "top_10_movies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Loading a table from a file\n", - "In most cases, we aren't going to go through the trouble of typing in all the data manually. Instead, we can use our `Table` functions.\n", - "\n", - "`Table.read_table` takes one argument, a path to a data file (a string) and returns a table. There are many formats for data files, but CSV (\"comma-separated values\") is the most common.\n", - "\n", - "**Question 2.2.**
The file `imdb.csv` contains a table of information about the 250 highest-rated movies on IMDb. Load it as a table called `imdb`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "imdb = ...\n", - "imdb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice the part about \"... (240 rows omitted).\" This table is big enough that only a few of its rows are displayed, but the others are still there. 10 are shown, so there are 250 movies total.\n", - "\n", - "Where did `imdb.csv` come from? Take a look at [this lab's folder](./). You should see a file called `imdb.csv`.\n", - "\n", - "Open up the `imdb.csv` file in that folder and look at the format. What do you notice? The `.csv` filename ending says that this file is in the [CSV (comma-separated value) format](http://edoceo.com/utilitas/csv-file-format)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Using lists\n", - "\n", - "A *list* is another Python sequence type, similar to an array. It's different than an array because the values it contains can all have different types. A single list can contain `int` values, `float` values, and strings. Elements in a list can even be other lists! A list is created by giving a name to the list of values enclosed in square brackets and separated by commas. For example, `values_with_different_types = ['data', 8, 8.1]`\n", - "\n", - "Lists can be useful when working with tables because they can describe the contents of one row in a table, which often corresponds to a sequence of values with different types. A list of lists can be used to describe multiple rows.\n", - "\n", - "Each column in a table is a collection of values with the same type (an array). 
If you create a table column from a list, it will automatically be converted to an array. A row, on the other hand, mixes types.\n", - "\n", - "Here's a table from Chapter 5. (Run the cell below.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell to recreate the table\n", - "flowers = Table().with_columns(\n", - " 'Number of petals', make_array(8, 34, 5),\n", - " 'Name', make_array('lotus', 'sunflower', 'rose')\n", - ")\n", - "flowers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.1.**
Create a list that describes a new fourth row of this table. The details can be whatever you want, but the list must contain two values: the number of petals (an `int` value) and the name of the flower (a string). For example, your flower could be \"pondweed\"! (A flower with zero petals)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "my_flower = ...\n", - "my_flower" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.2.**
`my_flower` fits right in to the table from chapter 5. Complete the cell below to create a table of seven flowers that includes your flower as the fourth row followed by `other_flowers`. You can use `with_row` to create a new table with one extra row by passing a list of values and `with_rows` to create a table with multiple extra rows by passing a list of lists of values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Use the method .with_row(...) to create a new table that includes my_flower \n", - "\n", - "four_flowers = ...\n", - "\n", - "# Use the method .with_rows(...) to create a table that \n", - "# includes four_flowers followed by other_flowers\n", - "\n", - "other_flowers = [[10, 'lavender'], [3, 'birds of paradise'], [6, 'tulip']]\n", - "\n", - "seven_flowers = ...\n", - "seven_flowers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Analyzing datasets\n", - "With just a few table methods, we can answer some interesting questions about the IMDb dataset.\n", - "\n", - "If we want just the ratings of the movies, we can get an array that contains the data in that column:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "imdb.column(\"Rating\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The value of that expression is an array, exactly the same kind of thing you'd get if you typed in `make_array(8.4, 8.3, 8.3, [etc])`.\n", - "\n", - "**Question 4.1.**
Find the rating of the highest-rated movie in the dataset.\n", - "\n", - "*Hint:* Think back to the functions you've learned about for working with arrays of numbers. Ask for help if you can't remember one that's useful for this." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "highest_rating = ...\n", - "highest_rating" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q4_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That's not very useful, though. You'd probably want to know the *name* of the movie whose rating you found! To do that, we can sort the entire table by rating, which ensures that the ratings and titles will stay together. Note that calling sort creates a copy of the table and leaves the original table unsorted." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "imdb.sort(\"Rating\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Well, that actually doesn't help much, either -- we sorted the movies from lowest -> highest ratings. To look at the highest-rated movies, sort in reverse order:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "imdb.sort(\"Rating\", descending=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(The `descending=True` bit is called an *optional argument*. 
It has a default value of `False`, so when you explicitly tell the function `descending=True`, then the function will sort in descending order.)\n", - "\n", - "So there are actually 2 highest-rated movies in the dataset: *The Shawshank Redemption* and *The Godfather*.\n", - "\n", - "Some details about sort:\n", - "\n", - "1. The first argument to `sort` is the name of a column to sort by.\n", - "2. If the column has strings in it, `sort` will sort alphabetically; if the column has numbers, it will sort numerically.\n", - "3. The value of `imdb.sort(\"Rating\")` is a *copy of `imdb`*; the `imdb` table doesn't get modified. For example, if we called `imdb.sort(\"Rating\")`, then running `imdb` by itself would still return the unsorted table.\n", - "4. Rows always stick together when a table is sorted. It wouldn't make sense to sort just one column and leave the other columns alone. For example, in this case, if we sorted just the \"Rating\" column, the movies would all end up with the wrong ratings.\n", - "\n", - "**Question 4.2.**
Create a version of `imdb` that's sorted chronologically, with the earliest movies first. Call it `imdb_by_year`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "imdb_by_year = ...\n", - "imdb_by_year" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q4_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.3.**
What's the title of the earliest movie in the dataset? You could just look this up from the output of the previous cell. Instead, write Python code to find out.\n", - "\n", - "*Hint:* Starting with `imdb_by_year`, extract the Title column to get an array, then use `item` to get its first item." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "earliest_movie_title = ...\n", - "earliest_movie_title" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q4_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Finding pieces of a dataset\n", - "Suppose you're interested in movies from the 1940s. Sorting the table by year doesn't help you, because the 1940s are in the middle of the dataset.\n", - "\n", - "Instead, we use the table method `where`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "forties = imdb.where('Decade', are.equal_to(1940))\n", - "forties" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ignore the syntax for the moment. Instead, try to read that line like this:\n", - "\n", - "> Assign the name **`forties`** to a table whose rows are the rows in the **`imdb`** table **`where`** the **`'Decade'`**s **`are` `equal` `to` `1940`**.\n", - "\n", - "**Question 5.1.**
Compute the average rating of movies from the 1940s.\n", - "\n", - "*Hint:* The function `np.average` computes the average of an array of numbers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "average_rating_in_forties = ...\n", - "average_rating_in_forties" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's dive into the details a bit more. `where` takes 2 arguments:\n", - "\n", - "1. The name of a column. `where` finds rows where that column's values meet some criterion.\n", - "2. Something that describes the criterion that the column needs to meet, called a predicate.\n", - "\n", - "To create our predicate, we called the function `are.equal_to` with the value we wanted, 1940. We'll see other predicates soon.\n", - "\n", - "`where` returns a table that's a copy of the original table, but with only the rows that meet the given predicate.\n", - "\n", - "**Question 5.2.**
Create a table called `ninety_nine` containing the movies that came out in the year 1999. Use `where`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "ninety_nine = ...\n", - "ninety_nine" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So far we've only been finding where a column is *exactly* equal to a certain value. However, there are many other predicates. Here are a few:\n", - "\n", - "|Predicate|Example|Result|\n", - "|-|-|-|\n", - "|`are.equal_to`|`are.equal_to(50)`|Find rows with values equal to 50|\n", - "|`are.not_equal_to`|`are.not_equal_to(50)`|Find rows with values not equal to 50|\n", - "|`are.above`|`are.above(50)`|Find rows with values above (and not equal to) 50|\n", - "|`are.above_or_equal_to`|`are.above_or_equal_to(50)`|Find rows with values above 50 or equal to 50|\n", - "|`are.below`|`are.below(50)`|Find rows with values below 50|\n", - "|`are.between`|`are.between(2, 10)`|Find rows with values above or equal to 2 and below 10|\n", - "\n", - "The textbook section on selecting rows has more examples.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.3.**
Using `where` and one of the predicates from the table above, find all the movies with a rating higher than 8.5. Put their data in a table called `really_highly_rated`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "really_highly_rated = ...\n", - "really_highly_rated" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.4.**
Find the average rating for movies released in the 20th century and the average rating for movies released in the 21st century for the movies in `imdb`.\n", - "\n", - "*Hint*: Think of the steps you need to do (take the average, find the ratings, find movies released in 20th/21st centuries), and try to put them in an order that makes sense." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "average_20th_century_rating = ...\n", - "average_21st_century_rating = ...\n", - "print(\"Average 20th century rating:\", average_20th_century_rating)\n", - "print(\"Average 21st century rating:\", average_21st_century_rating)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The property `num_rows` tells you how many rows are in a table. (A \"property\" is just a method that doesn't need to be called by adding parentheses.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "num_movies_in_dataset = imdb.num_rows\n", - "num_movies_in_dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.5.**
Use `num_rows` (and arithmetic) to find the *proportion* of movies in the dataset that were released in the 20th century, and the proportion from the 21st century.\n", - "\n", - "*Hint:* The *proportion* of movies released in the 20th century is the *number* of movies released in the 20th century, divided by the *total number* of movies." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "proportion_in_20th_century = ...\n", - "proportion_in_21st_century = ...\n", - "print(\"Proportion in 20th century:\", proportion_in_20th_century)\n", - "print(\"Proportion in 21st century:\", proportion_in_21st_century)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.6.**
Here's a challenge: Find the number of movies that came out in *even* years.\n", - "\n", - "*Hint:* The operator `%` computes the remainder when dividing by a number. So `5 % 2` is 1 and `6 % 2` is 0. A number is even if the remainder is 0 when you divide by 2.\n", - "\n", - "*Hint 2:* `%` can be used on arrays, operating elementwise like `+` or `*`. So `make_array(5, 6, 7) % 2` is `array([1, 0, 1])`.\n", - "\n", - "*Hint 3:* Create a column called \"Year Remainder\" that's the remainder when each movie's release year is divided by 2. Make a copy of `imdb` that includes that column. Then use `where` to find rows where that new column is equal to 0. Then use `num_rows` to count the number of such rows." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "num_even_year_movies = ...\n", - "num_even_year_movies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_6.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.7.**
Check out the `population` table from the introduction to this lab. Compute the year when the world population first went above 6 billion." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "year_population_crossed_6_billion = ...\n", - "year_population_crossed_6_billion" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_7.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Miscellanea\n", - "There are a few more table methods you'll need to fill out your toolbox. The first 3 have to do with manipulating the columns in a table.\n", - "\n", - "The table `farmers_markets.csv` contains data on farmers' markets in the United States (data collected [by the USDA](https://apps.ams.usda.gov/FarmersMarketsExport/ExcelExport.aspx)). Each row represents one such market.\n", - "\n", - "**Question 6.1.**
Load the dataset into a table. Call it `farmers_markets`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "farmers_markets = ...\n", - "farmers_markets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q6_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You'll notice that it has a large number of columns in it!\n", - "\n", - "### `num_columns`\n", - "\n", - "**Question 6.2.**
The table property `num_columns` (example call: `tbl.num_columns`) produces the number of columns in a table. Use it to find the number of columns in our farmers' markets dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "num_farmers_markets_columns = ...\n", - "print(\"The table has\", num_farmers_markets_columns, \"columns in it!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q6_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Most of the columns are about particular products -- whether the market sells tofu, pet food, etc. If we're not interested in that stuff, it just makes the table difficult to read. This comes up more than you might think.\n", - "\n", - "### `select`\n", - "\n", - "In such situations, we can use the table method `select` to pare down the columns of a table. It takes any number of arguments. Each should be the name or index of a column in the table. It returns a new table with only those columns in it.\n", - "\n", - "For example, the value of `imdb.select(\"Year\", \"Decade\")` is a table with only the years and decades of each movie in `imdb`.\n", - "\n", - "**Question 6.3.**
Use `select` to create a table with only the name, city, state, latitude ('y'), and longitude ('x') of each market. Call that new table `farmers_markets_locations`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "farmers_markets_locations = ...\n", - "farmers_markets_locations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q6_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `select` is not `column`!\n", - "\n", - "The method `select` is **definitely not** the same as the method `column`.\n", - "\n", - "`farmers_markets.column('y')` is an *array* of the latitudes of all the markets. `farmers_markets.select('y')` is a *table* that happens to contain only 1 column, the latitudes of all the markets.\n", - "\n", - "**Question 6.4.**
Below, we tried using the function `np.average` to find the average latitude ('y') and average longitude ('x') of the farmers' markets in the table, but we screwed something up. Run the cell to see the (somewhat inscrutable) error message that results from calling `np.average` on a table. Then, fix our code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "average_latitude = np.average(farmers_markets.select('y'))\n", - "average_longitude = np.average(farmers_markets.select('x'))\n", - "print(\"The average of US farmers' markets' coordinates is located at (\", average_latitude, \",\", average_longitude, \")\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q6_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `drop`\n", - "\n", - "`drop` serves the same purpose as `select`, but it takes away the columns you list instead of the ones you don't list, leaving all the rest of the columns.\n", - "\n", - "**Question 6.5.**
Suppose you just didn't want the \"FMID\" or \"updateTime\" columns in `farmers_markets`. Create a table that's a copy of `farmers_markets` but doesn't include those columns. Call that table `farmers_markets_without_fmid`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "farmers_markets_without_fmid = ...\n", - "farmers_markets_without_fmid" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q6_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `take`\n", - "Let's find the 5 northernmost farmers' markets in the US. You already know how to sort by latitude ('y'), but we haven't seen how to get the first 5 rows of a table. That's what `take` is for.\n", - "\n", - "The table method `take` takes as its argument an array of numbers. Each number should be the index of a row in the table. It returns a new table with only those rows.\n", - "\n", - "Most often you'll want to use `take` in conjunction with `np.arange` to take the first few rows of a table.\n", - "\n", - "**Question 6.6.**
Make a table of the 5 northernmost farmers' markets in `farmers_markets_locations`. Call it `northern_markets`. (It should include the same columns as `farmers_markets_locations`.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "northern_markets = ...\n", - "northern_markets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q6_6.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 6.7.**
Make a table of the farmers' markets in Berkeley, California. (It should include the same columns as `farmers_markets_locations`.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "berkeley_markets = ...\n", - "berkeley_markets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q6_7.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. Summary\n", - "\n", - "For your reference, here's a table of all the functions and methods we saw in this lab.\n", - "\n", - "|Name|Example|Purpose|\n", - "|-|-|-|\n", - "|`Table`|`Table()`|Create an empty table, usually to extend with data|\n", - "|`Table.read_table`|`Table.read_table(\"my_data.csv\")`|Create a table from a data file|\n", - "|`with_columns`|`tbl = Table().with_columns(\"N\", np.arange(5), \"2*N\", np.arange(0, 10, 2))`|Create a copy of a table with more columns|\n", - "|`column`|`tbl.column(\"N\")`|Create an array containing the elements of a column|\n", - "|`sort`|`tbl.sort(\"N\")`|Create a copy of a table sorted by the values in a column|\n", - "|`where`|`tbl.where(\"N\", are.above(2))`|Create a copy of a table with only the rows that match some *predicate*|\n", - "|`num_rows`|`tbl.num_rows`|Compute the number of rows in a table|\n", - "|`num_columns`|`tbl.num_columns`|Compute the number of columns in a table|\n", - "|`select`|`tbl.select(\"N\")`|Create a copy of a table with only some of the columns|\n", - "|`drop`|`tbl.drop(\"2*N\")`|Create a copy of a table without some of the columns|\n", - "|`take`|`tbl.take(np.arange(0, 6, 2))`|Create a copy of the table with only the rows whose indices are in the given array|\n", - "\n", - "
\n", - "\n", - "Congratulations, you're done with lab 3! Be sure to \n", - "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", - "- **Review the notebook one last time, we will be grading the final state of your notebook after the deadline**,\n", - "- **Save and Checkpoint** from the `File` menu," - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab03.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/1/lab04/.ipynb_checkpoints/lab04-checkpoint.ipynb b/materials/x18/lab/1/lab04/.ipynb_checkpoints/lab04-checkpoint.ipynb deleted file mode 100644 index 956c7c6..0000000 --- a/materials/x18/lab/1/lab04/.ipynb_checkpoints/lab04-checkpoint.ipynb +++ /dev/null @@ -1,1128 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Functions and Visualizations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Welcome to lab 4! This week, we'll learn about functions and the table method `apply` from [Section 8.1](https://www.inferentialthinking.com/chapters/08/1/applying-a-function-to-a-column.html). 
We'll also learn about visualization from [Chapter 7](https://www.inferentialthinking.com/chapters/07/visualization.html).\n", - "\n", - "First, set up the tests and imports by running the cell below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "from datascience import *\n", - "\n", - "# These lines set up graphing capabilities.\n", - "import matplotlib\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "plt.style.use('fivethirtyeight')\n", - "import warnings\n", - "warnings.simplefilter('ignore', FutureWarning)\n", - "\n", - "from ipywidgets import interact, interactive, fixed, interact_manual\n", - "import ipywidgets as widgets\n", - "\n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Functions and CEO Incomes\n", - "\n", - "Let's start with a real data analysis task. We'll look at the 2015 compensation of CEOs at the 100 largest companies in California. The data were compiled for a Los Angeles Times analysis [here](http://spreadsheets.latimes.com/california-ceo-compensation/), and ultimately came from [filings](https://www.sec.gov/answers/proxyhtf.htm) mandated by the SEC from all publicly-traded companies. Two companies have two CEOs, so there are 102 CEOs in the dataset.\n", - "\n", - "We've copied the data in raw form from the LA Times page into a file called `raw_compensation.csv`. (The page notes that all dollar amounts are in millions of dollars.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "raw_compensation = Table.read_table('raw_compensation.csv')\n", - "raw_compensation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.1.**
We want to compute the average of the CEOs' pay. Try running the cell below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "np.average(raw_compensation.column(\"Total Pay\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You should see an error. Let's examine why this error occurred by looking at the values in the \"Total Pay\" column. Use the `type` function and set `total_pay_type` to the type of the first value in the \"Total Pay\" column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "total_pay_type = ...\n", - "total_pay_type" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.2.**
You should have found that the values in \"Total Pay\" column are strings (text). It doesn't make sense to take the average of the text values, so we need to convert them to numbers if we want to do this. Extract the first value in the \"Total Pay\" column. It's Mark Hurd's pay in 2015, in *millions* of dollars. Call it `mark_hurd_pay_string`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "mark_hurd_pay_string = ...\n", - "mark_hurd_pay_string" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.3.**
Convert `mark_hurd_pay_string` to a number of *dollars*. The string method `strip` will be useful for removing the dollar sign; it removes a specified character from the start or end of a string. For example, the value of `\"100%\".strip(\"%\")` is the string `\"100\"`. You'll also need the function `float`, which converts a string that looks like a number to an actual number. Last, remember that the answer should be in dollars, not millions of dollars." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "mark_hurd_pay = ...\n", - "mark_hurd_pay" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To compute the average pay, we need to do this for every CEO. But that looks like it would involve copying this code 102 times.\n", - "\n", - "This is where functions come in. First, we'll define a new function, giving a name to the expression that converts \"total pay\" strings to numeric values. Later in this lab we'll see the payoff: we can call that function on every pay string in the dataset at once.\n", - "\n", - "**Question 1.4.**
Copy the expression you used to compute `mark_hurd_pay` as the `return` expression of the function below, but replace the specific `mark_hurd_pay_string` with the generic `pay_string` name specified in the first line of the `def` statement.\n", - "\n", - "*Hint*: When dealing with functions, you should generally not be referencing any variable outside of the function. Usually, you want to be working with the arguments that are passed into it, such as `pay_string` for this function. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "def convert_pay_string_to_number(pay_string):\n", - " \"\"\"Converts a pay string like '$100' (in millions) to a number of dollars.\"\"\"\n", - " return ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Running that cell doesn't convert any particular pay string. Instead, it creates a function called `convert_pay_string_to_number` that can convert any string with the right format to a number of dollars.\n", - "\n", - "We can call our function just like we call the built-in functions we've seen. It takes one argument, a string, and it returns a number."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "convert_pay_string_to_number('$42')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "convert_pay_string_to_number(mark_hurd_pay_string)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# We can also compute Safra Catz's pay in the same way:\n", - "convert_pay_string_to_number(raw_compensation.where(\"Name\", are.containing(\"Safra\")).column(\"Total Pay\").item(0))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So, what have we gained by defining the `convert_pay_string_to_number` function? \n", - "Well, without it, we'd have to copy that `10**6 * float(pay_string.strip(\"$\"))` stuff each time we wanted to convert a pay string. Now we just call a function whose name says exactly what it's doing.\n", - "\n", - "Soon, we'll see how to apply this function to every pay string in a single expression. First, let's take a brief detour and introduce `interact`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using `interact`\n", - "\n", - "We've included a nifty function called `interact` that allows you to\n", - "call a function with different arguments.\n", - "\n", - "To use it, call `interact` with the function you want to interact with as the\n", - "first argument, then specify a default value for each argument of the original\n", - "function like so:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "_ = interact(convert_pay_string_to_number, pay_string='$42')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can now change the value in the textbox to automatically call\n", - "`convert_pay_string_to_number` with the argument you enter in the `pay_string`\n", - "textbox. For example, entering in `'$49'` in the textbox will display the result of\n", - "running `convert_pay_string_to_number('$49')`. Neat!\n", - "\n", - "Note that we'll never ask you to write the `interact` function calls yourself as\n", - "part of a question. However, we'll include it here and there where it's helpful\n", - "and you'll probably find it useful to use yourself.\n", - "\n", - "Now, let's continue on and write more functions." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Defining functions\n", - "\n", - "Let's write a very simple function that converts a proportion to a percentage by multiplying it by 100. For example, the value of `to_percentage(.5)` should be the number 50. (No percent sign.)\n", - "\n", - "A function definition has a few parts.\n", - "\n", - "##### `def`\n", - "It always starts with `def` (short for **def**ine):\n", - "\n", - " def\n", - "\n", - "##### Name\n", - "Next comes the name of the function. 
Let's call our function `to_percentage`.\n", - " \n", - " def to_percentage\n", - "\n", - "##### Signature\n", - "Next comes something called the *signature* of the function. This tells Python how many arguments your function should have, and what names you'll use to refer to those arguments in the function's code. `to_percentage` should take one argument, and we'll call that argument `proportion` since it should be a proportion.\n", - "\n", - " def to_percentage(proportion)\n", - "\n", - "We put a colon after the signature to tell Python it's over.\n", - "\n", - " def to_percentage(proportion):\n", - "\n", - "##### Documentation\n", - "Functions can do complicated things, so you should write an explanation of what your function does. For small functions, this is less important, but it's a good habit to learn from the start. Conventionally, Python functions are documented by writing a triple-quoted string:\n", - "\n", - " def to_percentage(proportion):\n", - " \"\"\"Converts a proportion to a percentage.\"\"\"\n", - " \n", - " \n", - "##### Body\n", - "Now we start writing code that runs when the function is called. This is called the *body* of the function. We can write anything we could write anywhere else. First let's give a name to the number we multiply a proportion by to get a percentage.\n", - "\n", - " def to_percentage(proportion):\n", - " \"\"\"Converts a proportion to a percentage.\"\"\"\n", - " factor = 100\n", - "\n", - "##### `return`\n", - "The special instruction `return` in a function's body tells Python to make the value of the function call equal to whatever comes right after `return`. 
We want the value of `to_percentage(.5)` to be the proportion .5 times the factor 100, so we write:\n", - "\n", - " def to_percentage(proportion):\n", - " \"\"\"Converts a proportion to a percentage.\"\"\"\n", - " factor = 100\n", - " return proportion * factor\n", - "Note that `return` inside a function gives the function a value, while `print`, which we have used before, is a function which has no `return` value and just prints a certain value out to the console. The two are very different. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.1.**
Define `to_percentage` in the cell below. Call your function to convert the proportion .2 to a percentage. Name that percentage `twenty_percent`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "def ...\n", - " \"\"\" ... \"\"\"\n", - " ... = ...\n", - " return ...\n", - "\n", - "twenty_percent = ...\n", - "twenty_percent" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Like the built-in functions, you can use named values as arguments to your function.\n", - "\n", - "**Question 2.2.**
Use `to_percentage` again to convert the proportion named `a_proportion` (defined below) to a percentage called `a_percentage`.\n", - "\n", - "*Note:* You don't need to define `to_percentage` again! Just like other named things, functions stick around after you define them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "a_proportion = 2**(.5) / 2\n", - "a_percentage = ...\n", - "a_percentage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's something important about functions: the names assigned within a function body are only accessible within the function body. Once the function has returned, those names are gone. So even though you defined `factor = 100` inside `to_percentage` above and then called `to_percentage`, you cannot refer to `factor` anywhere except inside the body of `to_percentage`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# You should see an error when you run this. (If you don't, you might\n", - "# have defined factor somewhere above.)\n", - "factor" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we've seen with the built-in functions, functions can also take strings (or arrays, or tables) as arguments, and they can return those things, too.\n", - "\n", - "**Question 2.3.**
Define a function called `disemvowel`. It should take a single string as its argument. (You can call that argument whatever you want.) It should return a copy of that string, but with all the characters that are vowels removed. (In English, the vowels are the characters \"a\", \"e\", \"i\", \"o\", and \"u\".)\n", - "\n", - "*Hint:* To remove all the \"a\"s from a string, you can use `that_string.replace(\"a\", \"\")`. The `.replace` method for strings returns another string, so you can call `replace` multiple times, one after the other. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def disemvowel(a_string):\n", - " ...\n", - " ...\n", - "\n", - "# An example call to your function. (It's often helpful to run\n", - "# an example call from time to time while you're writing a function,\n", - "# to see how it currently works.)\n", - "disemvowel(\"Can you read this without vowels?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Alternatively, you can use interact to call your function\n", - "_ = interact(disemvowel, a_string='Hello world')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Calls on calls on calls\n", - "Just as you write a series of lines to build up a complex computation, it's useful to define a series of small functions that build on each other. Since you can write any code inside a function's body, you can call other functions you've written.\n", - "\n", - "If a function is like a recipe, defining a function in terms of other functions is like having a recipe for cake telling you to follow another recipe to make the frosting, and another to make the sprinkles. 
This makes the cake recipe shorter and clearer, and it avoids having a bunch of duplicated frosting recipes. It's a foundation of productive programming.\n", - "\n", - "For example, suppose you want to count the number of characters *that aren't vowels* in a piece of text. One way to do that is to remove all the vowels and count the size of the remaining string.\n", - "\n", - "**Question 2.4.**
Write a function called `num_non_vowels`. It should take a string as its argument and return a number. The number should be the number of characters in the argument string that aren't vowels.\n", - "\n", - "*Hint:* The function `len` takes a string as its argument and returns the number of characters in it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def num_non_vowels(a_string):\n", - " \"\"\"The number of characters in a string, minus the vowels.\"\"\"\n", - " ...\n", - "\n", - "# Try calling your function yourself to make sure the output is what\n", - "# you expect. You can also use the interact function if you'd like." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Functions can also encapsulate code that *does things* rather than just computing values. For example, if you call `print` inside a function, and then call that function, something will get printed.\n", - "\n", - "The `movies_by_year` dataset in the textbook has information about movie sales in recent years. Suppose you'd like to display the year with the 5th-highest total gross movie sales, printed in a human-readable way. 
You might do this:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "movies_by_year = Table.read_table(\"movies_by_year.csv\")\n", - "rank = 5\n", - "fifth_from_top_movie_year = movies_by_year.sort(\"Total Gross\", descending=True).column(\"Year\").item(rank-1)\n", - "print(\"Year number\", rank, \"for total gross movie sales was:\", fifth_from_top_movie_year)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After writing this, you realize you also wanted to print out the 2nd and 3rd-highest years. Instead of copying your code, you decide to put it in a function. Since the rank varies, you make that an argument to your function.\n", - "\n", - "**Question 2.5.**
Write a function called `print_kth_top_movie_year`. It should take a single argument, the rank of the year (like 2, 3, or 5 in the above examples). It should print out a message like the one above. It shouldn't have a `return` statement." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "def print_kth_top_movie_year(k):\n", - " # Our solution used 2 lines.\n", - " ...\n", - " ...\n", - "\n", - "# Example calls to your function:\n", - "print_kth_top_movie_year(2)\n", - "print_kth_top_movie_year(3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# interact also allows you to pass in an array for a function argument. It will\n", - "# then present a dropdown menu of options.\n", - "_ = interact(print_kth_top_movie_year, k=np.arange(1, 10))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Print is not the same as Return\n", - "The `print_kth_top_movie_year(k)` function prints the year with the `k`th-highest total gross movie sales! However, since we did not return any value in this function, we cannot use that year after we call it. Let's look at an example of a function that prints a value but does not return it."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def print_number_five():\n", - " print(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print_number_five()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, if we try to use the output of `print_number_five()`, we see that we get an error when we try to add the number 5 to it!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print_number_five_output = print_number_five()\n", - "print_number_five_output + 5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It may seem that `print_number_five()` is returning a value, 5. In reality, it just displays the number 5 to you without giving you the actual value! If your function prints out a value without returning it and you try to use it, you will run into errors so be careful!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. `apply`ing functions\n", - "\n", - "Defining a function is a lot like giving a name to a value with `=`. 
In fact, a function is a value just like the number 1 or the text \"the\"!\n", - "\n", - "For example, we can make a new name for the built-in function `max` if we want:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "our_name_for_max = max\n", - "our_name_for_max(2, 6)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The old name for `max` is still around:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "max(2, 6)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Try just writing `max` or `our_name_for_max` (or the name of any other function) in a cell, and run that cell. Python will print out a (very brief) description of the function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "max" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Why is this useful? Since functions are just values, it's possible to pass them as arguments to other functions. Here's a simple but not-so-practical example: we can make an array of functions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "make_array(max, np.average, are.equal_to)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.1.**
Make an array containing any 3 other functions you've seen. Call it `some_functions`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "some_functions = ...\n", - "some_functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Working with functions as values can lead to some funny-looking code. For example, see if you can figure out why this works:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "make_array(max, np.average, are.equal_to).item(0)(4, -2, 7)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's a simpler example that's actually useful: the table method `apply`.\n", - "\n", - "`apply` calls a function many times, once on *each* element in a column of a table. It produces an array of the results. Here we use `apply` to convert every CEO's pay to a number, using the function you defined:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "raw_compensation.apply(convert_pay_string_to_number, \"Total Pay\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's an illustration of what that did:\n", - "\n", - "\"For\n", - "\n", - "Note that we didn't write something like `convert_pay_string_to_number()` or `convert_pay_string_to_number(\"Total Pay\")`. The job of `apply` is to call the function we give it, so instead of calling `convert_pay_string_to_number` ourselves, we just write its name as an argument to `apply`.\n", - "\n", - "**Question 3.2.**
Using `apply`, make a table that's a copy of `raw_compensation` with one more column called \"Total Pay (\\$)\". It should be the result of applying `convert_pay_string_to_number` to the \"Total Pay\" column, as we did above, and creating a new table which is the old one, but with the \"Total Pay\" column redone. Call the new table `compensation`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "compensation = raw_compensation.with_column(\n", - " \"Total Pay ($)\",\n", - " ...\n", - "compensation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have the pay in numbers, we can compute things about them.\n", - "\n", - "**Question 3.3.**
Compute the average total pay of the CEOs in the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "average_total_pay = ...\n", - "average_total_pay" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.4.**
Companies pay executives in a variety of ways: directly in cash; by granting stock or other \"equity\" in the company; or with ancillary benefits (like private jets). Compute the proportion of each CEO's pay that was cash. (Your answer should be an array of numbers, one for each CEO in the dataset.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "cash_proportion = ...\n", - "cash_proportion" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check out the \"% Change\" column in `compensation`. It shows the percentage increase in the CEO's pay from the previous year. For CEOs with no previous year on record, it instead says \"(No previous year)\". The values in this column are *strings*, not numbers, so like the \"Total Pay\" column, it's not usable without a bit of extra work.\n", - "\n", - "Given your current pay and the percentage increase from the previous year, you can compute your previous year's pay. For example, if your pay is \\$100 this year, and that's an increase of 50% from the previous year, then your previous year's pay was $\\frac{\\$100}{1 + \\frac{50}{100}}$, or around \\$66.66.\n", - "\n", - "**Question 3.5.**
Create a new table called `with_previous_compensation`. It should be a copy of `compensation`, but with the \"(No previous year)\" CEOs filtered out, and with an extra column called \"2014 Total Pay ($)\". That column should have each CEO's pay in 2014.\n", - "\n", - "*Hint:* This question takes several steps, but each one is still something you've seen before. Take it one step at a time, using as many lines as you need. You can print out your results after each step to make sure you're on the right track.\n", - "\n", - "*Hint 2:* You'll need to define a function. You can do that just above your other code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "# Definition to turn percent to number\n", - "def percent_string_to_num(percent_string):\n", - " return ...\n", - "\n", - "# Compensation table where there is a previous year\n", - "having_previous_year = ...\n", - "\n", - "# Get the percent changes as numbers instead of strings\n", - "percent_changes = ...\n", - "\n", - "# Calculate the previous years pay\n", - "previous_pay = ...\n", - "\n", - "# Put the previous pay column into the compensation table\n", - "with_previous_compensation = ...\n", - "\n", - "with_previous_compensation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.6.**
What was the average pay of these CEOs in 2014?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "average_pay_2014 = ...\n", - "average_pay_2014" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_6.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Histograms\n", - "Earlier, we computed the average pay among the CEOs in our 102-CEO dataset. The average doesn't tell us everything about the amounts CEOs are paid, though. Maybe just a few CEOs make the bulk of the money, even among these 102.\n", - "\n", - "We can use a *histogram* to display more information about a set of numbers. The table method `hist` takes a single argument, the name of a column of numbers. It produces a histogram of the numbers in that column.\n", - "\n", - "**Question 4.1.**
Make a histogram of the pay of the CEOs in `compensation`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.2.**
Looking at the histogram, how many CEOs made more than \\$30 million? (Answer the question by filling in your answer manually. You'll have to do a bit of arithmetic; feel free to use Python as a calculator.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "num_ceos_more_than_30_million = ..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 4.3.**
Answer the same question with code. *Hint:* Use the table method `where` and the property `num_rows`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "num_ceos_more_than_30_million_2 = ...\n", - "num_ceos_more_than_30_million_2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q4_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Submission" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Great job! :D You're finished with lab 4! Be sure to...\n", - "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", - "- **Review the notebook one last time, we will be grading the final state of your notebook after the deadline**,\n", - "- **Save and Checkpoint** from the `File` menu," - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab04.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/2/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb b/materials/x18/lab/2/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb 
deleted file mode 100644 index 69c51bb..0000000 --- a/materials/x18/lab/2/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb +++ /dev/null @@ -1,729 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 1: Simulations\n", - "\n", - "Welcome to Lab 1 of Data 8.2x! \n", - "\n", - "We will go over [iteration and simulations](https://www.inferentialthinking.com/chapters/10/sampling-and-empirical-distributions.html), as well as introduce the concept of [randomness](https://www.inferentialthinking.com/chapters/09/randomness.html).\n", - "\n", - "The data used in this lab will contain salary data and other statistics for basketball players from the 2014-2015 NBA season. This data was collected from the following sports analytic sites: [Basketball Reference](http://www.basketball-reference.com) and [Spotrac](http://www.spotrac.com).\n", - "\n", - "First, set up the tests and imports by running the cell below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "from datascience import *\n", - "\n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Nachos and Conditionals" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In Python, Boolean values can either be `True` or `False`. We get Boolean values when using comparison operators such as `<` (less than), `>` (greater than), and `==` (equal to). A list of common comparison operators can be found below!\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "3 > 1 + 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can even assign the result of a comparison operation to a variable." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "result = 10 / 2 == 5\n", - "result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Arrays are compatible with comparison operators. The output is an array of boolean values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "make_array(1, 5, 7, 8, 3, -1) > 3" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Waiting on the dining table just for you is a hot bowl of nachos! Let's say that whenever you take a nacho, it will have cheese, salsa, both, or neither (just a plain tortilla chip). \n", - "\n", - "Using the function call `np.random.choice(array_name)`, let's simulate taking nachos from the bowl at random. Start by running the cell below several times, and observe how the results change." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "nachos = make_array('cheese', 'salsa', 'both', 'neither')\n", - "np.random.choice(nachos)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.1**
Assume we took ten nachos at random, and stored the results in an array called `ten_nachos` as done below. Find the number of nachos with only cheese using code (do not hardcode the answer). \n", - "\n", - "*Hint:* Our solution involves a comparison operator and the `np.count_nonzero` method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "ten_nachos = make_array('neither', 'cheese', 'both', 'both', 'cheese', 'salsa', 'both', 'neither', 'cheese', 'both')\n", - "number_cheese = ...\n", - "number_cheese" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Conditional Statements**\n", - "\n", - "A conditional statement is made up of many lines that allow Python to choose from different alternatives based on whether some condition is true.\n", - "\n", - "Here is a basic example.\n", - "\n", - "```\n", - "def sign(x):\n", - " if x > 0:\n", - " return 'Positive'\n", - "```\n", - "\n", - "How the function works is if the input `x` is greater than `0`, we get the string `'Positive'` back.\n", - "\n", - "If we want to test multiple conditions at once, we use the following general format.\n", - "\n", - "```\n", - "if :\n", - " \n", - "elif :\n", - " \n", - "elif :\n", - " \n", - "...\n", - "else:\n", - " \n", - "```\n", - "\n", - "Only one of the bodies will ever be executed. Each `if` and `elif` expression is evaluated and considered in order, starting at the top. As soon as a true value is found, the corresponding body is executed, and the rest of the expression is skipped. If none of the `if` or `elif` expressions are true, then the `else body` is executed. For more examples and explanation, refer to [Section 9.1](https://www.inferentialthinking.com/chapters/09/1/conditional-statements.html)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.2**
Complete the following conditional statement so that the string `'More please'` is assigned to `say_please` if the number of nachos with cheese in `ten_nachos` is less than `5`.\n", - "*Hint*: You should not have to reference the variable `ten_nachos`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "say_please = '?'\n", - "\n", - "if ...:\n", - " say_please = 'More please'\n", - " \n", - "say_please" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.3**
Write a function called `nacho_reaction` that returns a string based on the type of nacho passed in as an argument. From top to bottom, the conditions should correspond to: `'cheese'`, `'salsa'`, `'both'`, `'neither'`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "def nacho_reaction(nacho):\n", - " if ...:\n", - " return 'Cheesy!'\n", - " # next condition should return 'Spicy!'\n", - " ...\n", - " # next condition should return 'Wow!'\n", - " ...\n", - " # next condition should return 'Meh.'\n", - " ...\n", - "\n", - "spicy_nacho = nacho_reaction('salsa')\n", - "spicy_nacho" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.4**
Add a column `'Reactions'` to the table `ten_nachos_reactions` that consists of reactions for each of the nachos in `ten_nachos`. \n", - "\n", - "*Hint:* Use the `apply` method. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "ten_nachos_reactions = Table().with_column('Nachos', ten_nachos)\n", - "...\n", - "ten_nachos_reactions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.5**
Using code, find the number of `'Wow!'` reactions for the nachos in `ten_nachos_reactions`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "number_wow_reactions = ...\n", - "number_wow_reactions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Simulations and For Loops\n", - "Using a `for` statement, we can perform a task multiple times. This is known as iteration. Here, we'll simulate drawing different suits from a deck of cards. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "suits = make_array(\"♤\", \"♡\", \"♢\", \"♧\")\n", - "\n", - "draws = make_array()\n", - "\n", - "repetitions = 6\n", - "\n", - "for i in np.arange(repetitions):\n", - " draws = np.append(draws, np.random.choice(suits))\n", - "\n", - "draws" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The unrolled version of this `for` loop can be found below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "draws = make_array()\n", - "\n", - "draws = np.append(draws, np.random.choice(suits))\n", - "draws = np.append(draws, np.random.choice(suits))\n", - "draws = np.append(draws, np.random.choice(suits))\n", - "draws = np.append(draws, np.random.choice(suits))\n", - "draws = np.append(draws, np.random.choice(suits))\n", - "draws = np.append(draws, np.random.choice(suits))\n", - "\n", - "draws" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the example above, the `for` loop appends a random draw to the `draws` array for every number in `np.arange(repetitions)`. 
\n", - "\n", - "Here's a nice way to think of what we did above. We had a deck of 4 cards of different suits, we randomly drew one card, saw the suit, kept track of it in `draws`, and put the card back into the deck. We repeated this for a total of 6 times without having to repeat code, thanks to the `for` loop. We simulated this experiment using a `for` loop. \n", - "\n", - "Another use of iteration is to loop through a set of values. For instance, we can print out all of the colors of the rainbow.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "rainbow = make_array(\"red\", \"orange\", \"yellow\", \"green\", \"blue\", \"indigo\", \"violet\")\n", - "\n", - "for color in rainbow:\n", - " print(color)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the indented part of the `for` loop, known as the body, is executed once for each item in `rainbow`. Note that the name `color` is arbitrary; we could easily have named it something else. The important thing is we stay consistent throughout the for loop. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "for another_name in rainbow:\n", - " print(another_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In general, however, we would like the variable name to be somewhat informative. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.1**
Clay is playing darts. His dartboard contains ten equal-sized zones with point values from 1 to 10. Write code that simulates his total score after 1000 dart tosses. Make sure to use a `for` loop.\n", - "\n", - "*Hint:* There are three steps to this problem (and most simulations): \n", - "1. Deciding the possible values you can take in the experiment (point values in this case)\n", - "2. Running through the experiment a certain amount of times (running through 1000 dart tosses, and randomly getting a value per toss in this case)\n", - "3. Keeping track of the total information of each time you ran through the experiment (the total score in this case)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "possible_point_values = ...\n", - "tosses = 1000\n", - "total_score = ...\n", - "\n", - "total_score" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.2**
In the following cell, we've loaded the text of _Pride and Prejudice_ by Jane Austen, split it into individual words, and stored these words in an array. Using a `for` loop, assign `longer_than_five` to the number of words in the novel that are more than 5 letters long.\n", - "\n", - "*Hint*: You can find the number of letters in a word with the `len` function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "austen_string = open('Austen_PrideAndPrejudice.txt', encoding='utf-8').read()\n", - "p_and_p_words = np.array(austen_string.split())\n", - "\n", - "longer_than_five = ...\n", - "\n", - "# a for loop would be useful here\n", - "\n", - "\n", - "longer_than_five" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.3**
Using simulation with 10,000 trials, assign `chance_of_all_different` to an estimate of the chance that if you pick three words from Pride and Prejudice uniformly at random (with replacement), they all have different lengths. \n", - "\n", - "*Hint*: Remember that `!=` only checks for non-equality between two items, not three. However, you can use `!=` more than once in the same line. \n", - "\n", - "For example, `2 != 3 != 4` first checks for non-equality between `2` and `3`, then `3` and `4`, but NOT `2` and `4`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "trials = 10000\n", - "different = ...\n", - "\n", - "for ... in ...:\n", - " ...\n", - "\n", - "chance_of_all_different = ...\n", - "\n", - "chance_of_all_different" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Finding Probabilities\n", - "After a long day of class, Clay decides to go to Crossroads for dinner. Today's menu has Clay's four favorite foods: enchiladas, hamburgers, pizza, and spaghetti. However, each dish has a 30% chance of running out before Clay can get to Crossroads." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.1**
What is the probability that Clay will be able to eat pizza at Crossroads?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "pizza_prob = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.2**
What is the probability that Clay will be able to eat all four of these foods at Crossroads?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "all_prob = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.3**
What is the probability that Crossroads will have run out of something before Clay can get there?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "something_is_out = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To make up for their unpredictable food supply, Crossroads decides to hold a contest for some free Cal Dining swag. There is a bag with two red marbles, two green marbles, and two blue marbles. Clay has to draw three marbles separately. In order to win, all three of these marbles must be of different colors." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.4**
What is the probability of Clay winning the contest?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "winning_prob = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submission" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.o import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab01.ipynb', glob.glob('tests/q*.py')))" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/2/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb b/materials/x18/lab/2/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb deleted file mode 100644 index c1f5560..0000000 --- a/materials/x18/lab/2/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb +++ /dev/null @@ -1,664 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 2: Sampling\n", - "\n", - "Welcome to Lab 2! In this lab, we will learn about sampling strategies. 
More information about sampling in the textbook can be found [here!](https://www.inferentialthinking.com/chapters/10/sampling-and-empirical-distributions.html)\n", - "\n", - "The data used in this lab will contain salary data and statistics for basketball players from the 2014-2015 NBA season. This data was collected from [basketball-reference](http://www.basketball-reference.com) and [spotrac](http://www.spotrac.com)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell, but please don't change it.\n", - "\n", - "# These lines import the Numpy and Datascience modules.\n", - "import numpy as np\n", - "from datascience import *\n", - "\n", - "# These lines do some fancy plotting magic\n", - "import matplotlib\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plots\n", - "plots.style.use('fivethirtyeight')\n", - "\n", - "# Don't change this cell; just run it. \n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Dungeons and Dragons and Sampling\n", - "In the game Dungeons & Dragons, each player plays the role of a fantasy character.\n", - "\n", - "A player performs actions by rolling a 20-sided die, adding a \"modifier\" number to the roll, and comparing the total to a threshold for success. The modifier depends on her character's competence in performing the action.\n", - "\n", - "For example, suppose Alice's character, a barbarian warrior named Roga, is trying to knock down a heavy door. She rolls a 20-sided die, adds a modifier of 11 to the result (because her character is good at knocking down doors), and succeeds if the total is greater than 15.\n", - "\n", - "** Question 1.1 **
Write code that simulates that procedure. Compute three values: the result of Alice's roll (`roll_result`), the result of her roll plus Roga's modifier (`modified_result`), and a boolean value indicating whether the action succeeded (`action_succeeded`). **Do not fill in any of the results manually**; the entire simulation should happen in code.\n", - "\n", - "*Hint:* A roll of a 20-sided die is a number chosen uniformly from the array `make_array(1, 2, 3, 4, ..., 20)`. So a roll of a 20-sided die *plus 11* is a number chosen uniformly from that array, plus 11." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "possible_rolls = ...\n", - "roll_result = ...\n", - "modified_result = ...\n", - "action_succeeded = ...\n", - "\n", - "# The next line just prints out your results in a nice way\n", - "# once you're done. You can delete it if you want.\n", - "print(\"On a modified roll of {:d}, Alice's action {}.\".format(modified_result, \"succeeded\" if action_succeeded else \"failed\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.2 **
Run your cell 7 times to manually estimate the chance that Alice succeeds at this action. (Don't use math or an extended simulation.). Your answer should be a fraction. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "rough_success_chance = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Suppose we don't know that Roga has a modifier of 11 for this action. Instead, we observe the modified roll (that is, the die roll plus the modifier of 11) from each of 7 of her attempts to knock down doors. We would like to estimate her modifier from these 7 numbers.\n", - "\n", - "** Question 1.3 **
Write a Python function called `simulate_observations`. It should take no arguments, and it should return an array of 7 numbers. Each of the numbers should be the modified roll from one simulation. **Then**, call your function once to compute an array of 7 simulated modified rolls. Name that array `observations`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "modifier = 11\n", - "num_observations = 7\n", - "\n", - "def simulate_observations():\n", - " \"\"\"Produces an array of 7 simulated modified die rolls\"\"\"\n", - " ...\n", - "\n", - "observations = ...\n", - "observations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.4 **
Draw a histogram to display the *probability distribution* of the modified rolls we might see.\n", - "\n", - "Question 1.4 does not have an autograder test, so it is not graded and not in the overall lab grade." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# We suggest using these bins.\n", - "roll_bins = np.arange(1, modifier+2+20, 1)\n", - "\n", - "...\n", - "np.arange(1+modifier, 20+modifier+1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Your histogram should have values 12 to 31 each with a probability of 5%.\n", - "\n", - "Now let's imagine we don't know the modifier and try to estimate it from `observations`.\n", - "\n", - "One straightforward (but clearly suboptimal) way to do that is to find the *smallest* total roll, since the smallest roll on a 20-sided die is 1.\n", - "\n", - "** Question 1.5 **
Using that method, estimate `modifier` from `observations`. Name your estimate `min_estimate`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "min_estimate = ...\n", - "min_estimate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Another way to estimate the modifier involves the mean of `observations`.\n", - "\n", - "** Question 1.6 **
Figure out a good estimate based on that quantity. \n", - "\n", - "**Then**, write a function named `mean_based_estimator` that computes your estimate. It should take an array of modified rolls (like the array `observations`) as its argument and return an estimate of `modifier` based on those numbers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def mean_based_estimator(nums):\n", - " \"\"\"Estimate the roll modifier based on observed modified rolls in the array nums.\"\"\"\n", - " ...\n", - "\n", - "# Here is an example call to your function. It computes an estimate\n", - "# of the modifier from our 7 observations.\n", - "mean_based_estimate = mean_based_estimator(observations)\n", - "mean_based_estimate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_6.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Sampling Basketball Data\n", - "\n", - "Run the cell below to load the player and salary data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "player_data = Table().read_table(\"player_data.csv\")\n", - "salary_data = Table().read_table(\"salary_data.csv\")\n", - "full_data = salary_data.join(\"PlayerName\", player_data, \"Name\")\n", - "# The show method immediately displays the contents of a table. \n", - "# This way, we can display the top of two tables using a single cell.\n", - "player_data.show(3)\n", - "salary_data.show(3)\n", - "full_data.show(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Rather than getting data on every player, imagine that we had gotten data on only a smaller subset of the players. 
For 492 players, it's not so unreasonable to expect to see all the data, but usually we aren't so lucky. Instead, we often make *statistical inferences* about a large underlying population using a smaller sample.\n", - "\n", - "A statistical inference is a statement about some statistic of the underlying population, such as \"the average salary of NBA players in 2014 was $3\". You may have heard the word \"inference\" used in other contexts. It's important to keep in mind that statistical inferences, unlike, say, logical inferences, can be wrong.\n", - "\n", - "A general strategy for inference using samples is to estimate statistics of the population by computing the same statistics on a sample. This strategy sometimes works well and sometimes doesn't. The degree to which it gives us useful answers depends on several factors, and we'll touch lightly on a few of those today.\n", - "\n", - "One very important factor in the utility of samples is how they were gathered. We have prepared some example sample datasets to simulate inference from different kinds of samples for the NBA player dataset. Later we'll ask you to create your own samples to see how they behave." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To save typing and increase the clarity of your code, we will package the loading and analysis code into two functions. This will be useful in the rest of the lab as we will repeatedly need to create histograms and collect summary statistics from that data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.1**.
Complete the `histograms` function, which takes a table with columns `Age` and `Salary` and draws a histogram for each one. Use the min and max functions to pick the bin boundaries so that all data appears for any table passed to your function. Use the same bin widths as before (1 year for `Age` and $1,000,000 for `Salary`).\n", - "\n", - "*Hint*: When creating the bins for the the histograms, think critically about what the stop argument should be for `np.arange`. Histograms are inclusive on the left hand side of the interval, but not the right. So, if we have a maximum age of 80, we need a 80-81 bin in order to capture this in the histogram. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def histograms(t):\n", - " ages = t.column('Age')\n", - " salaries = t.column('Salary')\n", - " age_bins = ...\n", - " salary_bins = ...\n", - " t.hist('Age', bins=age_bins, unit='year')\n", - " t.hist('Salary', bins=salary_bins, unit='$')\n", - " return age_bins # Keep this statement so that your work can be checked\n", - " \n", - "histograms(full_data)\n", - "print('Two histograms should be displayed below')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_1.py') # Warning: Charts will be displayed while running this test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.2**.
Create a function called `compute_statistics` that takes a Table containing ages and salaries and:\n", - "- Draws a histogram of ages\n", - "- Draws a histogram of salaries\n", - "- Returns a two-element array containing the average age and average salary\n", - "\n", - "You can call your `histograms` function to draw the histograms!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def compute_statistics(age_and_salary_data):\n", - " ...\n", - " age = ...\n", - " salary = ...\n", - " ...\n", - " \n", - "\n", - "full_stats = compute_statistics(full_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_2.py') # Warning: Charts will be displayed while running this test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Convenience sampling\n", - "One sampling methodology, which is **generally a bad idea**, is to choose players who are somehow convenient to sample. For example, you might choose players from one team that's near your house, since it's easier to survey them. This is called, somewhat pejoratively, *convenience sampling*.\n", - "\n", - "Suppose you survey only *relatively new* players with ages less than 22. (The more experienced players didn't bother to answer your surveys about their salaries.)\n", - "\n", - "**Question 2.3**
Assign `convenience_sample_data` to a subset of `full_data` that contains only the rows for players under the age of 22." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "convenience_sample = ...\n", - "convenience_sample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.4**
Assign `convenience_stats` to a list of the average age and average salary of your convenience sample, using the `compute_statistics` function. Since they're computed on a sample, these are called *sample averages*. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "convenience_stats = ...\n", - "convenience_stats" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we'll compare the convenience sample salaries with the full data salaries in a single histogram. To do that, we'll need to use the `bin_column` option of the `hist` method, which indicates that all columns are counts of the bins in a particular column. The following cell should not require any changes; just run it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def compare_salaries(first, second, first_title, second_title):\n", - " \"\"\"Compare the salaries in two tables.\"\"\"\n", - " max_salary = max(np.append(first.column('Salary'), second.column('Salary')))\n", - " bins = np.arange(0, max_salary+1e6+1, 1e6)\n", - " first_binned = first.bin('Salary', bins=bins).relabeled(1, first_title)\n", - " second_binned = second.bin('Salary', bins=bins).relabeled(1, second_title)\n", - " first_binned.join('bin', second_binned).hist(bin_column='bin')\n", - "\n", - "compare_salaries(full_data, convenience_sample, 'All Players', 'Convenience Sample')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Simple random sampling\n", - "A more principled approach is to sample uniformly at random from the players. 
If we ensure that each player is selected at most once, this is a *simple random sample without replacement*, sometimes abbreviated to \"simple random sample\" or \"SRSWOR\". Imagine writing down each player's name on a card, putting the cards in an urn, and shuffling the urn. Then, pull out cards one by one and set them aside, stopping when the specified *sample size* is reached.\n", - "\n", - "We've produced two samples of the `salary_data` table in this way: `small_srswor_salary.csv` and `large_srswor_salary.csv` contain, respectively, a sample of size 44 (the same as the convenience sample) and a larger sample of size 100. \n", - "\n", - "The `load_data` function below loads a salary table and joins it with `player_data`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def load_data(salary_file):\n", - " return player_data.join('Name', Table.read_table(salary_file), 'PlayerName')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.5**
Run the same analyses on the small and large samples that you previously ran on the full dataset and on the convenience sample. Compare the accuracy of the estimates of the population statistics that we get from the convenience sample, the small simple random sample, and the large simple random sample. (Just notice this for yourself -- the autograder will check your sample statistics but will not validate whatever you do to compare.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Original:\n", - "small_srswor_data = ...\n", - "small_stats = ...\n", - "large_srswor_data = ...\n", - "large_stats = ...\n", - "print('Full data stats: ', full_stats)\n", - "print('Small simple random sample stats:', small_stats)\n", - "print('Large simple random sample stats:', large_stats)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "check('tests/q2_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Producing simple random samples\n", - "Often it's useful to take random samples even when we have a larger dataset available. One reason is to help us understand how inaccurate other samples are.\n", - "\n", - "Tables provide the method `sample()` for producing random samples. Note that its default is to sample with replacement. To see how to call `sample()`, search the documentation on the [datascience documentation](http://data8.org/datascience/) of the course website, or enter `full_data.sample?` into a code cell and press Shift + Enter." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.6**
\n", - "Produce a simple random sample of size 44 from `full_data`. (You don't need to bother with a join this time -- just use `full_data.sample(...)` directly. That will have the same result as sampling from `salary_data` and joining with `player_data`.) Run your analysis on it again and think about these following questions.\n", - "- Are your results roughly similar to those in the small sample we provided you? Run your code several times to get new samples. \n", - "- How much does the average age change across samples? \n", - "- What about average salary?\n", - "\n", - "Question 2.6 does not have an autograder test, so it is not graded and not in the overall lab grade." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "my_small_srswor_data = ...\n", - "my_small_stats = ...\n", - "my_small_stats" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that the results are similar, but not the same, to the sample we were given. The average age tends to stay around the same value as there is a limited range of ages for NBA players, but the salary changes by a sizeable factor due to larger variability in salary." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.7**
As in the previous question, analyze several simple random samples of size 100 from `full_data`. \n", - "- Do the histogram statistics seem to change more or less across samples of 100 than across samples of size 44? \n", - "- Are the sample averages and histograms closer to their true values for age or for salary? What did you expect to see?\n", - "\n", - "Question 2.7 does not have an autograder test, so it is not graded and not in the overall lab grade." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "my_large_srswor_data = ...\n", - "..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The average and histogram statistics seem to change less across samples of this size. They are closer to their true values, which is what we'd expect to see because we are sampling a larger subset of the population. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submission" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You're finished with lab 2! In order to successfully submit your assignment, follow these steps...\n", - "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. Please do this first before running the cell below,\n", - "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", - "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab02.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/2/lab05/.ipynb_checkpoints/lab05-checkpoint.ipynb b/materials/x18/lab/2/lab05/.ipynb_checkpoints/lab05-checkpoint.ipynb deleted file mode 100644 index 42d7cd6..0000000 --- a/materials/x18/lab/2/lab05/.ipynb_checkpoints/lab05-checkpoint.ipynb +++ /dev/null @@ -1,667 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 5: Resampling and the Bootstrap\n", - "\n", - "Welcome to Lab 5!\n", - "\n", - "We will attempt to estimate the number `N`, a *population parameter*, that represents the number of elements in a population. We get to observe a uniform random sample of the elements, and for each one we can observe its serial number (from 1 to `N`). All elements are labeled with consecutive serial numbers from 1 to `N`, so `N` is the total number of elements. \n", - "\n", - "Given *just* a random sample of elements, we'll estimate `N`, and then we'll use simulation to find a confidence interval around our estimate, all without ever looking at the whole population. 
This is an example of *statistical inference*.\n", - "\n", - "As usual, **run the cell below** to prepare the lab and the automatic tests." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell to set up the notebook, but please don't change it.\n", - "\n", - "# These lines import the Numpy and Datascience modules.\n", - "import numpy as np\n", - "from datascience import *\n", - "\n", - "# These lines do some fancy plotting magic.\n", - "import matplotlib\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "plt.style.use('fivethirtyeight')\n", - "import warnings\n", - "warnings.simplefilter('ignore', UserWarning)\n", - "\n", - "# Don't change this cell; just run it. \n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Preliminaries\n", - "The setup: We want to know the number of elements in the population. That number is `N`. Each element is numbered from 1 to `N`.\n", - "\n", - "We only see a small number of elements (assumed to be a uniform random sample with replacement from among all the elements), so we have to use estimation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.1\n", - "Is `N` a population parameter or a statistic? If we compute a number using our random sample that's an estimate of `N`, is that a population parameter or a statistic?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Write your answer here, replacing this text.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check your answer by posting on the discussion forum.\n", - "\n", - "To make the situation realistic, we're going to hide the true number of elements from you. 
You'll have access only to this random sample:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "observations = Table.read_table(\"serial_numbers.csv\")\n", - "num_observations = observations.num_rows\n", - "observations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.2\n", - "Define a function named `plot_serial_numbers` to make a histogram of any table of serial numbers. It should take one argument, a table like `observations` with one column called `\"serial number\"`. It should plot a histogram of the values in the column **using bins of width 1** ranging from **1 to 200** but return nothing. Then, call that function to make a histogram of `observations`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def plot_serial_numbers(numbers):\n", - " ...\n", - " \n", - " # Assuming the lines above produce a histogram, this next\n", - " # line may make your histograms look nicer. Feel free to\n", - " # delete it if you want.\n", - " plt.ylim(0, .25)\n", - "\n", - "plot_serial_numbers(observations)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.3\n", - "By looking at the histogram, what can we say about `N` immediately? (Hint: What is the relationship between `N` and the largest serial number in `observations`?) What does each little bar in the histogram represent? Why are all the bars the same height?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Write your answer here, replacing this text.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.4\n", - "One way to estimate `N` is to take twice the mean of the serial numbers we observe. Write a function that computes that statistic. 
It should take as its argument an array of serial numbers and return twice their mean. Call it `mean_based_estimator`. \n", - "\n", - "After that, use it to compute an estimate of `N` called `mean_based_estimate`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def mean_based_estimator(nums):\n", - " ...\n", - "\n", - "mean_based_estimate = ...\n", - "mean_based_estimate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.5\n", - "We can also estimate `N` using the biggest serial number in the sample. Compute it, giving it the name `max_estimate`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "max_estimate = ...\n", - "max_estimate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.6\n", - "Look at the values of `max_estimate` and `mean_based_estimate` that we happened to get for our dataset. The value of `max_estimate` tells you something about `mean_based_estimate`. For these specific values, is it possible for our value of `mean_based_estimate` to be equal to `N` (at least, if we round it to the nearest integer)? If not, is it definitely higher, definitely lower, or can we not tell? Can you make a statement like the value of our \"`mean_based_estimate` is at least *[fill in a number]* away from `N`\"?" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Write your answer here, replacing this text.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check your answer by posting on the discussion forum.\n", - "\n", - "We can't just confidently proclaim that `max_estimate` or `mean_based_estimate` is equal to `N`. What if we're really far off? So we want to get a sense of the accuracy of our estimates." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Resampling\n", - "To do this, we'll use resampling. That is, we won't exactly simulate new observations. Rather we sample from our current sample, or \"resample\" the data.\n", - "\n", - "Why does that make any sense?\n", - "\n", - "When we tried to estimate `N`, we would have liked to use the whole population. Since we had only a sample, we used that to estimate `N` instead.\n", - "\n", - "This time, we would like to use the population of serial numbers to *run a simulation* about estimates of `N`. But we still only have our sample. We use our sample in place of the population to run the simulation.\n", - "\n", - "So there is a simple analogy between estimating `N` and simulating the variability of estimates.\n", - "\n", - "$$\\text{computing }N\\text{ from the population}$$\n", - "$$:$$\n", - "$$\\text{computing an estimate of }N\\text{ from a sample}$$\n", - "\n", - "$$\\text{as}$$\n", - "\n", - "$$\\text{simulating the distribution of estimates of }N\\text{ using samples from the population}$$\n", - "$$:$$\n", - "$$\\text{simulating an (approximate) distribution of estimates of }N\\text{ using resamples from a sample}$$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 2.1\n", - "Write a function called `simulate_resample`. It should generate a resample from the observed serial numbers in `observations` and return that resample. (The resample should be a table like `observations`.) 
It should take no arguments." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def simulate_resample():\n", - " ..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's make one resample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# This line is a little magic to make sure that you see the same results\n", - "# we did.\n", - "np.random.seed(123)\n", - "\n", - "one_resample = simulate_resample()\n", - "one_resample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Later, we'll use many resamples at once to see what estimates typically look like. We don't often pay attention to single resamples, so it's easy to misunderstand them. Let's examine some individual resamples before we start using them." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 2.2\n", - "In preparation for answering the next question, generate a histogram of your resample using the plotting function you defined earlier in this lab, **and** generate a separate histogram of the original observations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "...\n", - "..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 2.3\n", - "Which of the following are true:\n", - "1. In the plot of the resample, there are no bars at locations that weren't there in the plot of the original observations.\n", - "2. In the plot of the original observations, there are no bars at locations that weren't there in the plot of the resample.\n", - "3. 
The resample has exactly one copy of each serial number.\n", - "4. The sample has exactly one copy of each serial number.\n", - "\n", - "Assign true_statements to a list of the correct statements." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "true_statements = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 2.4\n", - "Create two more resamples using the function `simulate_resample` from above. For each resampled data, plot it and compute its max- and mean-based estimates." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "resample_0 = ...\n", - "...\n", - "mean_based_estimate_0 = ...\n", - "max_based_estimate_0 = ...\n", - "print(\"Mean-based estimate for resample 0:\", mean_based_estimate_0)\n", - "print(\"Max-based estimate for resample 0:\", max_based_estimate_0)\n", - "\n", - "resample_1 = ...\n", - "...\n", - "mean_based_estimate_1 = ...\n", - "max_based_estimate_1 = ...\n", - "print(\"Mean-based estimate for resample 1:\", mean_based_estimate_1)\n", - "print(\"Max-based estimate for resample 1:\", max_based_estimate_1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You may find that the max-based estimates from the resamples are both exactly 135. 
You will probably find that the two mean-based estimates do differ from the sample mean-based estimate (and from each other).\n", - "\n", - "#### Question 2.5\n", - "Using probability that you've learned, compute the exact chance that a max-based estimate from *one* resample is 135.\n", - "\n", - "Using your intuition, explain why a mean-based estimate from a resample is less often exactly equal to the mean-based estimate from the original sample as compared to a max-based estimate.\n", - "\n", - "As a refresher, here are some rules of probability that may be helpful:\n", - "\n", - "- When all outcomes are equally likely: P(event happens) $=$ $\\frac{\\text{# outcomes that make event happen}}{\\text{# of all outcomes}}$\n", - "\n", - "- When an event can happen in 2 ways: P(event) $=$ P(event happening first way) $+$ P(event happening second way)\n", - "\n", - "- When 2 events must both happen: P(2 events both happen) $=$ P(one event happens) $*$ P(other event happens, given the first one happened)\n", - "\n", - "- When an event doesn't happen: P(event doesn't happen) $=$ 1 $-$ P(event does happen)\n", - "\n", - "- P(at least one success) $= 1 - $ P(no successes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Write your answer here, replacing this text.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Discuss your answers on the edX discussion forums. If you have difficulty with the probability calculation, ask for help; don't stay stuck on it for too long." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Simulating with resampling\n", - "\n", - "**Note**: *The last part of this lab is difficult to check automatically, so it will not be graded. We strongly suggest that you try to complete it. 
We will release solutions to this lab so that you can compare to them.*\n", - "\n", - "Since resampling from a sample is very similar to sampling from a population, the code should look almost the same. That means we can write a function that simulates either sampling from a population or resampling from a sample. If we pass it a population as its argument, it will do the former; if we pass it a sample, it will do the latter.\n", - "\n", - "#### Question 3.1\n", - "Write a function called `simulate_estimates`. It should take 4 arguments:\n", - "1. A table from which the data should be sampled. The table will have 1 column named `\"serial number\"`.\n", - "2. The size of each sample from that table, an integer. (For example, to do resampling, we would pass for this argument the number of rows in the table.)\n", - "3. A function that computes a statistic of a sample. This argument is a *function* that takes an array of serial numbers as its argument and returns a number.\n", - "4. The number of replications to perform.\n", - "\n", - "It should simulate many samples with replacement from the given table. (The number of samples is the 4th argument.) For each of those samples, it should compute the statistic on that sample. Then it should return an array containing each of those statistics. 
The code below provides an example use of your function and describes how you can verify that you've written it correctly.\n", - "\n", - "**Hint**: Your implementation should contain the following line, which extracts the \"serial number\" column from some table ``t`` and calls the `statistic` function on it, storing the result in the name `s`.\n", - "\n", - "``s = statistic(t.column(\"serial number\"))``" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "def simulate_estimates(original_table, sample_size, statistic, num_replications):\n", - " # Our implementation of this function took 5 short lines of code.\n", - " ...\n", - "\n", - "# This should generate an empirical histogram of twice-mean estimates\n", - "# of N from samples of size 50 if N is 1000. This should be a bell-shaped\n", - "# curve centered at 1000 with most of its mass in [800, 1200]. To verify your\n", - "# answer, make sure that's what you see!\n", - "example_estimates = simulate_estimates(\n", - " Table().with_column(\"serial number\", np.arange(1, 1000+1)),\n", - " 50,\n", - " mean_based_estimator,\n", - " 10000)\n", - "Table().with_column(\"mean-based estimate\", example_estimates).hist(bins=np.arange(0, 1500, 25))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can go back to the sample we actually observed (the table `observations`) and estimate how much our mean-based estimate of `N` would have varied from sample to sample." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 3.2\n", - "Using the bootstrap and the sample `observations`, simulate the approximate distribution of *mean-based estimates* of `N`. Use 5,000 replications. \n", - "We have provided code that plots a histogram, allowing you to visualize the simulated estimates." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "bootstrap_estimates = ...\n", - "Table().with_column(\"mean-based estimate\", bootstrap_estimates).hist(bins=np.arange(0, 200, 4)) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 3.3\n", - "Compute an interval that covers the middle 95% of the bootstrap estimates. Verify that your interval looks like it covers 95% of the area in the histogram above." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "left_end = ...\n", - "right_end = ...\n", - "print(\"Middle 95% of bootstrap estimates: [{:f}, {:f}]\".format(left_end, right_end))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 3.4\n", - "Your mean-based estimate of `N` should have been around 122. Given the above calculations, is it likely that `N` is exactly 122? If not, what is the typical range of values of the mean-based estimates of `N` for samples of size 17?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Write your answer here, replacing this text.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check your solutions with someone on the edX discussion forums" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 3.5\n", - "`N` was actually 150! Write code that simulates the sampling and bootstrapping process again, as follows:\n", - "\n", - "1. Generate a new set of random observations by sampling from the population table we have created for you below. \n", - "2. Compute an estimate of `N` from these new observations, using `mean_based_estimator`.\n", - "3. Using only the new observations, compute 5,000 bootstrap estimates of `N`.\n", - "4. 
Plot these bootstrap estimates and compute an interval covering the middle 95%." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "population = Table().with_column(\"serial number\", np.arange(1, 150+1))\n", - "\n", - "new_observations = ...\n", - "new_mean_based_estimate = ...\n", - "new_bootstrap_estimates = ...\n", - "...\n", - "new_left_end = ...\n", - "new_right_end = ...\n", - "\n", - "print(\"New mean-based estimate: {:f}\".format(new_mean_based_estimate))\n", - "print(\"Middle 95% of bootstrap estimates: [{:f}, {:f}]\".format(new_left_end, new_right_end))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 3.6\n", - "Does the interval covering the middle 95% of the new bootstrap estimates include `N`? If you ran that cell many times, what is the probability that it will include `N`?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Write your answer here, replacing this text.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check your solutions with someone on the edX discussion forums." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submission" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Congratulations! You're finished with lab 5 and Data 8.2x! In order to successfully submit your assignment, follow these steps...\n", - "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. Please do this first before running the cell below,\n", - "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", - "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab05.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/2/lab06/.ipynb_checkpoints/lab06-checkpoint.ipynb b/materials/x18/lab/2/lab06/.ipynb_checkpoints/lab06-checkpoint.ipynb deleted file mode 100644 index d373c5f..0000000 --- a/materials/x18/lab/2/lab06/.ipynb_checkpoints/lab06-checkpoint.ipynb +++ /dev/null @@ -1,881 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 6: Conditional Probability\n", - "\n", - "This lab is an introduction to conditional probabilities. \n", - "\n", - "The lab includes a visualization called an *icon array*. It's meant to be an instructional part of the lab to help build intuitions about conditional probability. These visualizations do not appear in the textbook."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell to set up the notebook, but please don't change it.\n", - "\n", - "# These lines import the Numpy and Datascience modules.\n", - "import numpy as np\n", - "from datascience import *\n", - "\n", - "# These lines do some fancy plotting magic.\n", - "import matplotlib\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "plt.style.use('fivethirtyeight')\n", - "import warnings\n", - "warnings.simplefilter('ignore', UserWarning)\n", - "\n", - "# This line loads the visualization code for this lab.\n", - "import visualizations\n", - "\n", - "# Don't change this cell; just run it. \n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. What is conditional probability good for?\n", - "\n", - "Suppose we have a known population, like all dogs in California. So far, we've seen 3 ways of *predicting* something about an individual in that population, given incomplete knowledge about the identity of the individual:\n", - "\n", - "* If we know nothing about the individual dog, we could predict that its speed is the *average* or *median* of all the speeds in the population.\n", - "* If we know the dog's height but not its speed, we could use *linear regression* to predict its speed from its height. The resulting prediction is still imperfect, but it might be more accurate than the population average.\n", - "* If we know the dog's breed, height, and age, we could use *nearest-neighbor classification* (or *multiple regression*) to predict its speed by comparing to a collection of dogs with known speed.\n", - "\n", - "Computing conditional probabilities is a different way of making predictions. It differs in at least two important ways from the methods we've seen:\n", - "1. We will obtain a probability for each outcome \n", - "2. 
In the simple (but important) cases we'll look at today, conditional probabilities can be calculated exactly from assumptions, rather than being estimated from data. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Icon arrays\n", - "Parts 3 and 4 of this lab are about disease, but first let's start with a simple, contrived example.\n", - "\n", - "Imagine you are a marble. You don't know what you look like (since you obviously have no eyes), but you know that Samantha drew you **uniformly at random** from a bag that contained the following marbles:\n", - "* 4 large shiny marbles,\n", - "* 1 large dull marble,\n", - "* 6 small shiny marbles, and\n", - "* 2 small dull marbles." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.1.**
Knowing only what we've told you so far, what's the probability that you're a large shiny marble?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "probability_large_shiny = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q21.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's a table with those marbles:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "marbles = Table.read_table(\"marbles.csv\")\n", - "marbles.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here are the counts of each type of marble in a pivot table." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "marbles.pivot('surface', 'size')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here are all the different combinations of surface and size, with counts, where each type of marble appears in its own row." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "marbles.group(['surface', 'size'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've included some code to display something called an *icon array*. The functions in the cell below create icon arrays from various kinds of tables. Make sure to read the doc strings for each function so you understand what they do! Refer back to this cell later when you need to make an icon array." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell.\n", - "\n", - "#######################################################################\n", - "# The functions you'll need to actually use are in here. Each is a\n", - "# way of making an icon array from a differently-formatted table.\n", - "#######################################################################\n", - "\n", - "def display_icon_array(table, groups, individuals_name):\n", - " \"\"\"\n", - " Given a table and some columns to group it on, displays an icon array\n", - " of the groups.\n", - " \n", - " groups should be an array of labels of columns in table.\n", - " \n", - " individuals_name is your name for the individual rows of table.\n", - " For example, if we're talking about a population of people,\n", - " individuals_name should be \"people\".\n", - " \n", - " For example:\n", - " \n", - " display_icon_array(marbles, [\"surface\", \"size\"], \"marbles\")\n", - " \"\"\"\n", - " display_grouped_icon_array(table.group(groups), individuals_name)\n", - "\n", - "def display_grouped_icon_array(grouped_data, individuals_name):\n", - " \"\"\"\n", - " Given a table with counts for data grouped by 1 or more categories,\n", - " displays an icon array of the groups represented in the table.\n", - " \n", - " grouped_data should be a table of frequencies or counts, such as\n", - " a table created by calling the groups method on some table.\n", - " \n", - " individuals_name is your name for the individual members of the\n", - " dataset. 
For example, if we're talking about a population of\n", - " people, individuals_name should be \"people\".\n", - " \n", - " For example:\n", - " \n", - " display_grouped_icon_array(marbles.group([\"surface\", \"size\"]), \"marbles\")\n", - " \"\"\"\n", - " visualizations.display_combinations(grouped_data, individuals_name=individuals_name)\n", - "\n", - "def display_crosstab_icon_array(crosstabulation, x_label, individuals_name):\n", - " \"\"\"\n", - " Given a crosstabulation table, displays an icon array of the groups\n", - " represented in the table.\n", - " \n", - " crosstabulation should be a table of frequencies or counts created by\n", - " calling pivot on some table.\n", - " \n", - " x_label should be the label of the categories listed as columns (on\n", - " the \"x axis\" when the crosstabulation table is printed).\n", - " \n", - " individuals_name is your name for the individual members of the\n", - " dataset. For example, if we're talking about a population of\n", - " people, individuals_name should be \"people\".\n", - " \n", - " For example:\n", - " \n", - " display_crosstab_icon_array(marbles.pivot(\"surface\", \"size\"), \"surface\", \"marbles\")\n", - " \"\"\"\n", - " display_grouped_icon_array(visualizations.pivot_table_to_groups(crosstabulation, x_label), individuals_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's an icon array of all the marbles, grouped by surface and size:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell.\n", - "display_grouped_icon_array(marbles.group([\"surface\", \"size\"]), \"marble\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You (the marble) should imagine that you are a random draw from these 13 icons.\n", - "\n", - "The following is an icon array of the marbles, grouped **only by their surface (shiny/dull)**." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "display_grouped_icon_array(marbles.group(\"surface\"), \"marble\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Knowing nothing else about yourself, you're equally likely to be any of the marbles pictured.\n", - "\n", - "**Question 2.2.**
What's the probability that you're a shiny marble? Calculate this by hand (using Python for arithmetic) by looking at your icon array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "probability_shiny = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q22.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1. Conditional probability\n", - "\n", - "Suppose you overhear Samantha saying that you're a large marble. (Little-known fact: though marbles cannot see, they can sense sound from surface vibrations.) Does this somehow change the chance that you're shiny? Let's find out.\n", - "\n", - "Go back to the full icon array, displayed below for convenience." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "display_grouped_icon_array(marbles.group([\"surface\", \"size\"]), \"marble\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In question 2.2, we assumed you were equally likely to be any of the marbles, because we didn't know any better. That's why we looked at all the marbles to compute the probability you were shiny.\n", - "\n", - "But assuming you're a large marble, we can eliminate some of these possibilities. In particular, you can't be a small shiny marble or a small dull marble.\n", - "\n", - "You're still equally likely to be any of the remaining marbles, because you don't know anything that says otherwise. 
So here's an icon array of those remaining possibilities:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Just run this cell.\n", - "display_grouped_icon_array(marbles.where(\"size\", \"large\").group(\"surface\"), \"large marble\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.1.1.** What's the probability you're a shiny marble, knowing that you're a large marble? Calculate it by hand, using the icon array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "probability_shiny_given_large = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q211.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You should have found that this is different from the probability that you're a shiny marble (given no size information), which you computed earlier. The distribution of surfaces among the large marbles is a little different from the distribution of surfaces among all the marbles." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.1.2.**
Suppose instead Samantha had said you're a **shiny** marble (hooray!). What's the probability you're large? Make an icon array to help you compute this probability, then compute it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# An icon array to help you compute the answer.\n", - "display_grouped_icon_array(marbles.where(\"surface\", \"shiny\").group(\"size\"), \"shiny marbles\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Now compute the answer.\n", - "probability_large_given_shiny = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q212.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.1.3.**
Can you answer the last two questions just by looking at the full icon array? (You can run the cell below to see it again.) If you can, how? If not, why not? Check with your lab peers to see if you are on the right track. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Just run this cell. The next cell is where you should write your answer.\n", - "display_grouped_icon_array(marbles.group([\"surface\", \"size\"]), \"marble\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Write your answer here, replacing this text.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Cancer screening\n", - "Now let's look at a much more realistic application.\n", - "\n", - "### Background\n", - "Medical tests are an important but surprisingly controversial topic. For years, women have been advised to get regular mammograms (tests for breast cancer). Today, there is controversy over whether the tests are useful at all.\n", - "\n", - "Part of the problem with such tests is that they are not perfectly reliable. Someone without cancer, or with only a benign form of cancer, can see a positive result on a test for cancer. Someone with cancer can receive a negative result. (\"Positive\" means \"pointing toward cancer,\" so in this context it's bad!) Doctors and patients often deal poorly with the first case, called *false positives*. For example, a patient may receive dangerous treatment like chemotherapy or radiation despite having no cancer or, as happens more frequently, having a cancer that would not have impacted her health.\n", - "\n", - "Conditional probability is a good way to think about such situations. For example, you can compute the chance that you have cancer, given the result of a test, by combining information from different probability distributions. 
You'll see that the chance you have cancer can be far from 100% even if you have a positive test result from a test that is usually accurate." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3.1. Basic cancer statistics\n", - "Suppose that, in a representative group of 10,000 people who are tested for cancer (\"representative\" meaning that the frequencies of different things are the same as the frequencies in the whole population):\n", - "1. 100 have cancer.\n", - "2. Among those 100, 90 have positive results on a cancer test and 10 have negative results. (\"Negative\" means \"not pointing toward cancer.\")\n", - "3. The other 9,900 don't have cancer.\n", - "4. Among these, 198 have positive results on a cancer test and the other 9,702 have negative results. (So 198 see \"false positive\" results.)\n", - "\n", - "Below we've generated a table with data from these 10,000 hypothetical people." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "people = Table().with_columns(\n", - " \"cancer status\", [\"sick\", \"sick\", \"healthy\", \"healthy\"],\n", - " \"test status\", [\"positive\", \"negative\", \"positive\", \"negative\"],\n", - " \"count\", [90, 10, 198, 9702])\n", - "people" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One way to visualize this dataset is with a contingency table, which you've seen before.\n", - "\n", - "**Question 3.1.1.**
Create a contingency table that looks like this:\n", - "\n", - "|cancer status|negative|positive|\n", - "|-|-|-|\n", - "|sick|||\n", - "|healthy||||\n", - "\n", - "...with the **count** of each group filled in, according to what we've told you above. The counts in the 4 boxes should sum to 10,000.\n", - "\n", - "*Hint:* Use `pivot` with the `sum` function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "cancer = ...\n", - "cancer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q311.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is the `people` data in an icon array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's think about how you can use this kind of information when you're tested for cancer.\n", - "\n", - "Before you know any information about yourself, you could imagine yourself as a **uniform random sample** of one of the 10,000 people in this imaginary population of people who have been tested.\n", - "\n", - "What's the chance that you have cancer, knowing nothing else about yourself? It's $\\frac{100}{10000}$, or 1%. We can see that more directly with this icon array:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "by_health = people.select(0, 2).group(0, sum).relabeled(1, 'count')\n", - "display_grouped_icon_array(by_health, \"people who've taken a cancer test\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.1.3.**
What's the chance that you have a positive test result, knowing nothing else about yourself?\n", - "\n", - "*Hint:* Make an icon array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "#First, make an icon table similar to the one above\n", - "#by_test should be almost the same thing as by_health above\n", - "#The only difference is the columns we need from the people table\n", - "by_test = ...\n", - "display_grouped_icon_array(by_test, \"people who've taken a cancer test\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Fill in the probability of having a positive test result.\n", - "probability_positive_test = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q313.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3.2. Interpreting test results\n", - "Suppose you have a positive test result. This means you can now narrow yourself down to being part of one of two groups:\n", - "1. The people with cancer who have a positive test result.\n", - "2. The people without cancer who have a positive test result.\n", - "\n", - "Here's an icon array for those two groups:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Just run this cell.\n", - "display_grouped_icon_array(people.where(\"test status\", are.equal_to(\"positive\")).drop(1), \"people who have a positive test result\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The *conditional probability* that you have cancer *given* your positive test result is the chance that you're in the first group, assuming you're in one of these two groups.\n", - "\n", - "**Question 3.2.1.**
Eyeballing it, is the conditional probability that you have cancer given your positive test result closest to:\n", - "\n", - "9/10\n", - "\n", - "2/3\n", - "\n", - "1/2\n", - "\n", - "1/3\n", - "\n", - "1/100\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Set this to one of the probabilities above.\n", - "rough_prob_sick_given_positive = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q321.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.2.2.**
Now write code to calculate that probability exactly, using the original contingency table you wrote (the `cancer` table)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "#Run this cell to take another look at the cancer table\n", - "cancer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "prob_sick_given_positive = ...\n", - "prob_sick_given_positive" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q322.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.2.3.**
Look at the full icon array again. Using that, how would you compute (roughly) the conditional probability of cancer given a positive test?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# The full icon array is given here for your convenience.\n", - "# Write your answer in the next cell.\n", - "display_grouped_icon_array(people, \"people who've taken a cancer test\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Write your answer here, replacing this text.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.2.4.**
Is your answer to question 3.2.2 bigger than the overall proportion of people in the population who have cancer? Does that make sense? Check with your peers in lab to see if you have the right idea. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Write your answer here, replacing this text.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. Tree diagrams\n", - "A tree diagram is another useful visualization for conditional probability. It is easiest to draw a tree diagram when the probabilities are presented in a slightly different way. For example, people often summarize the information in your `cancer` table using 3 numbers:\n", - "\n", - "1. The overall probability of having cancer is **`p_cancer`**. (This is called the *base rate* or *marginal probability* of the disease.)\n", - "2. Given that you have cancer, the probability of a positive test result is **`p_pos_given_cancer`**. (This is called the *sensitivity* of the test. Higher values of `p_pos_given_cancer` mean the test is more useful.)\n", - "3. Given that you don't have cancer, the probability of a positive test result is **`p_pos_given_nocancer`**. (This is called the *false positive rate* of the test. Higher values of `p_pos_given_nocancer` mean the test is less useful.)\n", - "\n", - "You already saw that the base rate of cancer was .01 in the previous section. `p_pos_given_cancer` and `p_pos_given_nocancer` can be computed using the same method you used to compute the conditional probability of cancer given a positive test result.\n", - "\n", - "Use the tree diagram below and think about \n", - "\n", - "This corresponds to this tree diagram:\n", - "\n", - " / \\\n", - " .99 / \\ .01\n", - " / \\ \n", - " / \\\n", - " no cancer cancer\n", - " / \\ / \\\n", - " / \\ / \\\n", - " / \\ / \\\n", - " + - + -\n", - "\n", - "**Question 4.1.**
Compute `p_pos_given_cancer` and `p_pos_given_nocancer` for the data in section 3. \n", - "\n", - "**Use Bayes Rule**\n", - "\n", - "You can read about a technique called Bayes Rule in the [course textbook](https://www.inferentialthinking.com/chapters/18/1/more-likely-than-not-binary-classifier.html#Bayes%27-Rule)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Hint: You may find these two tables useful:\n", - "has_cancer = cancer.where(\"cancer status\", are.equal_to(\"sick\"))\n", - "no_cancer = cancer.where(\"cancer status\", are.equal_to(\"healthy\"))\n", - "\n", - "p_cancer = .01\n", - "p_pos_given_cancer = ...\n", - "p_pos_given_nocancer = ...\n", - "\n", - "print('Probability of Cancer:', p_cancer, '\\nProbability of a positive test given no cancer:', p_pos_given_cancer, \n", - " '\\nProbability of a positive test given no cancer:', p_pos_given_nocancer)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q41.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you are done...\n", - "- **Save and Checkpoint** from the `File` menu." 
- ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - }, - "timetravel": { - "allowedContentTypes": [ - "text/plain" - ], - "enabled": true, - "version": "1.0" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/3/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb b/materials/x18/lab/3/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb deleted file mode 100644 index 58f5ad1..0000000 --- a/materials/x18/lab/3/lab01/.ipynb_checkpoints/lab01-checkpoint.ipynb +++ /dev/null @@ -1,772 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 1: Variance of Sample Means and Correlation\n", - "\n", - "Welcome to Lab 1 and Data 8.3x!\n", - "\n", - "In this lab we will learn about [the variance of sample means](https://www.inferentialthinking.com/chapters/14/5/variability-of-the-sample-mean.html) as well as ways to understand and quantify [the association between two variables](https://www.inferentialthinking.com/chapters/15/1/correlation.html)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell, but please don't change it.\n", - "\n", - "# These lines import the Numpy and Datascience modules.\n", - "import numpy as np\n", - "from datascience import *\n", - "\n", - "# These lines do some fancy plotting magic.\n", - "import matplotlib\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plots\n", - "plots.style.use('fivethirtyeight')\n", - "import warnings\n", - "warnings.simplefilter('ignore', FutureWarning)\n", - "warnings.simplefilter('ignore', UserWarning)\n", - "\n", - "# These lines load the tests.\n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. How Faithful is Old Faithful? \n", - "\n", - "(Note: clever title comes from [here](http://web.pdx.edu/~jfreder/M212/oldfaithful.pdf).)\n", - "\n", - "Old Faithful is a geyser in Yellowstone National Park in the central United States. It's famous for erupting on a fairly regular schedule. You can see a video below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For the curious: this is how to display a YouTube video in a\n", - "# Jupyter notebook. The argument to YouTubeVideo is the part\n", - "# of the URL (called a \"query parameter\") that identifies the\n", - "# video. For example, the full URL for this video is:\n", - "# https://www.youtube.com/watch?v=wE8NDuzt8eg\n", - "from IPython.display import YouTubeVideo\n", - "YouTubeVideo(\"wE8NDuzt8eg\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Some of Old Faithful's eruptions last longer than others. 
When it has a long eruption, there's generally a longer wait until the next eruption.\n", - "\n", - "If you visit Yellowstone, you might want to predict when the next eruption will happen, so you can see the rest of the park and come to see the geyser when it erupts. To predict one variable from another, the first step is to understand the association between them.\n", - "\n", - "The dataset has one row for each observed eruption. It includes the following columns:\n", - "- **duration**: Eruption duration, in minutes\n", - "- **wait**: Time between this eruption and the next, also in minutes\n", - "\n", - "Run the next cell to load the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "faithful = Table.read_table(\"faithful.csv\")\n", - "faithful" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.1 **\n", - "
\n", - "Make a scatter plot of the data. It's conventional to put the column we will try to predict on the vertical axis and the other column on the horizontal axis." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look at the scatter plot. Does the association between wait times and eruption durations appear to be linear? \n", - "\n", - "There's more going on than just a linear association. The eruption durations seem to cluster; there are a bunch of short eruptions and a bunch of longer ones. Within each of the clusters, these values appear to be roughly linearly correlated, but perhaps with a different correlation coefficient.\n", - "\n", - "The overall relationship is positive, which means that longer eruptions have longer waiting times. Even when the association is more nuanced than a simple linear association, we can still compute the correlation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we'll plot the data in standard units. Recall that, if `nums` is an array of numbers, then\n", - "\n", - " (nums - np.mean(nums)) / np.std(nums)\n", - "\n", - "is an array of those numbers in standard units." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.2 **\n", - "
\n", - "Compute the mean and standard deviation of the eruption durations and waiting times. **Then** create a table called `faithful_standard` containing the eruption durations and waiting times in standard units. (The columns should be named `\"duration (standard units)\"` and `\"wait (standard units)\"`.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "duration_mean = ...\n", - "duration_std = ...\n", - "wait_mean = ...\n", - "wait_std = ...\n", - "\n", - "faithful_standard = Table().with_columns(\n", - " \"duration (standard units)\", ...,\n", - " \"wait (standard units)\", ...)\n", - "faithful_standard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.3 **\n", - "
\n", - "Plot the data again, but this time in standard units." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You'll notice that this plot looks exactly the same as the last one! The data really are different, but the axes are scaled differently. (The method `scatter` scales the axes so the data fill up the available space.) So it's important to read the ticks on the axes." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.4 **\n", - "Among the following numbers, which would you guess is closest to the correlation between eruption duration and waiting time in this dataset?\n", - "\n", - "* -1\n", - "* 0\n", - "* 1\n", - "\n", - "Assign your answer to `closest_correlation`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "closest_correlation = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.5 **\n", - "
\n", - "Compute the correlation `r`. *Hint:* Use `faithful_standard`. Section [15.1](https://www.inferentialthinking.com/chapters/15/1/correlation.html) explains how to do this." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "r = ...\n", - "r" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Variability of the Sample Mean\n", - "\n", - "By the Central Limit Theorem, the probability distribution of the mean of a large random sample is roughly normal. The bell curve is centered at the population mean. Some of the sample means are higher, and some lower, but the deviations from the population mean are roughly symmetric on either side, as we have seen repeatedly. Formally, probability theory shows that the sample mean is an unbiased estimate of the population mean.\n", - "\n", - "In our simulations, we also noticed that the means of larger samples tend to be more tightly clustered around the population mean than means of smaller samples. In this section, we will quantify the variability of the sample mean and develop a relation between the variability and the sample size.\n", - "\n", - "Let's take a look at the salaries of employees of the City of San Francisco in 2014. The mean salary reported by the city government was about $75463.92." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "salaries = Table.read_table('sf_salaries_2014.csv').select(\"salary\")\n", - "salaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "salary_mean = np.mean(salaries.column('salary'))\n", - "salary_mean" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "salaries.hist('salary', bins=np.arange(0, 300000+10000*2, 10000))\n", - "plots.scatter(salary_mean, 0, marker='^', color='red', s=100);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 2.1 **\n", - "
\n", - "Clearly, the population does not follow a normal distribution. Keep that in mind as we progress through these exercises.\n", - "\n", - "Let's take random samples and look at the probability distribution of the sample mean. As usual, we will use simulation to get an empirical approximation to this distribution.\n", - "\n", - "We will define a function `simulate_sample_mean` to do this, because we are going to vary the sample size later. The arguments are the name of the table, the label of the column containing the variable, the sample size, and the number of simulations.\n", - "\n", - "Complete the function `simulate_sample_mean`. It will not be graded, but if you haven't implemented it correctly, the rest of the lab won't work properly, so this step is crucial." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "\"\"\"Empirical distribution of random sample means\"\"\"\n", - "\n", - "def simulate_sample_mean(table, label, sample_size, repetitions):\n", - " \n", - " means = []\n", - "\n", - " for i in np.arange(repetitions):\n", - " new_sample = ...\n", - " new_sample_mean = ...\n", - " ...\n", - "\n", - " sample_means = Table().with_column('Sample Means', means)\n", - " \n", - " # Display empirical histogram and print all relevant quantities – don't change this!\n", - " sample_means.hist(bins=20)\n", - " plots.xlabel('Sample Means')\n", - " plots.title('Sample Size ' + str(sample_size))\n", - " print(\"Sample size: \", sample_size)\n", - " print(\"Population mean:\", np.mean(table.column(label)))\n", - " print(\"Average of sample means: \", np.mean(means))\n", - " print(\"Population SD:\", np.std(table.column(label)))\n", - " print(\"SD of sample means:\", np.std(means))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 2.2 **\n", - "
\n", - "In the following cell, we will create a sample of size 100 from the salaries table and graph it using our new `simulate_sample_mean` function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 10000) \n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the following two cells, simulate the mean of a random sample of 400 salaries and 625 salaries, respectively. In each case, perform 10,000 repetitions of each of these processes. Don't worry about the `plots.xlim` line – it just makes sure that all of the plots have the same x-axis. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "simulate_sample_mean(..., ..., ..., ...)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "simulate_sample_mean(..., ..., ..., ...)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see the Central Limit Theorem in action – the histograms of the sample means are roughly normal, even though the histogram of the salaries themselves is far from normal.\n", - "\n", - "We can also see that each of the three histograms of the sample means is centered very close to the population mean. In each case, the \"average of sample means\" is very close to the population mean. Both values are provided in the printout above each histogram. As expected, the sample mean is an unbiased estimate of the population mean." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 2.3 **\n", - "
\n", - "Below, we'll look at what happens when we take a fixed sample, then bootstrap from it with different numbers of resamples. How does the distribution of the resampled means change?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 1000)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 5000)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 10000)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Assign the variable `bootstrap_sampled_SD` to the integer corresponding to your answer to the following question:\n", - "\n", - "When I increase the number of bootstrap samples that I take, for a fixed sample size, the SD of my sample mean will...\n", - "\n", - "1. Increase\n", - "2. Decrease\n", - "3. Stay about the same\n", - "4. Vary widly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "bootstrap_sampled_SD = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below, we'll look at what happens when we take a fixed sample, then bootstrap from it with different numbers of resamples. How does the distribution of the resampled means change?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 500)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 1000)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 5000)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 10000)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What did you notice about the sample means of the four bootstrapped samples above?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 2.4 **\n", - "
\n", - "Next, let's think about how the relationships between population SD, sample SD, and SD of sample means change with varying sample size. Which of the following is true? Again, assign the variable `pop_vs_sample` to the integer corresponding to your answer. To gain some intuition, you can run the simulation cells below.\n", - "\n", - "1. Sample SD gets smaller with increasing sample size, SD of sample means gets smaller with increasing sample size\n", - "2. Sample SD gets larger with increasing sample size, SD of sample means stays the same with increasing sample size\n", - "3. Sample SD becomes more consistent with population SD with increasing sample size, SD of sample means gets smaller with increasing sample size\n", - "4. Sample SD becomes more consistent with populatoin SD with increasing sample size, SD of smaple means stays the same with increasing sample size" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "pop_vs_sample = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see what happens: First, we calculate the population SD so that we can compare the SD of each sample to the SD of the population." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "pop_sd = np.std(salaries.column(\"salary\"))\n", - "pop_sd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's then how a small sample behaves. Run the following cells multiple times to see how the SD of the sample changes from sample to sample. Adjust the bins as necessary." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sample_10 = salaries.sample(10)\n", - "sample_10.hist(\"salary\")\n", - "print(\"Sample SD: \", np.std(sample_10.column(\"salary\")))\n", - "simulate_sample_mean(sample_10, 'salary', 10, 1000)\n", - "plots.xlim(5,120000)\n", - "plots.ylim(0, .0001);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sample_200 = salaries.sample(200)\n", - "sample_200.hist(\"salary\")\n", - "print(\"Sample SD: \", np.std(sample_200.column(\"salary\")))\n", - "simulate_sample_mean(sample_200, 'salary', 200, 1000)\n", - "plots.xlim(5,100000)\n", - "plots.ylim(0, .00015);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sample_1000 = salaries.sample(1000)\n", - "sample_1000.hist(\"salary\")\n", - "print(\"Sample SD: \", np.std(sample_1000.column(\"salary\")))\n", - "simulate_sample_mean(sample_1000, 'salary', 1000, 1000)\n", - "plots.xlim(5,100000)\n", - "plots.ylim(0, .00025);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's illustrate this trend. Below, you will see how the average absolute error of SD from the population changes with sample size (N)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Don't change this cell, just run it!\n", - "sample_n_errors = make_array()\n", - "for i in np.arange(10, 200, 10):\n", - " sample_n_errors = np.append(sample_n_errors, np.average([abs(np.std(salaries.sample(i).column(\"salary\"))-pop_sd)\n", - " for d in np.arange(100)]))\n", - "Table().with_column(\"Average absolute error in SD\", sample_n_errors, \"N\", np.arange(10, 200, 10)).plot(\"N\", \"Average absolute error in SD\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You should notice that the distribution of means gets spiker, and that the distribution of the sample increasingly looks like the distribution of the population as we get to larger sample sizes. \n", - "\n", - "Is there a relationship between the sample size and absolute error in standard deviation? Identify this relationship – if you're having trouble, take a look at this [section](https://www.inferentialthinking.com/chapters/14/5/variability-of-the-sample-mean.html) in our textbook about the variability of sample means." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submission" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You're finished with lab 1! In order to successfully submit your assignment, follow these steps...\n", - "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. Please do this first before running the cell below,\n", - "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", - "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab01.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/3/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb b/materials/x18/lab/3/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb deleted file mode 100644 index 9742ba3..0000000 --- a/materials/x18/lab/3/lab02/.ipynb_checkpoints/lab02-checkpoint.ipynb +++ /dev/null @@ -1,716 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 2: Regression\n", - "\n", - "Welcome to Lab 2 of Data 8.3x!\n", - "\n", - "Today we will get some hands-on practice with linear regression. You can find more information about this topic in\n", - "[section 15.2](https://www.inferentialthinking.com/chapters/15/2/Regression_Line)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell, but please don't change it.\n", - "\n", - "# These lines import the Numpy and Datascience modules.\n", - "import numpy as np\n", - "from datascience import *\n", - "\n", - "# These lines do some fancy plotting magic.\n", - "import matplotlib\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plots\n", - "plots.style.use('fivethirtyeight')\n", - "import warnings\n", - "warnings.simplefilter('ignore', FutureWarning)\n", - "warnings.simplefilter('ignore', UserWarning)\n", - "\n", - "# These lines load the tests.\n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. How Faithful is Old Faithful? Revisited\n", - "\n", - "Let's revisit a question from lab 1. Last lab, we investigated Old Faithful, a geyser in Yellowstone National Park in the central United States. It's famous for erupting on a fairly regular schedule.\n", - "\n", - "To recap, some of Old Faithful's eruptions last longer than others. Today, we will use the same dataset on eruption durations and waiting times to see if we can make predict the wait time from the eruption duration using linear regression.\n", - "\n", - "The dataset has one row for each observed eruption. It includes the following columns:\n", - "- **duration**: Eruption duration, in minutes\n", - "- **wait**: Time between this eruption and the next, also in minutes\n", - "\n", - "Run the next cell to load the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "faithful = Table.read_table(\"faithful.csv\")\n", - "faithful" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Remember from last lab that we concluded eruption time and waiting time are positively correlated. 
The table below called `faithful_standard` contains the eruption durations and waiting times in standard units." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "duration_mean = np.mean(faithful.column(\"duration\"))\n", - "duration_std = np.std(faithful.column(\"duration\"))\n", - "wait_mean = np.mean(faithful.column(\"wait\"))\n", - "wait_std = np.std(faithful.column(\"wait\"))\n", - "\n", - "faithful_standard = Table().with_columns(\n", - " \"duration (standard units)\", (faithful.column(\"duration\") - duration_mean) / duration_std,\n", - " \"wait (standard units)\", (faithful.column(\"wait\") - wait_mean) / wait_std\n", - ")\n", - "faithful_standard" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next cell computes the correlation `r`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "r = np.mean(faithful_standard.column(0) * faithful_standard.column(1))\n", - "r" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
The regression line\n", - "The correlation coefficient is the slope of the regression line when the data are expressed in standard units.\n", - "\n", - "The next cell plots the regression line in standard units:\n", - "\n", - "$$\\text{waiting time (standard units)} = r \\times \\text{eruption duration (standard units)}.$$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def plot_data_and_line(dataset, x, y, point_0, point_1):\n", - " \"\"\"Makes a scatter plot of the dataset, along with a line passing through two points.\"\"\"\n", - " dataset.scatter(x, y, label=\"data\")\n", - " xs, ys = zip(point_0, point_1)\n", - " plots.plot(xs, ys, label=\"regression line\")\n", - " plots.legend(bbox_to_anchor=(1.5,.8))\n", - "\n", - "plot_data_and_line(faithful_standard, \n", - " \"duration (standard units)\", \n", - " \"wait (standard units)\", \n", - " [-2, -2*r], \n", - " [2, 2*r])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "How would you take a point in standard units and convert it back to original units? We'd have to \"stretch\" its horizontal position by `duration_std` and its vertical position by `wait_std`.\n", - "\n", - "That means the same thing would happen to the slope of the line.\n", - "\n", - "Stretching a line horizontally makes it less steep, so we divide the slope by the stretching factor. Stretching a line vertically makes it more steep, so we multiply the slope by the stretching factor.\n", - "\n", - "** Question 2.1 **
\n", - "What is the slope of the regression line in original units?\n", - "\n", - "(If the \"stretching\" explanation is unintuitive, consult section [15.2](https://www.inferentialthinking.com/chapters/15/2/Regression_Line) in the textbook.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "slope = ...\n", - "slope" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We know that the regression line passes through the point `(duration_mean, wait_mean)`. You might recall from high-school algebra that the equation for the line is therefore:\n", - "\n", - "$$\\text{waiting time} - \\verb|wait_mean| = \\texttt{slope} \\times (\\text{eruption duration} - \\verb|duration_mean|)$$\n", - "\n", - "After rearranging that equation slightly, the intercept turns out to be:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "intercept = slope*(-duration_mean) + wait_mean\n", - "intercept" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q2_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Investigating the regression line\n", - "The slope and intercept tell you exactly what the regression line looks like. To predict the waiting time for an eruption, multiply the eruption's duration by `slope` and then add `intercept`.\n", - "\n", - "** Question 3.1 **
\n", - "Compute the predicted waiting time for an eruption that lasts 2 minutes, and for an eruption that lasts 5 minutes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "two_minute_predicted_waiting_time = ...\n", - "five_minute_predicted_waiting_time = ...\n", - "\n", - "# Here is a helper function to print out your predictions\n", - "# (you don't need to modify it):\n", - "def print_prediction(duration, predicted_waiting_time):\n", - " print(\"After an eruption lasting\", duration,\n", - " \"minutes, we predict you'll wait\", predicted_waiting_time,\n", - " \"minutes until the next eruption.\")\n", - "\n", - "print_prediction(2, two_minute_predicted_waiting_time)\n", - "print_prediction(5, five_minute_predicted_waiting_time)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next cell plots the line that goes between those two points, which is (a segment of) the regression line." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], - "source": [ - "plot_data_and_line(faithful, \"duration\", \"wait\", \n", - " [2, two_minute_predicted_waiting_time], \n", - " [5, five_minute_predicted_waiting_time])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 3.2 **
\n", - "Make predictions for the waiting time after each eruption in the `faithful` table. (Of course, we know exactly what the waiting times were! We are doing this so we can see how accurate our predictions are.) Put these numbers into a column in a new table called `faithful_predictions`. Its first row should look like this:\n", - "\n", - "|duration|wait|predicted wait|\n", - "|-|-|-|\n", - "|3.6|79|72.1011|\n", - "\n", - "*Hint:* Your answer can be just one line. There is no need for a `for` loop; use array arithmetic instead." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "faithful_predictions = ...\n", - "faithful_predictions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 3.3 **
\n", - "How close were we? We computed the *residual* for each eruption in the dataset. The residual is the difference (not the absolute difference) between the actual waiting time and the predicted waiting time. Add the residuals to `faithful_predictions` as a new column called `\"residual\"`, naming the resulting table `faithful_residuals`.\n", - "\n", - "*Hint:* Again, your code will be much simpler if you don't use a `for` loop." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "residual = faithful_predictions.column(1) - faithful_predictions.column(2)\n", - "faithful_residuals = ...\n", - "faithful_residuals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q3_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is a plot of the residuals you computed. Each point corresponds to one eruption. It shows how much our prediction over- or under-estimated the waiting time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "faithful_residuals.scatter(\"duration\", \"residual\", color=\"r\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There isn't too much of a pattern in the residuals, which confirms that it's reasonable to use linear regression for prediction. It's true that there are two separate clouds; the eruption durations seemed to fall into two distinct clusters. But that's just a pattern in the eruption durations, not a pattern in the relationship between eruption durations and waiting times. A larger concern is that there may be more positive than negative residuals in a particular region of the horizontal axis. 
For both clusters, the points are distributed fairly evenly above and below zero, which is a confirmation that the association is mostly linear." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. How accurate are different predictions?\n", - "The correlation coefficient is close to 1, implying that the observed values are tightly clustered around the regression line. The residuals are overall small (close to 0) in comparison to the waiting times.\n", - "\n", - "We can see that visually by plotting the waiting times and residuals together:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "faithful_residuals.scatter(\"duration\", \"wait\", label=\"actual waiting time\", color=\"blue\")\n", - "plots.scatter(faithful_residuals.column(\"duration\"), faithful_residuals.column(\"residual\"), label=\"residual\", color=\"r\")\n", - "plots.plot([2, 5], [two_minute_predicted_waiting_time, five_minute_predicted_waiting_time], label=\"regression line\")\n", - "plots.legend(bbox_to_anchor=(1.7,.8));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 4.1 **
\n", - "In `faithful`, no eruption lasted exactly 0, 2.5, or 60 minutes. Using this line, what is the predicted waiting time for an eruption that lasts 0 minutes? 2.5 minutes? An hour?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "zero_minute_predicted_waiting_time = ...\n", - "two_point_five_minute_predicted_waiting_time = ...\n", - "hour_predicted_waiting_time = ...\n", - "\n", - "print_prediction(0, zero_minute_predicted_waiting_time)\n", - "print_prediction(2.5, two_point_five_minute_predicted_waiting_time)\n", - "print_prediction(60, hour_predicted_waiting_time)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q4_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Divide and Conquer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see what happens if we treat the two clusters of observations differently. It appears from the scatter diagram that there are two clusters of points: one for durations around 2 and another for durations between 3.5 and 5. A vertical line at 3 divides the two clusters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "faithful.scatter(\"duration\", \"wait\", label=\"actual waiting time\", color=\"blue\")\n", - "plots.plot([3, 3], [40, 100]);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `standardize` function from lecture appears below, which returns a table of values in standard units." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def standard_units(any_numbers):\n", - " \"Convert any array of numbers to standard units.\"\n", - " return (any_numbers - np.mean(any_numbers)) / np.std(any_numbers) \n", - "\n", - "def standardize(t):\n", - " \"\"\"Return a table in which all columns of t are converted to standard units.\"\"\"\n", - " t_su = Table()\n", - " for label in t.labels:\n", - " t_su = t_su.with_column(label + ' (su)', standard_units(t.column(label)))\n", - " return t_su" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.1**
\n", - "Separately compute the regression coefficients *r* for all the points with a duration below 3 **and then** for all the points with a duration above 3. To do so, create a function that computes `r` from a table and pass it two different tables of points, `below_3` and `above_3`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def reg_coeff(t):\n", - " \"\"\"Return the regression coefficient for columns 0 & 1.\"\"\"\n", - " t_su = standardize(t)\n", - " ...\n", - "\n", - "below_3 = ...\n", - "above_3 = ...\n", - "below_3_r = reg_coeff(below_3)\n", - "above_3_r = reg_coeff(above_3)\n", - "print(\"For points below 3, r is\", below_3_r, \"; for points above 3, r is\", above_3_r)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.2**
\n", - "Write functions `slope_of` and `intercept_of` below. \n", - "\n", - "When you're done, the functions `wait_below_3` and `wait_above_3` should each use a different regression line to predict a wait time for a duration. The first function should use the regression line for all points with duration below 3.2. The second function should use the regression line for all points with duration above 3.2." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def slope_of(t, r):\n", - " \"\"\"Return the slope of the regression line for t in original units.\n", - " \n", - " Assume that column 0 contains x values and column 1 contains y values.\n", - " r is the regression coefficient for x and y.\n", - " \"\"\"\n", - " ...\n", - "\n", - "def intercept_of(t, r):\n", - " \"\"\"Return the slope of the regression line for t in original units.\"\"\"\n", - " s = slope_of(t, r)\n", - " ...\n", - " \n", - "below_3_a = slope_of(below_3, below_3_r)\n", - "below_3_b = intercept_of(below_3, below_3_r)\n", - "above_3_a = slope_of(above_3, above_3_r)\n", - "above_3_b = intercept_of(above_3, above_3_r)\n", - "\n", - "def wait_below_3(duration):\n", - " return below_3_a * duration + below_3_b\n", - "\n", - "def wait_above_3(duration):\n", - " return above_3_a * duration + above_3_b" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The plot below shows two different regression lines, one for each cluster!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "faithful.scatter(0, 1)\n", - "plots.plot([1, 3], [wait_below_3(1), wait_below_3(3)])\n", - "plots.plot([3, 6], [wait_above_3(3), wait_above_3(6)]);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 5.3**
\n", - "Write a function `predict_wait` that takes a `duration` and returns the predicted wait time using the appropriate regression line, depending on whether the duration is below 3 or greater than (or equal to) 3." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "def predict_wait(duration):\n", - " \"\"\"Return the wait predicted by the appropriate one of the two regression lines above.\"\"\"\n", - " ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q5_3.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The predicted wait times for each point appear below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "faithful.with_column('predicted', faithful.apply(predict_wait, 'duration')).scatter(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Further Exploration (ungraded)**: When drawing a line through each cluster separately, we discovered two different but similar lines. Here are some natural questions to explore, if you want to continue working with these data:\n", - " * How much more accurate do we expect predictions to be using two lines instead of one? Can we measure this improvement using residuals?\n", - " * Are the lines really different, or did they just come out different due to chance because we have only a small number of observations? How could we tell?\n", - " * Could it be that the slopes of the lines are the same, but the intercepts are different? " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submission" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You're finished with lab 2! 
In order to successfully submit your assignment, follow these steps...\n", - "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. Please do this first before running the cell below,\n", - "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", - "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab02.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/materials/x18/lab/3/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb b/materials/x18/lab/3/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb deleted file mode 100644 index 669974b..0000000 --- a/materials/x18/lab/3/lab03/.ipynb_checkpoints/lab03-checkpoint.ipynb +++ /dev/null @@ -1,601 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lab 3: Regression Inference\n", - "\n", - "Welcome to Lab 3 of Data 8.3x!\n", - "\n", - "Sometimes, the primary purpose of regression analysis is to learn something about the slope or intercept of the best-fitting line. 
When we use a sample of data to estimate the slope or intercept, our estimate is subject to random error, just as in the simpler case of the mean of a random sample.\n", - "\n", - "In this lab, we'll use regression to get an accurate estimate for the age of the universe, using pictures of exploding stars. Our estimate will come from a sample of all exploding stars. We'll compute a confidence interval to quantify the error caused by sampling." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell, but please don't change it.\n", - "\n", - "# These lines import the Numpy and Datascience modules.\n", - "import numpy as np\n", - "from datascience import *\n", - "\n", - "# These lines do some fancy plotting magic\n", - "import matplotlib\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "plt.style.use('fivethirtyeight')\n", - "import warnings\n", - "warnings.simplefilter('ignore', FutureWarning)\n", - "warnings.simplefilter('ignore', UserWarning)\n", - "from matplotlib import patches\n", - "from ipywidgets import interact, interactive, fixed\n", - "import ipywidgets as widgets\n", - "\n", - "# These lines load the tests.\n", - "from gofer.ok import check" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. The Age of the Universe\n", - "\n", - "### The Actual Big Bang Theory\n", - "In the early 20th century, the most popular cosmological theory suggested that the universe had always existed at a fixed size. Today, the Big Bang theory prevails: Our universe started out very small and is still expanding.\n", - "\n", - "A consequence of this is Hubble's Law, which states that every celestial object that's reasonably far away from Earth (for example, another galaxy) is moving away from us at a constant speed. 
If we extrapolate that motion backwards to the time when everything in the universe was in the same place, that time is (roughly) the beginning of the universe!\n", - "\n", - "Scientists have used this fact, along with measurements of the current *location* and *movement speed* of other celestial objects, to estimate when the universe started.\n", - "\n", - "The cell below simulates a universe in which our sun is the center and every other star is moving away from us. Each star starts at the same place as the sun, then moves away from it over time. Different stars have different directions *and speeds*; the arrows indicate the direction and speed of travel.\n", - "\n", - "Run the cell, then move the slider to see how things change over time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Just run this cell. (The simulation is actually not\n", - "# that complicated; it just takes a lot of code to draw\n", - "# everything. 
So you don't need to read this unless you\n", - "# have time and are curious about more advanced plotting.)\n", - "\n", - "num_locations = 15\n", - "example_velocities = Table().with_columns(\n", - " \"x\", np.random.normal(size=num_locations),\n", - " \"y\", np.random.normal(size=num_locations))\n", - "start_of_time = -2\n", - "\n", - "def scatter_after_time(t, start_of_time, end_of_time, velocities, center_name, other_point_name, make_title):\n", - " max_location = 1.1*(end_of_time-start_of_time)*max(max(abs(velocities.column(\"x\"))), max(abs(velocities.column(\"y\"))))\n", - " new_locations = velocities.with_columns(\n", - " \"x\", (t-start_of_time)*velocities.column(\"x\"),\n", - " \"y\", (t-start_of_time)*velocities.column(\"y\"))\n", - " plt.scatter(make_array(0), make_array(0), label=center_name, s=100, c=\"yellow\")\n", - " plt.scatter(new_locations.column(\"x\"), new_locations.column(\"y\"), label=other_point_name)\n", - " for i in np.arange(new_locations.num_rows):\n", - " plt.arrow(\n", - " new_locations.column(\"x\").item(i),\n", - " new_locations.column(\"y\").item(i),\n", - " velocities.column(\"x\").item(i),\n", - " velocities.column(\"y\").item(i),\n", - " fc='black',\n", - " ec='black',\n", - " head_width=0.025*max_location,\n", - " lw=.15)\n", - " plt.xlim(-max_location, max_location)\n", - " plt.ylim(-max_location, max_location)\n", - " plt.gca().set_aspect('equal', adjustable='box')\n", - " plt.gca().set_position(make_array(0, 0, 1, 1))\n", - " plt.legend(bbox_to_anchor=(1.6, .7))\n", - " plt.title(make_title(t))\n", - " plt.show()\n", - "\n", - "interact(\n", - " scatter_after_time,\n", - " t=widgets.FloatSlider(min=start_of_time, max=5, step=.05, value=0, msg_throttle=1),\n", - " start_of_time=fixed(start_of_time),\n", - " end_of_time=fixed(5),\n", - " velocities=fixed(example_velocities),\n", - " center_name=fixed(\"our sun\"),\n", - " other_point_name=fixed(\"other star\"),\n", - " make_title=fixed(lambda t: \"The world {:01g} year{} in the 
{}\".format(abs(t), \"\" if abs(t) == 1 else \"s\", \"past\" if t < 0 else \"future\")));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Analogy: driving\n", - "Here's an analogy to illustrate how scientists use information about stars to estimate the age of the universe.\n", - "\n", - "Suppose that at some point in the past, our friend Mei started driving in a car going at a steady speed of 60 miles per hour straight east. We're still standing where she started." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Run this cell to see a picture of Mei's locations over time.\n", - "\n", - "mei_velocity = Table().with_columns(\"x\", make_array(60), \"y\", make_array(0))\n", - "interact(\n", - " scatter_after_time,\n", - " t=widgets.FloatSlider(min=-2, max=1, step=.05, value=0, msg_throttle=1),\n", - " start_of_time=fixed(-2),\n", - " end_of_time=fixed(1),\n", - " velocities=fixed(mei_velocity),\n", - " center_name=fixed(\"Us\"),\n", - " other_point_name=fixed(\"Mei\"),\n", - " make_title=fixed(lambda t: \"Mei's position {:01g} hour{} in the {}\".format(abs(t), \"\" if abs(t) == 1 else \"s\", \"past\" if t < 0 else \"future\")));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want to know how long she's been driving, but we forgot to record the time when she left. If we find out that she's 120 miles away, and she's been going 60 miles per hour the whole time, we can infer that she left 2 hours ago.\n", - "\n", - "One way we can compute that number is by fitting a line to a scatter plot of our locations and speeds. It turns out that the *slope* of that line is the amount of time that has passed. 
Run the next cell to see a picture:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Just run this cell.\n", - "small_driving_example = Table().with_columns(\n", - " \"Name\", make_array(\"Us\", \"Mei\"),\n", - " \"Speed moving away from us (miles per hour)\", make_array(0, 60),\n", - " \"Current distance from us (miles)\", make_array(0, 120))\n", - "\n", - "small_driving_example.scatter(1, 2, s=200, fit_line=True)\n", - "\n", - "# Fancy magic to draw each person's name with their dot.\n", - "with_slope_indicator = small_driving_example.with_row(\n", - " [\"Slope = 2\\ hours\", small_driving_example.column(1).mean(), small_driving_example.column(2).mean()])\n", - "for i in range(with_slope_indicator.num_rows):\n", - " name = with_slope_indicator.column(0).item(i)\n", - " x = with_slope_indicator.column(1).item(i)\n", - " y = with_slope_indicator.column(2).item(i)\n", - " plt.scatter(make_array(x - 15), make_array(y + 15), s=1000*len(name), marker=\"$\\mathrm{\" + name + \"}$\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The slope of the line is 2 hours. (The units are vertical-axis units divided by horizontal-axis units, which are $\\frac{\\texttt{miles}}{\\texttt{miles} / \\texttt{hour}}$, or hours.) So that's our answer.\n", - "\n", - "Imagine that you don't know Mei's exact distance or speed, only rough estimates. Then if you drew this line, you'd get a slightly bad estimate of the time since she left. But if you measured the distance and speed of hundreds of people who left you at the same time going different speeds, and drew a line through them, the slope of that line would be a pretty good estimate of the time they left, even if the individual measurements weren't exactly right.\n", - "\n", - "The `drivers.csv` dataset contains the speeds and distances-from-start of 100 drivers. 
They all left the same starting location at the same time, driving at a fixed speed on a straight line away from the start. The measurements aren't exact, so they don't fit exactly on a line. We've created a scatter plot and drawn a line through the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Just run this cell.\n", - "Table.read_table(\"drivers.csv\").scatter(0, 1, fit_line=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.1 **
\n", - "By looking at the fit line, estimate how long ago (in hours) Mei left." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Fill in the start time you infer from the above line.\n", - "driving_start_time_hours = ...\n", - "driving_start_time_hours" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "check('tests/q1_1.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Back to cosmology\n", - "To do the same thing for the universe, we need to know the distance-from-Earth and speed-away-from-Earth of many celestial objects. Using pictures taken by very accurate telescopes and a lot of physics, astronomers have been able to estimate both. It turns out that *nearby supernovae* -- stars that have recently died and exploded -- are among the best sources of this data, because they are very easy to see. This picture taken by the Hubble telescope shows an entire galaxy, with a single supernova - as bright by itself as billions of stars - at the bottom left.\n", - "\n", - "\n", - "\n", - "Our astronomical data for today will come from the [Supernova Cosmology Project](http://supernova.lbl.gov/union/) at Lawrence Berkeley Lab. The original dataset is [here](http://supernova.lbl.gov/union/figures/SCPUnion2.1_mu_vs_z.txt), with (brief) documentation [here](http://supernova.lbl.gov/union/descriptions.html#Magvsz). Each row in the table corresponds to a supernova near Earth that was observed by astronomers. From pictures like the one above, the astronomers deduced how far away each supernova was from Earth and how fast it was moving away from Earth. Their deductions were good, but not perfect.\n", - "\n", - "Run the cell below to load the data into a table called `close_novas` and make a scatter plot. 
(If you prefer, you can also use the name `close_novae`; both are correct.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Just run this cell.\n", - "close_novas = Table.read_table(\"close_novas.csv\")\n", - "close_novae = close_novas\n", - "\n", - "close_novas.scatter(0, 1, fit_line=True)\n", - "close_novas" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.2 **
\n", - "Looking this plot, make a guess at the age of the universe.\n", - "\n", - "**Note**: Make sure you get the units right! In case you need to know what a parsec is, it's a big unit of distance, equivalent to 30.86 trillion kilometers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Fill this in manually by examining the line above.\n", - "first_guess_universe_age_years = ...\n", - "\n", - "# This just shows your guess as a nice string, in billions of years.\n", - "\"{:,} billion years\".format(round(first_guess_universe_age_years / 1e9, 2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_2.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fitting the line yourself\n", - "`fit_line=True` is convenient, but we need to be able to calculate the slope as a number. Recall that the least-squares regression line for our supernova data is:\n", - "* the line\n", - "* with the smallest average (over all the supernovae we observe)\n", - "* error,\n", - "* squared,\n", - "* where the error is\n", - "\n", - "$$\\text{the supernova's actual distance from Earth} - \\text{the height of the line at that supernova's speed.}$$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.3 **
\n", - "Define a function called `errors`. It should take three arguments:\n", - "1. a table like `close_novas` (with the same column names and meanings, but not necessarily the same data)\n", - "2. the slope of a line (a number)\n", - "3. the intercept of a line (a number).\n", - "\n", - "It should return an array of the errors made when a line with that slope and intercept is used to predict distance from speed for each supernova in the given table. (The error is the actual distance minus the predicted distance.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "def errors(t, slope, intercept):\n", - " ...\n", - " return ..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.4 **
\n", - "Using `errors`, compute the errors for the line with slope `16000` and intercept `0` on the `close_novas` dataset. Name that array `example_errors`. Then make a scatter plot of the errors.\n", - "\n", - "**Hint:** To make a scatter plot of the errors, plot the error for each supernova in the dataset. Put the actual speed on the horizontal axis and the error on the vertical axis." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "example_errors = ...\n", - "..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_4.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You should find that the errors are almost all negative. That means our line is a little bit too steep. Let's find a better one." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.5 **
\n", - "Define a function called `fit_line`. It should take a table like `close_novas` (with the same column names and meanings) as its argument. It should return an array containing the slope (as item 0) and intercept (as item 1) of the least-squares regression line predicting distance from speed for that table.\n", - "\n", - "Note: If you haven't tried to use the [`minimize` function](http://data8.org/datascience/util.html#datascience.util.minimize) yet, now is a great time to practice. Here's an [example from the textbook](https://www.inferentialthinking.com/chapters/15/3/Method_of_Least_Squares)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def fit_line(tbl):\n", - " # Your code may need more than 1 line below here.\n", - " def mse(..., ...):\n", - " ... \n", - " return ... \n", - " \n", - "# Here is an example call to your function. To test your function,\n", - "# figure out the right slope and intercept by hand.\n", - "example_table = Table().with_columns(\n", - " \"Speed (parsecs/year)\", make_array(0, 1),\n", - " \"Distance (million parsecs)\", make_array(1, 3))\n", - "fit_line(example_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_5.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.6 **
\n", - "Use your function to fit a line to `close_novas`.\n", - "\n", - "Then, set `new_errors` equal to the errors that we get calling `errors` with our new line. The cell below will graph the corresponding residual plot with a best fit line.\n", - "\n", - "Make sure that the residual plot makes sense (Hint: what qualities should the best fit line of a residual plot have?)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], - "source": [ - "best_line = ...\n", - "best_line_slope = ...\n", - "best_line_intercept = ...\n", - "\n", - "new_errors = ...\n", - "\n", - "# This code displays the residual plot, given your values for the best_line_slope and best_line_intercept\n", - "Table().with_column(\"Speed (parsecs/year)\", \n", - " close_novas.column(\"Speed (parsecs/year)\"), \n", - " \"Distance errors (million parsecs)\", \n", - " new_errors\n", - " ).scatter(0, 1, fit_line=True)\n", - "\n", - "# This just shows your answer as a nice string, in billions of years.\n", - "\"Slope: {:g} (corresponding to an estimated age of {:,} billion years)\".format(best_line_slope, round(best_line_slope/1000, 4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That slope (multiplied by 1 million) is an estimate of the age of the universe. The current best estimate of the age of the universe (using slightly more sophisticated techniques) is 13.799 billion years. Did we get close?\n", - "\n", - "One reason our answer might be a little off is that we are using a sample of only some of the supernovae in the universe. Our sample isn't exactly random, since astronomers presumably chose the novae that were easiest to measure (or used some other nonrandom criteria). But let's assume it is. How can we produce a confidence interval for the age of the universe?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "** Question 1.7 **
\n", - "It's time to bootstrap so that we can quantify the variability in our estimate! Simulate 1000 resamples from `close_novas`. For each resample, compute the slope of the least-squares regression line, and multiply it by 1 million to compute an estimate of the age of the universe. Store these ages in an array called `bootstrap_ages`, and then use them to compute a 95% confidence interval for the age of the universe.\n", - "\n", - "**Note:** This might take up to a minute, and more repetitions will take even longer." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "bootstrap_ages = make_array()\n", - "for i in np.arange(1000):\n", - " bootstrap_ages = ...\n", - "\n", - "lower_end = ...\n", - "upper_end = ...\n", - "Table().with_column(\"Age estimate\", bootstrap_ages*1e-9).hist(bins=np.arange(12, 16, .1), unit=\"billion years\")\n", - "print(\"95% confidence interval for the age of the universe: [{:g}, {:g}] billion years\".format(lower_end*1e-9, upper_end*1e-9))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "check('tests/q1_7.py')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Nice work, data astronomer! You can compare your result to the [Planck project 2015 results](https://arxiv.org/pdf/1502.01589.pdf), which estimated the age of the universe to be 13.799±0.021 billion years. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submission" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You're finished with lab 3! In order to successfully submit your assignment, follow these steps...\n", - "- **IMPORTANT** Before you do anything, **Save and Checkpoint** from the `File` menu. 
Please do this first before running the cell below,\n", - "- **run all the tests and verify that they all pass** (the next cell has a shortcut for that), \n", - "- **Review the notebook one last time, we will be grading the final state of your notebook** If you make any changes, please **Save and Checkpoint** again." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For your convenience, you can run this cell to run all the tests at once!\n", - "import glob\n", - "from gofer.ok import grade_notebook\n", - "if not globals().get('__GOFER_GRADER__', False):\n", - " display(grade_notebook('lab03.ipynb', sorted(glob.glob('tests/q*.py'))))" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - }, - "widgets": { - "state": { - "6c09ba2474d24e10bdd21db7b9699237": { - "views": [ - { - "cell_index": 9 - } - ] - }, - "ef0a0194fbdd498787d3894efa009a7e": { - "views": [ - { - "cell_index": 3 - } - ] - } - }, - "version": "1.2.0" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} From efece5cb046cf99849a3942992f9c80cdaa28d5f Mon Sep 17 00:00:00 2001 From: Yanay Rosen Date: Mon, 26 Nov 2018 14:41:04 -0800 Subject: [PATCH 3/8] Removed checkpoints --- materials/x18/.DS_Store | Bin 6148 -> 0 bytes materials/x18/lab/.DS_Store | Bin 6148 -> 0 bytes materials/x18/lab/2/.DS_Store | Bin 6148 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 materials/x18/.DS_Store delete mode 100644 materials/x18/lab/.DS_Store delete mode 100644 materials/x18/lab/2/.DS_Store diff --git a/materials/x18/.DS_Store b/materials/x18/.DS_Store deleted 
file mode 100644 index 732df4c51a33dd343999c518772416a1966ef230..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK!AiqG5Z!I7ZYyFBg5oI`!9x#?l`4o3svf)v5k08b#1PzF_`?pulc+tsEHCZpBn_gZ-80Eis|_LNr%}?< z!-gItt-e{uw-ZVgRq?XCIvTm#wXKTdZq&yWXXLuIYQ@>C)yLzaT3&N^51Ti4_oIjL zL0RRi&HUhT!`3H2+0q7BC7Qq7|Z7QHmmDFNL>K&v_hk1|UGmAD| zNb_b`hu$oyLy^>aNXLW==~?8F8DIuJGl0DxNEz?{gP;3<6T|~Ezzj?$1G2F1?>C?} zx?8_giFd67+5|O5Z!|}VMGrqI%9$kjMI`#3~P+cd+ffz zK8p{qkF!>FSAy~9szIut`m3(4>Y4s}=$;Zn$VRv62~iP3C{V;g37U5V$5G!%fjrZI z66Y`xSUA--jWA3)18(oU{lr{G$=Lp_{>Y&Jbwu`|G0DgdqocUhJFooMHE9;cX{&9M z<7Nv&PLJcXX?hLQO`9D%Pv8cWDyh03@`%| z$N(LAYHkAGjbDiwUsV9?$D(RNyUexCrheOqY2bz#_*4d-0nxH(vj6}9 diff --git a/materials/x18/lab/2/.DS_Store b/materials/x18/lab/2/.DS_Store deleted file mode 100644 index 894b1eedb037356b79646dcd96d3c97eb5981c21..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}yIJ5Vk`ACkQnMB;e*tB@Q|4W+_6dkX9)@aI3Ucd!SX=ZKAT$vPQ|K5rkBG z;U&rw@G4w-?1S_HV8$LGe|xG_p~xeRzp>|=-S|^%uNh;!*>Agy)fi&}9I;%5W{zPV zea1>OQUT=p8O~9nzD0?OX1I31Dqt0Oc?#g#U1kXjS;QPRzJ75_cniLBuTB5-XQn&0 z(vf*v%!~%TbJ`Q`qVpu~WsdWQmzK)QE2~0OMRmRUH8@p+APcj8uNiiavG*a&P9usx zDwRCNVe~2P3|h5~cPh=oIPG+Gaop|z<;&+dZL2|3_0x7wKPNB=Au6KMs%;O4jmExL zm!3DO%c1LfPF?PLd!tcBY`%66j!w=lE{9j6pTG2c8HasM+bhCHxPf3yjUPcbO;q|F zMzP{K)0otWZ$fsyqTUqqx{$d$#r#{yd^5=`g@w$$Ddz7fCUmVWm;&_Q7HbPOFFPHp zfK}k%6~O&Lf+PBbxk0^kK%=h!fMpog1~GmPV2vd733G#x0pq3=(3EnzVsKLq=dq$2!2Hcne$` Date: Tue, 27 Nov 2018 11:19:27 -0800 Subject: [PATCH 4/8] Removed DS Store --- .DS_Store | Bin 6148 -> 0 bytes materials/.DS_Store | Bin 6148 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store delete mode 100644 materials/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index f9ad8639c2efffdf647cfbffce82547c18915f3f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 
zcmeHKO>fgM7=GOrmKFgOLK=cgE=U{-YUnhO(4Z3>9B+n^qC8qgtD?b-U>i}@gkd4P7YIXpnR z^<5e~q751$)~67wJ)u!4`co;oDtClIF^X-LqUv;n9y9qA5xkhv665`PMlegq`nf*` z<2XvQcKa)W%lvGbcKyjqQF`KM2Z8pVaUP%fe(*dR&ic)( zk9d;#Q8JuJauf~`^6EvDgnZWJ(sRm2xo;n$KE7~c``z)VBT03kZuB0Z!$ZxVcv$_j1WTByG>V!vJgUnBIb%Pd?PrIIwm>tG$J!qfc(?O74hh!gW9e>$DcoemI=jDZ6ouomOwAv;)YPKNc^f*eI zx>wiTq}eg+_;y06qAH%3m-~HptF~Ej+_jBC#p%0lty*!`Ya4?>Q7x{xJNp+m50Cw) z!SjpV3kjd8k$J))yn;am-udk$*2xukhS9?)MrME+UDT$ zMtAF2m3Y@WpckMh7?)Z6N&&-M#Sn{E@e-&J@H=P#dW4xp@PP1-fT4j0X5dd5_yiOU BSWo}} From 7e76f62d1650cc93231d5a47295066ca1600bd22 Mon Sep 17 00:00:00 2001 From: Yanay Rosen Date: Tue, 27 Nov 2018 11:21:41 -0800 Subject: [PATCH 5/8] Removed DS Store --- materials/x18/lab/1/lab01/lab01.ipynb | 32 +++++---------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/materials/x18/lab/1/lab01/lab01.ipynb b/materials/x18/lab/1/lab01/lab01.ipynb index 76b9247..9e4ae9b 100644 --- a/materials/x18/lab/1/lab01/lab01.ipynb +++ b/materials/x18/lab/1/lab01/lab01.ipynb @@ -93,44 +93,22 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/plain": [ - "-724.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "3+6*5-6*3**2*2**3/4*7" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/plain": [ - "2018.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "3+(6*5-(6*3))**2*((2**3)/4*7)" ] @@ -163,7 +141,7 @@ }, "outputs": [], "source": [ - "5 * (43/11) - (49 + 1/)" + "..." 
] }, { From 9864dd42a965e0f54c421c62c17635ee0b6be5e6 Mon Sep 17 00:00:00 2001 From: Yanay Rosen Date: Tue, 27 Nov 2018 11:28:29 -0800 Subject: [PATCH 6/8] Removed DS Store --- materials/x18/lab/2/lab01/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 materials/x18/lab/2/lab01/.DS_Store diff --git a/materials/x18/lab/2/lab01/.DS_Store b/materials/x18/lab/2/lab01/.DS_Store deleted file mode 100644 index 731da27837247aba4976d7a4ec83ef932d1622c3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKu};G<5Pc4<6x4x^3=A2WDmt>IRT0CFgj$j!{oX6mPH}bB|4pBh6QeMg;&hCvMul* z6_B$#!3ZlZDB zC7$tudwnA+juu>Jz{mm*O?EUQ<70!}LqT4KGYrX?lk-ki)?~RGvNqR@8>|vft^Chc zeph!bd#{?A(RnMALo-vr6fgz$L;==pvAE~aT2sIjFa_2M$oCcSPnc{<}s$V&s(9z7i{Ts~Y_*@YX5)73eDqUms{ zM{7+1Q=qB9fgkpz{$Kol|8J6PWD1x9d!>NuOs*$mZYk8(t<6cTP3U)YG0Cevt|{zj gDQ2#e;$ylRLX~5beBQ*OFP#LT-1^!flA9mSTe*gdg From 41160739230f8cb0e063705241aa76bc59d9e7bb Mon Sep 17 00:00:00 2001 From: Yanay Rosen Date: Tue, 27 Nov 2018 11:46:57 -0800 Subject: [PATCH 7/8] Updated 2.3 Format --- materials/x18/lab/2/lab01/lab01.ipynb | 4 +-- materials/x18/lab/3/lab01/lab01.ipynb | 40 +-------------------------- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/materials/x18/lab/2/lab01/lab01.ipynb b/materials/x18/lab/2/lab01/lab01.ipynb index 2f1baea..cbe2a9c 100644 --- a/materials/x18/lab/2/lab01/lab01.ipynb +++ b/materials/x18/lab/2/lab01/lab01.ipynb @@ -332,9 +332,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "suits = make_array(\"♤\", \"♡\", \"♢\", \"♧\")\n", diff --git a/materials/x18/lab/3/lab01/lab01.ipynb b/materials/x18/lab/3/lab01/lab01.ipynb index 58f5ad1..4ebc25c 100644 --- a/materials/x18/lab/3/lab01/lab01.ipynb +++ b/materials/x18/lab/3/lab01/lab01.ipynb @@ -433,45 +433,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "** Question 2.3 
**\n", - "
\n", - "Below, we'll look at what happens when we take a fixed sample, then bootstrap from it with different numbers of resamples. How does the distribution of the resampled means change?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 1000)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 5000)\n", - "plots.xlim(50000, 100000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "simulate_sample_mean(salaries, 'salary', 100, 10000)\n", - "plots.xlim(50000, 100000)" + "** Question 2.3 **" ] }, { From c34264024858d5ca40d7d7fdbd0ea1f22407a01b Mon Sep 17 00:00:00 2001 From: Yanay Rosen Date: Mon, 3 Dec 2018 19:49:19 -0800 Subject: [PATCH 8/8] Removed crossroads references from 8.2-1 --- materials/x18/lab/2/lab01/lab01.ipynb | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/materials/x18/lab/2/lab01/lab01.ipynb b/materials/x18/lab/2/lab01/lab01.ipynb index cbe2a9c..0b024b6 100644 --- a/materials/x18/lab/2/lab01/lab01.ipynb +++ b/materials/x18/lab/2/lab01/lab01.ipynb @@ -332,7 +332,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "suits = make_array(\"♤\", \"♡\", \"♢\", \"♧\")\n", @@ -551,14 +553,14 @@ "metadata": {}, "source": [ "## 3. Finding Probabilities\n", - "After a long day of class, Clay decides to go to Crossroads for dinner. Today's menu has Clay's four favorite foods: enchiladas, hamburgers, pizza, and spaghetti. However, each dish has a 30% chance of running out before Clay can get to Crossroads." 
+ "After a long day of class, Clay decides to go to a food court for dinner. Today's menu has Clay's four favorite foods: enchiladas, hamburgers, pizza, and spaghetti. However, each dish has a 30% chance of running out before Clay can get to the food court." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Question 3.1**
What is the probability that Clay will be able to eat pizza at Crossroads?" + "**Question 3.1**
What is the probability that Clay will be able to eat pizza at the food court?" ] }, { @@ -587,7 +589,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Question 3.2**
What is the probability that Clay will be able to eat all four of these foods at Crossroads?" + "**Question 3.2**
What is the probability that Clay will be able to eat all four of these foods at the food court?" ] }, { @@ -616,7 +618,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Question 3.3**
What is the probability that Crossroads will have run out of something before Clay can get there?" + "**Question 3.3**
What is the probability that the food court will have run out of something before Clay can get there?" ] }, { @@ -645,7 +647,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To make up for their unpredictable food supply, Crossroads decides to hold a contest for some free Cal Dining swag. There is a bag with two red marbles, two green marbles, and two blue marbles. Clay has to draw three marbles separately. In order to win, all three of these marbles must be of different colors." + "To make up for their unpredictable food supply, the food court decides to hold a contest for some free food. There is a bag with two red marbles, two green marbles, and two blue marbles. Clay has to draw three marbles separately. In order to win, all three of these marbles must be of different colors." ] }, {