318 lines
30 KiB
HTML
318 lines
30 KiB
HTML
<!DOCTYPE html>
|
||
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
|
||
<head>
|
||
<meta charset="utf-8" />
|
||
<meta name="generator" content="pandoc" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
|
||
<title>4.1 Tabular Data</title>
|
||
<style>
|
||
code{white-space: pre-wrap;}
|
||
span.smallcaps{font-variant: small-caps;}
|
||
span.underline{text-decoration: underline;}
|
||
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
||
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
||
ul.task-list{list-style: none;}
|
||
pre > code.sourceCode { white-space: pre; position: relative; }
|
||
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
|
||
pre > code.sourceCode > span:empty { height: 1.2em; }
|
||
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
||
div.sourceCode { margin: 1em 0; }
|
||
pre.sourceCode { margin: 0; }
|
||
@media screen {
|
||
div.sourceCode { overflow: auto; }
|
||
}
|
||
@media print {
|
||
pre > code.sourceCode { white-space: pre-wrap; }
|
||
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
|
||
}
|
||
pre.numberSource code
|
||
{ counter-reset: source-line 0; }
|
||
pre.numberSource code > span
|
||
{ position: relative; left: -4em; counter-increment: source-line; }
|
||
pre.numberSource code > span > a:first-child::before
|
||
{ content: counter(source-line);
|
||
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
||
border: none; display: inline-block;
|
||
-webkit-touch-callout: none; -webkit-user-select: none;
|
||
-khtml-user-select: none; -moz-user-select: none;
|
||
-ms-user-select: none; user-select: none;
|
||
padding: 0 4px; width: 4em;
|
||
color: #aaaaaa;
|
||
}
|
||
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
|
||
div.sourceCode
|
||
{ }
|
||
@media screen {
|
||
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
||
}
|
||
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
|
||
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
|
||
code span.at { color: #7d9029; } /* Attribute */
|
||
code span.bn { color: #40a070; } /* BaseN */
|
||
code span.bu { } /* BuiltIn */
|
||
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
|
||
code span.ch { color: #4070a0; } /* Char */
|
||
code span.cn { color: #880000; } /* Constant */
|
||
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
|
||
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
|
||
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
|
||
code span.dt { color: #902000; } /* DataType */
|
||
code span.dv { color: #40a070; } /* DecVal */
|
||
code span.er { color: #ff0000; font-weight: bold; } /* Error */
|
||
code span.ex { } /* Extension */
|
||
code span.fl { color: #40a070; } /* Float */
|
||
code span.fu { color: #06287e; } /* Function */
|
||
code span.im { } /* Import */
|
||
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
|
||
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
|
||
code span.op { color: #666666; } /* Operator */
|
||
code span.ot { color: #007020; } /* Other */
|
||
code span.pp { color: #bc7a00; } /* Preprocessor */
|
||
code span.sc { color: #4070a0; } /* SpecialChar */
|
||
code span.ss { color: #bb6688; } /* SpecialString */
|
||
code span.st { color: #4070a0; } /* String */
|
||
code span.va { color: #19177c; } /* Variable */
|
||
code span.vs { color: #4070a0; } /* VerbatimString */
|
||
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
|
||
</style>
|
||
<link rel="stylesheet" href="../tufte.css" />
|
||
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js" type="text/javascript"></script>
|
||
<!--[if lt IE 9]>
|
||
<script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
|
||
<![endif]-->
|
||
</head>
|
||
<body>
|
||
<div style="display:none">
|
||
\(
|
||
\newcommand{\NOT}{\neg}
|
||
\newcommand{\AND}{\wedge}
|
||
\newcommand{\OR}{\vee}
|
||
\newcommand{\XOR}{\oplus}
|
||
\newcommand{\IMP}{\Rightarrow}
|
||
\newcommand{\IFF}{\Leftrightarrow}
|
||
\newcommand{\TRUE}{\text{True}\xspace}
|
||
\newcommand{\FALSE}{\text{False}\xspace}
|
||
\newcommand{\IN}{\,{\in}\,}
|
||
\newcommand{\NOTIN}{\,{\notin}\,}
|
||
\newcommand{\TO}{\rightarrow}
|
||
\newcommand{\DIV}{\mid}
|
||
\newcommand{\NDIV}{\nmid}
|
||
\newcommand{\MOD}[1]{\pmod{#1}}
|
||
\newcommand{\MODS}[1]{\ (\text{mod}\ #1)}
|
||
\newcommand{\N}{\mathbb N}
|
||
\newcommand{\Z}{\mathbb Z}
|
||
\newcommand{\Q}{\mathbb Q}
|
||
\newcommand{\R}{\mathbb R}
|
||
\newcommand{\C}{\mathbb C}
|
||
\newcommand{\cA}{\mathcal A}
|
||
\newcommand{\cB}{\mathcal B}
|
||
\newcommand{\cC}{\mathcal C}
|
||
\newcommand{\cD}{\mathcal D}
|
||
\newcommand{\cE}{\mathcal E}
|
||
\newcommand{\cF}{\mathcal F}
|
||
\newcommand{\cG}{\mathcal G}
|
||
\newcommand{\cH}{\mathcal H}
|
||
\newcommand{\cI}{\mathcal I}
|
||
\newcommand{\cJ}{\mathcal J}
|
||
\newcommand{\cL}{\mathcal L}
|
||
\newcommand{\cK}{\mathcal K}
|
||
\newcommand{\cN}{\mathcal N}
|
||
\newcommand{\cO}{\mathcal O}
|
||
\newcommand{\cP}{\mathcal P}
|
||
\newcommand{\cQ}{\mathcal Q}
|
||
\newcommand{\cS}{\mathcal S}
|
||
\newcommand{\cT}{\mathcal T}
|
||
\newcommand{\cV}{\mathcal V}
|
||
\newcommand{\cW}{\mathcal W}
|
||
\newcommand{\cZ}{\mathcal Z}
|
||
\newcommand{\emp}{\emptyset}
|
||
\newcommand{\bs}{\backslash}
|
||
\newcommand{\floor}[1]{\left \lfloor #1 \right \rfloor}
|
||
\newcommand{\ceil}[1]{\left \lceil #1 \right \rceil}
|
||
\newcommand{\abs}[1]{\left | #1 \right |}
|
||
\newcommand{\xspace}{}
|
||
\newcommand{\proofheader}[1]{\underline{\textbf{#1}}}
|
||
\)
|
||
</div>
|
||
<header id="title-block-header">
|
||
<h1 class="title">4.1 Tabular Data</h1>
|
||
</header>
|
||
<section>
|
||
<p>We’ve seen how Python can store collections of data, such as lists, sets, and dictionaries. Mostly, we’ve focused on collections of integers or strings. But what about collections of collections? We’ve actually encountered this already: our <a href="../03-logic/04-if-statements.html"><code>count_cancelled</code> function</a> had a parameter <code>flights</code> that was a dictionary whose values were lists, and we represented the <span class="math inline">\(Loves\)</span> predicate as a <a href="../03-logic/11-multiple-quantifiers.html">list of lists</a>, storing a two-dimensional table of booleans. In this section, we’ll look at using list of lists to store more complex forms of tabular data, like a table from a spreadsheet, and writing functions to perform computations on this data.</p>
|
||
<h2 id="toronto-getting-married">Toronto getting married</h2>
|
||
<p>Let’s consider a <a href="https://open.toronto.ca/dataset/marriage-licence-statistics/">real data set</a> from the city of Toronto. This data shows information about how many marriage licenses were issued in Toronto at a particular location and month. The data is in a tabular format with four columns: id, civic centre, number of marriage licenses issued, and time period. Each row of the table tells us how many marriage licenses were issued by a civic centre in a specific time period; the id is simply a unique numerical identifier for each row. Suppose we wanted to answer the following question: What is the average number of marriage licenses issued by each civic centre?</p>
|
||
<table>
|
||
<thead>
|
||
<tr class="header">
|
||
<th style="text-align: center;"><strong>ID</strong></th>
|
||
<th style="text-align: center;"><strong>Civic Centre</strong></th>
|
||
<th style="text-align: center;"><strong>Marriage Licenses Issued</strong></th>
|
||
<th style="text-align: center;"><strong>Time Period</strong></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">1657</td>
|
||
<td style="text-align: center;">ET</td>
|
||
<td style="text-align: center;">80</td>
|
||
<td style="text-align: center;">January 1, 2011</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">1658</td>
|
||
<td style="text-align: center;">NY</td>
|
||
<td style="text-align: center;">136</td>
|
||
<td style="text-align: center;">January 1, 2011</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">1659</td>
|
||
<td style="text-align: center;">SC</td>
|
||
<td style="text-align: center;">159</td>
|
||
<td style="text-align: center;">January 1, 2011</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">1660</td>
|
||
<td style="text-align: center;">TO</td>
|
||
<td style="text-align: center;">367</td>
|
||
<td style="text-align: center;">January 1, 2011</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">1661</td>
|
||
<td style="text-align: center;">ET</td>
|
||
<td style="text-align: center;">109</td>
|
||
<td style="text-align: center;">February 1, 2011</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">1662</td>
|
||
<td style="text-align: center;">NY</td>
|
||
<td style="text-align: center;">150</td>
|
||
<td style="text-align: center;">February 1, 2011</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">1663</td>
|
||
<td style="text-align: center;">SC</td>
|
||
<td style="text-align: center;">154</td>
|
||
<td style="text-align: center;">February 1, 2011</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">1664</td>
|
||
<td style="text-align: center;">TO</td>
|
||
<td style="text-align: center;">383</td>
|
||
<td style="text-align: center;">February 1, 2011</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p>To write a program that uses this data, we must first decide on a way to store it. As we did with our <span class="math inline">\(Loves\)</span> table of values, we’ll store this table as a list of lists, where each inner list represents one row of the table. Unlike our previous example, these lists won’t just store boolean values, so we need to determine what data type to use for each column, based on the sample data we have.</p>
|
||
<ul>
|
||
<li>The ids and number of marriage licenses are natural numbers, so we’ll use the <code>int</code> data type for them.</li>
|
||
<li>The civic centre is a two-letter code, and so we’ll store it as a <code>str</code>.</li>
|
||
<li>The time period is a year-month combination; we’ll represent these as dates using the <code>datetime</code> module.<label for="sn-0" class="margin-toggle sidenote-number"></label><input type="checkbox" id="sn-0" class="margin-toggle"/><span class="sidenote"> To review this <code>date</code> data type, check out <a href="../02-functions/04-importing-modules.html">2.4 Importing Modules</a>.</span></li>
|
||
</ul>
|
||
<p>With this in mind, let us see how we can store our data as a nested list<label for="sn-1" class="margin-toggle sidenote-number"></label><input type="checkbox" id="sn-1" class="margin-toggle"/><span class="sidenote"> In tutorial, you will explore how to load the data from a file into a nested list.</span>:</p>
|
||
<div class="sourceCode" id="cb1"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1"></a><span class="op">>>></span> <span class="im">import</span> datetime</span>
|
||
<span id="cb1-2"><a href="#cb1-2"></a><span class="op">>>></span> marriage_data <span class="op">=</span> [</span>
|
||
<span id="cb1-3"><a href="#cb1-3"></a>... [<span class="dv">1657</span>, <span class="st">'ET'</span>, <span class="dv">80</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">1</span>, <span class="dv">1</span>)],</span>
|
||
<span id="cb1-4"><a href="#cb1-4"></a>... [<span class="dv">1658</span>, <span class="st">'NY'</span>, <span class="dv">136</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">1</span>, <span class="dv">1</span>)],</span>
|
||
<span id="cb1-5"><a href="#cb1-5"></a>... [<span class="dv">1659</span>, <span class="st">'SC'</span>, <span class="dv">159</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">1</span>, <span class="dv">1</span>)],</span>
|
||
<span id="cb1-6"><a href="#cb1-6"></a>... [<span class="dv">1660</span>, <span class="st">'TO'</span>, <span class="dv">367</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">1</span>, <span class="dv">1</span>)],</span>
|
||
<span id="cb1-7"><a href="#cb1-7"></a>... [<span class="dv">1661</span>, <span class="st">'ET'</span>, <span class="dv">109</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">2</span>, <span class="dv">1</span>)],</span>
|
||
<span id="cb1-8"><a href="#cb1-8"></a>... [<span class="dv">1662</span>, <span class="st">'NY'</span>, <span class="dv">150</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">2</span>, <span class="dv">1</span>)],</span>
|
||
<span id="cb1-9"><a href="#cb1-9"></a>... [<span class="dv">1663</span>, <span class="st">'SC'</span>, <span class="dv">154</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">2</span>, <span class="dv">1</span>)],</span>
|
||
<span id="cb1-10"><a href="#cb1-10"></a>... [<span class="dv">1664</span>, <span class="st">'TO'</span>, <span class="dv">383</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">2</span>, <span class="dv">1</span>)]</span>
|
||
<span id="cb1-11"><a href="#cb1-11"></a>... ]</span>
|
||
<span id="cb1-12"><a href="#cb1-12"></a><span class="op">>>></span> <span class="bu">len</span>(marriage_data) <span class="co"># There are eight rows of data</span></span>
|
||
<span id="cb1-13"><a href="#cb1-13"></a><span class="dv">8</span></span>
|
||
<span id="cb1-14"><a href="#cb1-14"></a><span class="op">>>></span> <span class="bu">len</span>(marriage_data[<span class="dv">0</span>]) <span class="co"># The first row has four elements</span></span>
|
||
<span id="cb1-15"><a href="#cb1-15"></a><span class="dv">4</span></span>
|
||
<span id="cb1-16"><a href="#cb1-16"></a><span class="op">>>></span> [<span class="bu">len</span>(row) <span class="cf">for</span> row <span class="kw">in</span> marriage_data] <span class="co"># Every row has four elements</span></span>
|
||
<span id="cb1-17"><a href="#cb1-17"></a>[<span class="dv">4</span>, <span class="dv">4</span>, <span class="dv">4</span>, <span class="dv">4</span>, <span class="dv">4</span>, <span class="dv">4</span>, <span class="dv">4</span>, <span class="dv">4</span>]</span>
|
||
<span id="cb1-18"><a href="#cb1-18"></a><span class="op">>>></span> marriage_data[<span class="dv">0</span>]</span>
|
||
<span id="cb1-19"><a href="#cb1-19"></a>[<span class="dv">1657</span>, <span class="st">'ET'</span>, <span class="dv">80</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">1</span>, <span class="dv">1</span>)]</span>
|
||
<span id="cb1-20"><a href="#cb1-20"></a><span class="op">>>></span> marriage_data[<span class="dv">1</span>]</span>
|
||
<span id="cb1-21"><a href="#cb1-21"></a>[<span class="dv">1658</span>, <span class="st">'NY'</span>, <span class="dv">136</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">1</span>, <span class="dv">1</span>)]</span></code></pre></div>
|
||
<p>We can see that by indexing the nested list <code>marriage_data</code>, a list is returned. Specifically, this list represents a row from our table. For each row, we can then access its id via index 0, its civic centre via index 1, and so on.</p>
|
||
<div class="sourceCode" id="cb2"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1"></a><span class="op">>>></span> marriage_data[<span class="dv">0</span>][<span class="dv">0</span>]</span>
|
||
<span id="cb2-2"><a href="#cb2-2"></a><span class="dv">1657</span></span>
|
||
<span id="cb2-3"><a href="#cb2-3"></a><span class="op">>>></span> marriage_data[<span class="dv">0</span>][<span class="dv">1</span>]</span>
|
||
<span id="cb2-4"><a href="#cb2-4"></a><span class="co">'ET'</span></span>
|
||
<span id="cb2-5"><a href="#cb2-5"></a><span class="op">>>></span> marriage_data[<span class="dv">0</span>][<span class="dv">2</span>]</span>
|
||
<span id="cb2-6"><a href="#cb2-6"></a><span class="dv">80</span></span>
|
||
<span id="cb2-7"><a href="#cb2-7"></a><span class="op">>>></span> marriage_data[<span class="dv">0</span>][<span class="dv">3</span>]</span>
|
||
<span id="cb2-8"><a href="#cb2-8"></a>datetime.date(<span class="dv">2011</span>, <span class="dv">1</span>, <span class="dv">1</span>)</span></code></pre></div>
|
||
<h2 id="accessing-columns-and-filtering-rows">Accessing columns and filtering rows</h2>
|
||
<p>Suppose we want to see all of the different values from a single column of this table (e.g., all civic centres or marriage license numbers). We can retrieve a column by using a list comprehension:</p>
|
||
<div class="sourceCode" id="cb3"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1"></a><span class="op">>>></span> [row[<span class="dv">1</span>] <span class="cf">for</span> row <span class="kw">in</span> marriage_data] <span class="co"># The civic centre column</span></span>
|
||
<span id="cb3-2"><a href="#cb3-2"></a>[<span class="st">'ET'</span>, <span class="st">'NY'</span>, <span class="st">'SC'</span>, <span class="st">'TO'</span>, <span class="st">'ET'</span>, <span class="st">'NY'</span>, <span class="st">'SC'</span>, <span class="st">'TO'</span>]</span></code></pre></div>
|
||
<p>Or, using an identically-structured set comprehension, we can obtain all unique values in a column.</p>
|
||
<div class="sourceCode" id="cb4"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1"></a><span class="op">>>></span> {row[<span class="dv">1</span>] <span class="cf">for</span> row <span class="kw">in</span> marriage_data}</span>
|
||
<span id="cb4-2"><a href="#cb4-2"></a>{<span class="st">'NY'</span>, <span class="st">'TO'</span>, <span class="st">'ET'</span>, <span class="st">'SC'</span>}</span></code></pre></div>
|
||
<p>Using our knowledge of filtering using if conditions in comprehensions, we can retrieve all rows corresponding to a specific civic centre.</p>
|
||
<div class="sourceCode" id="cb5"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1"></a><span class="op">>>></span> [row <span class="cf">for</span> row <span class="kw">in</span> marriage_data <span class="cf">if</span> row[<span class="dv">1</span>] <span class="op">==</span> <span class="st">'TO'</span>]</span>
|
||
<span id="cb5-2"><a href="#cb5-2"></a>[[<span class="dv">1660</span>, <span class="st">'TO'</span>, <span class="dv">367</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">1</span>, <span class="dv">1</span>)], [<span class="dv">1664</span>, <span class="st">'TO'</span>, <span class="dv">383</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">2</span>, <span class="dv">1</span>)]]</span></code></pre></div>
|
||
<p>Or we can filter rows based on a threshold for the number of marriage licenses issued:</p>
|
||
<div class="sourceCode" id="cb6"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1"></a><span class="op">>>></span> [row <span class="cf">for</span> row <span class="kw">in</span> marriage_data <span class="cf">if</span> row[<span class="dv">2</span>] <span class="op">></span> <span class="dv">380</span>]</span>
|
||
<span id="cb6-2"><a href="#cb6-2"></a>[[<span class="dv">1664</span>, <span class="st">'TO'</span>, <span class="dv">383</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">2</span>, <span class="dv">1</span>)]]</span></code></pre></div>
|
||
<h2 id="a-worked-example">A worked example</h2>
|
||
<p>Earlier, we asked the question: What is the average number of marriage licenses issued by each civic centre? The question implies a mapping of civic centre names to numbers (i.e., the average). This means we need to create a dictionary comprehension. Let’s start exploring in the Python console. Remember, we saw earlier that we can get all unique civic centre names in the data through a set comprehension.</p>
|
||
<div class="sourceCode" id="cb7"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1"></a><span class="op">>>></span> names <span class="op">=</span> {row[<span class="dv">1</span>] <span class="cf">for</span> row <span class="kw">in</span> marriage_data}</span>
|
||
<span id="cb7-2"><a href="#cb7-2"></a><span class="op">>>></span> names</span>
|
||
<span id="cb7-3"><a href="#cb7-3"></a>{<span class="st">'NY'</span>, <span class="st">'TO'</span>, <span class="st">'ET'</span>, <span class="st">'SC'</span>}</span>
|
||
<span id="cb7-4"><a href="#cb7-4"></a><span class="op">>>></span> {key: <span class="dv">0</span> <span class="cf">for</span> key <span class="kw">in</span> names}</span>
|
||
<span id="cb7-5"><a href="#cb7-5"></a>{<span class="st">'NY'</span>: <span class="dv">0</span>, <span class="st">'TO'</span>: <span class="dv">0</span>, <span class="st">'ET'</span>: <span class="dv">0</span>, <span class="st">'SC'</span>: <span class="dv">0</span>}</span></code></pre></div>
|
||
<p>So far, we’ve created a dictionary where each key is a civic centre name and they all map to the value 0. To proceed, we need to be able to calculate the average number of marriage licenses issued per month by each civic centre.</p>
|
||
<p>Let’s try to do this just for the <code>'TO'</code> civic centre first. We saw earlier how to get all rows for a specific civic centre, and to extract the values for a specific column. We’ll first combine these two operations to retrieve the number of marriage licenses issued by <code>'TO'</code> each month.</p>
|
||
<div class="sourceCode" id="cb8"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1"></a><span class="op">>>></span> [row <span class="cf">for</span> row <span class="kw">in</span> marriage_data <span class="cf">if</span> row[<span class="dv">1</span>] <span class="op">==</span> <span class="st">'TO'</span>] <span class="co"># The 'TO' rows</span></span>
|
||
<span id="cb8-2"><a href="#cb8-2"></a>[[<span class="dv">1660</span>, <span class="st">'TO'</span>, <span class="dv">367</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">1</span>, <span class="dv">1</span>)], [<span class="dv">1664</span>, <span class="st">'TO'</span>, <span class="dv">383</span>, datetime.date(<span class="dv">2011</span>, <span class="dv">2</span>, <span class="dv">1</span>)]]</span>
|
||
<span id="cb8-3"><a href="#cb8-3"></a><span class="op">>>></span> [row[<span class="dv">2</span>] <span class="cf">for</span> row <span class="kw">in</span> marriage_data <span class="cf">if</span> row[<span class="dv">1</span>] <span class="op">==</span> <span class="st">'TO'</span>] <span class="co"># The 'TO' marriages issued</span></span>
|
||
<span id="cb8-4"><a href="#cb8-4"></a>[<span class="dv">367</span>, <span class="dv">383</span>]</span>
|
||
<span id="cb8-5"><a href="#cb8-5"></a><span class="op">>>></span> issued_by_TO <span class="op">=</span> [row[<span class="dv">2</span>] <span class="cf">for</span> row <span class="kw">in</span> marriage_data <span class="cf">if</span> row[<span class="dv">1</span>] <span class="op">==</span> <span class="st">'TO'</span>]</span></code></pre></div>
|
||
<p>So <code>issued_by_TO</code> is now a list containing the number of marriage licenses issued by the <code>'TO'</code> civic centre. We can now calculate their average by dividing its sum by its length:</p>
|
||
<div class="sourceCode" id="cb9"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1"></a><span class="op">>>></span> <span class="bu">sum</span>(issued_by_TO) <span class="op">/</span> <span class="bu">len</span>(issued_by_TO)</span>
|
||
<span id="cb9-2"><a href="#cb9-2"></a><span class="fl">375.0</span></span></code></pre></div>
|
||
<p>Excellent! Through our exploration, we managed to find the average number of marriage licenses issued by one specific civic centre. How can we merge this with our earlier dictionary comprehension? It’s quite a bit to keep in our head at once, and looks like it will quickly get messy. At this point, we should design a function to help us. Specifically, let’s design a function that calculates the average for only one civic centre. As input, we will need the dataset as well as the name of the civic centre we are querying.</p>
|
||
<div class="sourceCode" id="cb10"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb10-1"><a href="#cb10-1"></a><span class="kw">def</span> average_licenses_issued(data: <span class="bu">list</span>[<span class="bu">list</span>], civic_centre: <span class="bu">str</span>) <span class="op">-></span> <span class="bu">float</span>:</span>
|
||
<span id="cb10-2"><a href="#cb10-2"></a> <span class="co">"""Return the average number of marriage licenses issued by civic_centre in data.</span></span>
|
||
<span id="cb10-3"><a href="#cb10-3"></a></span>
|
||
<span id="cb10-4"><a href="#cb10-4"></a><span class="co"> Return 0.0 if civic_centre does not appear in the given data.</span></span>
|
||
<span id="cb10-5"><a href="#cb10-5"></a></span>
|
||
<span id="cb10-6"><a href="#cb10-6"></a><span class="co"> Preconditions:</span></span>
|
||
<span id="cb10-7"><a href="#cb10-7"></a><span class="co"> - all({len(row) == 4 for row in data})</span></span>
|
||
<span id="cb10-8"><a href="#cb10-8"></a><span class="co"> - data is in the format described in Section 4.1</span></span>
|
||
<span id="cb10-9"><a href="#cb10-9"></a><span class="co"> """</span></span>
|
||
<span id="cb10-10"><a href="#cb10-10"></a> issued_by_civic_centre <span class="op">=</span> [row[<span class="dv">2</span>] <span class="cf">for</span> row <span class="kw">in</span> data <span class="cf">if</span> row[<span class="dv">1</span>] <span class="op">==</span> civic_centre]</span>
|
||
<span id="cb10-11"><a href="#cb10-11"></a></span>
|
||
<span id="cb10-12"><a href="#cb10-12"></a> <span class="cf">if</span> issued_by_civic_centre <span class="op">==</span> []:</span>
|
||
<span id="cb10-13"><a href="#cb10-13"></a> <span class="cf">return</span> <span class="fl">0.0</span></span>
|
||
<span id="cb10-14"><a href="#cb10-14"></a> <span class="cf">else</span>:</span>
|
||
<span id="cb10-15"><a href="#cb10-15"></a> total <span class="op">=</span> <span class="bu">sum</span>(issued_by_civic_centre)</span>
|
||
<span id="cb10-16"><a href="#cb10-16"></a> count <span class="op">=</span> <span class="bu">len</span>(issued_by_civic_centre)</span>
|
||
<span id="cb10-17"><a href="#cb10-17"></a></span>
|
||
<span id="cb10-18"><a href="#cb10-18"></a> <span class="cf">return</span> total <span class="op">/</span> count</span></code></pre></div>
|
||
<p>Let’s test it to make sure we get the same result as before:</p>
|
||
<div class="sourceCode" id="cb11"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1"></a><span class="op">>>></span> average_licenses_issued(marriage_data, <span class="st">'TO'</span>)</span>
|
||
<span id="cb11-2"><a href="#cb11-2"></a><span class="fl">375.0</span></span></code></pre></div>
|
||
<p>Finally, we can combine it with our previous dictionary comprehension by observing that <code>'TO'</code> can be replaced with the <code>key</code> that is changing:</p>
|
||
<div class="sourceCode" id="cb12"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1"></a><span class="op">>>></span> {key: <span class="dv">0</span> <span class="cf">for</span> key <span class="kw">in</span> names}</span>
|
||
<span id="cb12-2"><a href="#cb12-2"></a>{<span class="st">'NY'</span>: <span class="dv">0</span>, <span class="st">'TO'</span>: <span class="dv">0</span>, <span class="st">'ET'</span>: <span class="dv">0</span>, <span class="st">'SC'</span>: <span class="dv">0</span>}</span>
|
||
<span id="cb12-3"><a href="#cb12-3"></a><span class="op">>>></span> {key: average_licenses_issued(marriage_data, key) <span class="cf">for</span> key <span class="kw">in</span> names}</span>
|
||
<span id="cb12-4"><a href="#cb12-4"></a>{<span class="st">'NY'</span>: <span class="fl">143.0</span>, <span class="st">'TO'</span>: <span class="fl">375.0</span>, <span class="st">'ET'</span>: <span class="fl">94.5</span>, <span class="st">'SC'</span>: <span class="fl">156.5</span>}</span></code></pre></div>
|
||
<p>Now that we’ve done this exploration in the Python console, we can save our work by writing this as a function:</p>
|
||
<div class="sourceCode" id="cb13"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1"></a><span class="kw">def</span> average_licenses_by_centre(marriage_data: <span class="bu">list</span>[<span class="bu">list</span>]) <span class="op">-></span> Dict[<span class="bu">str</span>, <span class="bu">float</span>]:</span>
|
||
<span id="cb13-2"><a href="#cb13-2"></a> <span class="co">"""Return a mapping of the average number of marriage licenses issued at each civic centre.</span></span>
|
||
<span id="cb13-3"><a href="#cb13-3"></a></span>
|
||
<span id="cb13-4"><a href="#cb13-4"></a><span class="co"> In the returned mapping:</span></span>
|
||
<span id="cb13-5"><a href="#cb13-5"></a><span class="co"> - Each key is the name of a civic centre</span></span>
|
||
<span id="cb13-6"><a href="#cb13-6"></a><span class="co"> - Each corresponding value is the average number of marriage licenses issued at</span></span>
|
||
<span id="cb13-7"><a href="#cb13-7"></a><span class="co"> that centre.</span></span>
|
||
<span id="cb13-8"><a href="#cb13-8"></a></span>
|
||
<span id="cb13-9"><a href="#cb13-9"></a><span class="co"> Preconditions:</span></span>
|
||
<span id="cb13-10"><a href="#cb13-10"></a><span class="co"> - marriage_data is in the format described in Section 4.1</span></span>
|
||
<span id="cb13-11"><a href="#cb13-11"></a><span class="co"> """</span></span>
|
||
<span id="cb13-12"><a href="#cb13-12"></a> names <span class="op">=</span> {<span class="st">'TO'</span>, <span class="st">'NY'</span>, <span class="st">'ET'</span>, <span class="st">'SC'</span>}</span>
|
||
<span id="cb13-13"><a href="#cb13-13"></a> <span class="cf">return</span> {key: average_licenses_issued(marriage_data, key) <span class="cf">for</span> key <span class="kw">in</span> names}</span></code></pre></div>
|
||
</section>
|
||
<footer>
|
||
<a href="https://www.teach.cs.toronto.edu/~csc110y/fall/notes/">CSC110 Course Notes Home</a>
|
||
</footer>
|
||
</body>
|
||
</html>
|